From 7974c3cfeabe20ee3aff0323b88db8469049fe2b Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Sat, 23 Mar 2024 20:59:29 +0100 Subject: [PATCH 1/6] Reuse vec for next sections Since in binary search we need random access to the next sections, we have to allocate a Vec at some point. Rather than doing this for every chunk, this now reuses the same vec so we can reuse the allocated memory as often as possible. --- src/lib.rs | 71 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 5e1db3d..9493f1d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -279,6 +279,8 @@ where chunk_sizer: &'sizer S, /// Current byte offset in the `text` cursor: usize, + /// Reusable container for next sections to avoid extra allocations + next_sections: Vec<(usize, &'text str)>, /// Splitter used for determining semantic levels. semantic_split: Sp, /// Original text to iterate over and generate chunks from @@ -300,6 +302,7 @@ where cursor: 0, chunk_capacity, chunk_sizer, + next_sections: Vec::new(), semantic_split: Sp::new(text), text, trim_chunks, @@ -333,18 +336,19 @@ where let mut end = self.cursor; let mut equals_found = false; - let sections = self.next_sections()?.collect::>(); - let mut sizes = sections + self.update_next_sections(); + let mut sizes = self + .next_sections .iter() .map(|_| None) .collect::>>(); let mut low = 0; - let mut high = sections.len().saturating_sub(1); + let mut high = self.next_sections.len().saturating_sub(1); let mut successful_index = None; while low <= high { let mid = low + (high - low) / 2; - let (offset, str) = sections[mid]; + let (offset, str) = self.next_sections[mid]; let text_end = offset + str.len(); let chunk = self.text.get(start..text_end)?; let chunk_size = self.check_capacity(start, chunk); @@ -392,7 +396,11 @@ where Some((successful_index, sizes.get(successful_index)?.as_ref()?)) }) { - for (size, (offset, str)) in 
sizes.iter().zip(sections).skip(successful_index) { + for (size, (offset, str)) in sizes + .iter() + .zip(self.next_sections.iter()) + .skip(successful_index) + { let text_end = offset + str.len(); match size { Some(size) if size.size <= chunk_size.size => { @@ -428,21 +436,33 @@ where /// Find the ideal next sections, breaking it up until we find the largest chunk. /// Increasing length of chunk until we find biggest size to minimize validation time /// on huge chunks - fn next_sections(&'sizer self) -> Option + 'sizer> { + fn update_next_sections(&mut self) { + // First thing, clear out the list, but reuse the allocated memory + self.next_sections.clear(); // Next levels to try. Will stop at max level. We check only levels in the next max level // chunk so we don't bypass it if not all levels are present in every chunk. let mut levels = self.semantic_split.levels_in_remaining_text(self.cursor); // Get starting level - let mut semantic_level = levels.next()?; + let Some(mut semantic_level) = levels.next() else { + return; + }; // If we aren't at the highest semantic level, stop iterating sections that go beyond the range of the next level. let mut max_encoded_offset = None; + let remaining_text = self.text.get(self.cursor..).unwrap(); + for level in levels { - let (_, str) = self.semantic_chunks(level).next()?; + let Some((_, str)) = self + .semantic_split + .semantic_chunks(self.cursor, remaining_text, level) + .next() + else { + return; + }; let chunk_size = self.check_capacity(self.cursor, str); // If this no longer fits, we use the level we are at. Or if we already // have the rest of the string - if chunk_size.fits.is_gt() || self.text.get(self.cursor..)? 
== str { + if chunk_size.fits.is_gt() || remaining_text == str { max_encoded_offset = chunk_size.max_chunk_size_offset; break; } @@ -450,27 +470,18 @@ where semantic_level = level; } - Some( - self.semantic_chunks(semantic_level) - // We don't want to return items at this level that go beyond the next highest semantic level, as that is most - // likely a meaningful breakpoint we want to preserve. We already know that the next highest doesn't fit anyway, - // so we should be safe to break once we reach it. - .take_while_inclusive(move |(offset, _)| { - max_encoded_offset.map_or(true, |max| offset <= &max) - }) - .filter(|(_, str)| !str.is_empty()), - ) - } - - fn semantic_chunks( - &'sizer self, - level: ::Level, - ) -> impl Iterator + 'sizer { - self.semantic_split.semantic_chunks( - self.cursor, - self.text.get(self.cursor..).unwrap(), - level, - ) + let sections = self + .semantic_split + .semantic_chunks(self.cursor, remaining_text, semantic_level) + // We don't want to return items at this level that go beyond the next highest semantic level, as that is most + // likely a meaningful breakpoint we want to preserve. We already know that the next highest doesn't fit anyway, + // so we should be safe to break once we reach it. + .take_while_inclusive(move |(offset, _)| { + max_encoded_offset.map_or(true, |max| offset <= &max) + }) + .filter(|(_, str)| !str.is_empty()); + + self.next_sections.extend(sections); } } From 12c0040ed756c1750445e7085cb404e146a49e57 Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Sun, 24 Mar 2024 06:39:15 +0100 Subject: [PATCH 2/6] Reduce allocations in chunk generation Does three things: 1. Memoizes the output of `chunk_size`, since this can get called several times on the same chunk in the course of selecting a chunk. 2. Since we are doing this, we no longer need the `sizes` array which tried to do the same thing. 3. Levels in the next semantic chunks now also reuse an allocation. 
This isn't ideal because it didn't allocate at all before, but was necessary to allow a mutable reference. This does however set things up to also do binary search on the levels. --- Cargo.toml | 1 + benches/output.txt | 580 ++++++++++++++++++------------------- bindings/python/Cargo.lock | 34 +++ src/lib.rs | 182 +++++++++--- 4 files changed, 465 insertions(+), 332 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 596b7db..e2315af 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ rustdoc-args = ["--cfg", "docsrs"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +ahash = "0.8.11" auto_enums = "0.8.5" either = "1.10.0" itertools = "0.12.1" diff --git a/benches/output.txt b/benches/output.txt index 6bf8a61..46c2d47 100644 --- a/benches/output.txt +++ b/benches/output.txt @@ -1,65 +1,65 @@ -running 67 tests -iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii -test result: ok. 0 passed; 0 failed; 67 ignored; 0 measured; 0 filtered out; finished in 0.00s +running 70 tests +iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii +test result: ok. 
0 passed; 0 failed; 70 ignored; 0 measured; 0 filtered out; finished in 0.00s chunk_size fastest │ slowest │ median │ mean │ samples │ iters ├─ markdown │ │ │ │ │ │ ├─ characters │ │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 331.1 ms │ 363 ms │ 332.2 ms │ 332.6 ms │ 100 │ 100 -│ │ │ 619.1 KB/s │ 564.7 KB/s │ 617.1 KB/s │ 616.3 KB/s │ │ +│ │ │ ╰─ commonmark_spec 347.8 ms │ 381.3 ms │ 351.7 ms │ 355.2 ms │ 100 │ 100 +│ │ │ 589.4 KB/s │ 537.5 KB/s │ 582.8 KB/s │ 577.2 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 22093 │ 0 │ 22093 │ 21872 │ │ -│ │ │ 77.31 MB │ 0 B │ 77.31 MB │ 76.54 MB │ │ +│ │ │ 13270 │ 13270 │ 13270 │ 13137 │ │ +│ │ │ 75.99 MB │ 75.99 MB │ 75.99 MB │ 75.23 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 22094 │ 0 │ 22094 │ 21873 │ │ -│ │ │ 283.2 MB │ 0 B │ 283.2 MB │ 280.3 MB │ │ +│ │ │ 13271 │ 13271 │ 13271 │ 13138 │ │ +│ │ │ 281.4 MB │ 281.4 MB │ 281.4 MB │ 278.6 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 47400 │ 0 │ 47400 │ 46926 │ │ -│ │ │ 205.7 MB │ 0 B │ 205.7 MB │ 203.6 MB │ │ +│ │ │ 44735 │ 44735 │ 44735 │ 44287 │ │ +│ │ │ 205.2 MB │ 205.2 MB │ 205.2 MB │ 203.2 MB │ │ │ │ │ shrink: │ │ │ │ │ -│ │ │ 13 │ 0 │ 13 │ 12.87 │ │ -│ │ │ 94 B │ 0 B │ 94 B │ 93.06 B │ │ +│ │ │ 13 │ 13 │ 13 │ 12.87 │ │ +│ │ │ 94 B │ 94 B │ 94 B │ 93.06 B │ │ │ │ ├─ 512 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 46.24 ms │ 47.33 ms │ 46.48 ms │ 46.49 ms │ 100 │ 100 -│ │ │ 4.433 MB/s │ 4.331 MB/s │ 4.41 MB/s │ 4.41 MB/s │ │ +│ │ │ ╰─ commonmark_spec 47.33 ms │ 48.23 ms │ 47.57 ms │ 47.59 ms │ 100 │ 100 +│ │ │ 4.331 MB/s │ 4.25 MB/s │ 4.309 MB/s │ 4.307 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 2599 │ 2599 │ 2599 │ 2599 │ │ -│ │ │ 9.381 MB │ 9.381 MB │ 9.381 MB │ 9.381 MB │ │ +│ │ │ 1576 │ 1576 │ 1576 │ 1576 │ │ +│ │ │ 9.241 MB │ 9.241 MB │ 9.241 MB │ 9.241 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 2600 │ 2600 │ 2600 │ 2600 │ │ -│ │ │ 34.7 MB │ 34.7 MB │ 34.7 MB │ 34.7 MB │ │ +│ │ │ 1577 │ 1577 │ 1577 │ 1577 │ │ +│ │ │ 34.52 MB │ 34.52 MB │ 34.52 MB │ 34.52 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 5607 
│ 5607 │ 5607 │ 5607 │ │ -│ │ │ 25.12 MB │ 25.12 MB │ 25.12 MB │ 25.12 MB │ │ +│ │ │ 5255 │ 5255 │ 5255 │ 5255 │ │ +│ │ │ 25.08 MB │ 25.08 MB │ 25.08 MB │ 25.08 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 4096 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 8.622 ms │ 8.967 ms │ 8.651 ms │ 8.666 ms │ 100 │ 100 -│ │ │ 23.77 MB/s │ 22.86 MB/s │ 23.69 MB/s │ 23.65 MB/s │ │ +│ │ │ ╰─ commonmark_spec 8.677 ms │ 9.057 ms │ 8.77 ms │ 8.771 ms │ 100 │ 100 +│ │ │ 23.62 MB/s │ 22.63 MB/s │ 23.37 MB/s │ 23.37 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 406 │ 406 │ 406 │ 406 │ │ -│ │ │ 1.701 MB │ 1.701 MB │ 1.701 MB │ 1.701 MB │ │ +│ │ │ 261 │ 261 │ 261 │ 261 │ │ +│ │ │ 1.663 MB │ 1.663 MB │ 1.663 MB │ 1.663 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 407 │ 407 │ 407 │ 407 │ │ -│ │ │ 6.409 MB │ 6.409 MB │ 6.409 MB │ 6.409 MB │ │ +│ │ │ 262 │ 262 │ 262 │ 262 │ │ +│ │ │ 6.346 MB │ 6.346 MB │ 6.346 MB │ 6.346 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 901 │ 901 │ 901 │ 901 │ │ -│ │ │ 4.503 MB │ 4.503 MB │ 4.503 MB │ 4.503 MB │ │ +│ │ │ 814 │ 814 │ 814 │ 814 │ │ +│ │ │ 4.478 MB │ 4.478 MB │ 4.478 MB │ 4.478 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ╰─ 32768 │ │ │ │ │ -│ │ ╰─ commonmark_spec 2.086 ms │ 2.303 ms │ 2.099 ms │ 2.111 ms │ 100 │ 100 -│ │ 98.26 MB/s │ 89.02 MB/s │ 97.67 MB/s │ 97.09 MB/s │ │ +│ │ ╰─ commonmark_spec 2.023 ms │ 2.209 ms │ 2.087 ms │ 2.077 ms │ 100 │ 100 +│ │ 101.3 MB/s │ 92.8 MB/s │ 98.21 MB/s │ 98.69 MB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 81 │ 81 │ 81 │ 81 │ │ -│ │ 529.3 KB │ 529.3 KB │ 529.3 KB │ 529.3 KB │ │ +│ │ 66 │ 66 │ 66 │ 66 │ │ +│ │ 528.4 KB │ 528.4 KB │ 528.4 KB │ 528.4 KB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 82 │ 82 │ 82 │ 82 │ │ -│ │ 2.123 MB │ 2.123 MB │ 2.123 MB │ 2.123 MB │ │ +│ │ 67 │ 67 │ 67 │ 67 │ │ +│ │ 2.122 MB │ 2.122 MB │ 2.122 MB │ 2.122 MB │ │ │ │ grow: │ │ │ │ │ │ │ 150 │ 150 │ 150 │ 150 │ │ │ │ 1.388 MB │ 1.388 MB │ 1.388 MB │ 1.388 MB │ │ @@ -68,407 
+68,407 @@ chunk_size fastest │ slowest │ median │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ tiktoken │ │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 867.2 ms │ 955.8 ms │ 923.7 ms │ 909.9 ms │ 100 │ 100 -│ │ │ 236.4 KB/s │ 214.4 KB/s │ 221.9 KB/s │ 225.3 KB/s │ │ +│ │ │ ╰─ commonmark_spec 867.9 ms │ 899.5 ms │ 874.4 ms │ 874.7 ms │ 100 │ 100 +│ │ │ 236.2 KB/s │ 227.9 KB/s │ 234.4 KB/s │ 234.3 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 8672718 │ 8672718 │ 8672718 │ 8672718 │ │ -│ │ │ 420.8 MB │ 420.8 MB │ 420.8 MB │ 420.8 MB │ │ +│ │ │ 8103002 │ 8103002 │ 8103002 │ 8103002 │ │ +│ │ │ 394.3 MB │ 394.3 MB │ 394.3 MB │ 394.3 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 8984226 │ 8984226 │ 8984226 │ 8984226 │ │ -│ │ │ 789.1 MB │ 789.1 MB │ 789.1 MB │ 789.1 MB │ │ +│ │ │ 8414510 │ 8414510 │ 8414510 │ 8414510 │ │ +│ │ │ 745 MB │ 745 MB │ 745 MB │ 745 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 1576174 │ 1576174 │ 1576174 │ 1576174 │ │ -│ │ │ 349.7 MB │ 349.7 MB │ 349.7 MB │ 349.7 MB │ │ +│ │ │ 1466110 │ 1466110 │ 1466110 │ 1466110 │ │ +│ │ │ 332.1 MB │ 332.1 MB │ 332.1 MB │ 332.1 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 512 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 304.4 ms │ 314.1 ms │ 309.9 ms │ 309.4 ms │ 100 │ 100 -│ │ │ 673.4 KB/s │ 652.7 KB/s │ 661.4 KB/s │ 662.6 KB/s │ │ +│ │ │ ╰─ commonmark_spec 288.3 ms │ 294.6 ms │ 289.8 ms │ 290.3 ms │ 100 │ 100 +│ │ │ 711 KB/s │ 695.7 KB/s │ 707.2 KB/s │ 706 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 3289368 │ 3289368 │ 3289368 │ 3289368 │ │ -│ │ │ 154 MB │ 154 MB │ 154 MB │ 154 MB │ │ +│ │ │ 2947758 │ 2947758 │ 2947758 │ 2947758 │ │ +│ │ │ 138.2 MB │ 138.2 MB │ 138.2 MB │ 138.2 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 3600876 │ 3600876 │ 3600876 │ 3600876 │ │ -│ │ │ 286.6 MB │ 286.6 MB │ 286.6 MB │ 286.6 MB │ │ +│ │ │ 3259266 │ 3259266 │ 3259266 │ 3259266 │ │ +│ │ │ 260.4 MB │ 260.4 MB │ 260.4 MB │ 260.4 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 615911 │ 615911 │ 615911 │ 615911 │ │ -│ │ │ 114.1 
MB │ 114.1 MB │ 114.1 MB │ 114.1 MB │ │ +│ │ │ 549666 │ 549666 │ 549666 │ 549666 │ │ +│ │ │ 103.6 MB │ 103.6 MB │ 103.6 MB │ 103.6 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 4096 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 189 ms │ 196.6 ms │ 189.9 ms │ 190.3 ms │ 100 │ 100 -│ │ │ 1.084 MB/s │ 1.042 MB/s │ 1.079 MB/s │ 1.076 MB/s │ │ +│ │ │ ╰─ commonmark_spec 157.6 ms │ 166.5 ms │ 160.8 ms │ 160.9 ms │ 100 │ 100 +│ │ │ 1.3 MB/s │ 1.231 MB/s │ 1.274 MB/s │ 1.274 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 2065547 │ 2065547 │ 2065547 │ 2065547 │ │ -│ │ │ 95.46 MB │ 95.46 MB │ 95.46 MB │ 95.46 MB │ │ +│ │ │ 1658609 │ 1658609 │ 1658609 │ 1658609 │ │ +│ │ │ 76.82 MB │ 76.82 MB │ 76.82 MB │ 76.82 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 2377055 │ 2377055 │ 2377055 │ 2377055 │ │ -│ │ │ 182.4 MB │ 182.4 MB │ 182.4 MB │ 182.4 MB │ │ +│ │ │ 1970117 │ 1970117 │ 1970117 │ 1970117 │ │ +│ │ │ 151.1 MB │ 151.1 MB │ 151.1 MB │ 151.1 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 384264 │ 384264 │ 384264 │ 384264 │ │ -│ │ │ 68.42 MB │ 68.42 MB │ 68.42 MB │ 68.42 MB │ │ +│ │ │ 309881 │ 309881 │ 309881 │ 309881 │ │ +│ │ │ 55.82 MB │ 55.82 MB │ 55.82 MB │ 55.82 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ╰─ 32768 │ │ │ │ │ -│ │ ╰─ commonmark_spec 82.27 ms │ 84.55 ms │ 82.57 ms │ 82.7 ms │ 100 │ 100 -│ │ 2.491 MB/s │ 2.424 MB/s │ 2.482 MB/s │ 2.479 MB/s │ │ +│ │ ╰─ commonmark_spec 73.17 ms │ 76.31 ms │ 74.15 ms │ 74.43 ms │ 100 │ 100 +│ │ 2.801 MB/s │ 2.686 MB/s │ 2.764 MB/s │ 2.754 MB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 878315 │ 878315 │ 878315 │ 878315 │ │ -│ │ 40.83 MB │ 40.83 MB │ 40.83 MB │ 40.83 MB │ │ +│ │ 750072 │ 750072 │ 750072 │ 750072 │ │ +│ │ 34.96 MB │ 34.96 MB │ 34.96 MB │ 34.96 MB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 1189823 │ 1189823 │ 1189823 │ 1189823 │ │ -│ │ 88.71 MB │ 88.71 MB │ 88.71 MB │ 88.71 MB │ │ +│ │ 1061580 │ 1061580 │ 1061580 │ 1061580 │ │ +│ │ 78.87 MB │ 78.87 MB │ 78.87 MB │ 
78.87 MB │ │ │ │ grow: │ │ │ │ │ -│ │ 165295 │ 165295 │ 165295 │ 165295 │ │ -│ │ 29.35 MB │ 29.35 MB │ 29.35 MB │ 29.35 MB │ │ +│ │ 141697 │ 141697 │ 141697 │ 141697 │ │ +│ │ 25.39 MB │ 25.39 MB │ 25.39 MB │ 25.39 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ╰─ tokenizers │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ ╰─ commonmark_spec 1.609 s │ 1.769 s │ 1.631 s │ 1.637 s │ 100 │ 100 -│ │ 127.4 KB/s │ 115.8 KB/s │ 125.6 KB/s │ 125.2 KB/s │ │ +│ │ ╰─ commonmark_spec 1.57 s │ 1.665 s │ 1.59 s │ 1.59 s │ 100 │ 100 +│ │ 130.5 KB/s │ 123 KB/s │ 128.8 KB/s │ 128.8 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 41344053 │ 41344053 │ 41344053 │ 41344053 │ │ -│ │ 3.939 GB │ 3.939 GB │ 3.939 GB │ 3.939 GB │ │ +│ │ 39101785 │ 39101785 │ 39101785 │ 39101785 │ │ +│ │ 3.731 GB │ 3.731 GB │ 3.731 GB │ 3.731 GB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 41402106 │ 41402106 │ 41402106 │ 41402106 │ │ -│ │ 6.152 GB │ 6.152 GB │ 6.152 GB │ 6.152 GB │ │ +│ │ 39159838 │ 39159838 │ 39159838 │ 39159838 │ │ +│ │ 5.853 GB │ 5.853 GB │ 5.853 GB │ 5.853 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 1566195 │ 1566195 │ 1566195 │ 1566195 │ │ -│ │ 2.208 GB │ 2.208 GB │ 2.208 GB │ 2.208 GB │ │ +│ │ 1329521 │ 1329521 │ 1329521 │ 1329521 │ │ +│ │ 2.117 GB │ 2.117 GB │ 2.117 GB │ 2.117 GB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ 512 │ │ │ │ │ -│ │ ╰─ commonmark_spec 674 ms │ 705.3 ms │ 685.2 ms │ 685.4 ms │ 100 │ 100 -│ │ 304.1 KB/s │ 290.6 KB/s │ 299.1 KB/s │ 299 KB/s │ │ +│ │ ╰─ commonmark_spec 627.5 ms │ 668.4 ms │ 647 ms │ 645.6 ms │ 100 │ 100 +│ │ 326.7 KB/s │ 306.7 KB/s │ 316.8 KB/s │ 317.5 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 17769736 │ 17769736 │ 17769736 │ 17769736 │ │ -│ │ 1.747 GB │ 1.747 GB │ 1.747 GB │ 1.747 GB │ │ +│ │ 16074029 │ 16074029 │ 16074029 │ 16074029 │ │ +│ │ 1.579 GB │ 1.579 GB │ 1.579 GB │ 1.579 GB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 17827789 │ 17827789 │ 17827789 │ 17827789 │ │ -│ │ 2.691 GB │ 2.691 GB │ 2.691 GB │ 2.691 GB │ │ +│ │ 
16132082 │ 16132082 │ 16132082 │ 16132082 │ │ +│ │ 2.438 GB │ 2.438 GB │ 2.438 GB │ 2.438 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 479024 │ 479024 │ 479024 │ 479024 │ │ -│ │ 939.5 MB │ 939.5 MB │ 939.5 MB │ 939.5 MB │ │ +│ │ 392881 │ 392881 │ 392881 │ 392881 │ │ +│ │ 854.6 MB │ 854.6 MB │ 854.6 MB │ 854.6 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ 4096 │ │ │ │ │ -│ │ ╰─ commonmark_spec 400.8 ms │ 458 ms │ 402.7 ms │ 409.6 ms │ 100 │ 100 -│ │ 511.4 KB/s │ 447.5 KB/s │ 509 KB/s │ 500.4 KB/s │ │ +│ │ ╰─ commonmark_spec 336.7 ms │ 342.1 ms │ 338.2 ms │ 338.3 ms │ 100 │ 100 +│ │ 608.8 KB/s │ 599.2 KB/s │ 606.2 KB/s │ 605.9 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 10615708 │ 10615708 │ 10615708 │ 10615708 │ │ -│ │ 1.053 GB │ 1.053 GB │ 1.053 GB │ 1.053 GB │ │ +│ │ 8511827 │ 8511827 │ 8511827 │ 8511827 │ │ +│ │ 845.9 MB │ 845.9 MB │ 845.9 MB │ 845.9 MB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 10673761 │ 10673761 │ 10673761 │ 10673761 │ │ -│ │ 1.61 GB │ 1.61 GB │ 1.61 GB │ 1.61 GB │ │ +│ │ 8569880 │ 8569880 │ 8569880 │ 8569880 │ │ +│ │ 1.296 GB │ 1.296 GB │ 1.296 GB │ 1.296 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 217813 │ 217813 │ 217813 │ 217813 │ │ -│ │ 551.7 MB │ 551.7 MB │ 551.7 MB │ 551.7 MB │ │ +│ │ 168315 │ 168315 │ 168315 │ 168315 │ │ +│ │ 445.2 MB │ 445.2 MB │ 445.2 MB │ 445.2 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ╰─ 32768 │ │ │ │ │ -│ ╰─ commonmark_spec 196.7 ms │ 221.7 ms │ 203 ms │ 203.9 ms │ 100 │ 100 -│ 1.041 MB/s │ 924.3 KB/s │ 1.009 MB/s │ 1.005 MB/s │ │ +│ ╰─ commonmark_spec 180.4 ms │ 194 ms │ 188.1 ms │ 188 ms │ 100 │ 100 +│ 1.135 MB/s │ 1.056 MB/s │ 1.089 MB/s │ 1.09 MB/s │ │ │ alloc: │ │ │ │ │ -│ 5204030 │ 5204030 │ 5204030 │ 5204030 │ │ -│ 522.4 MB │ 522.4 MB │ 522.4 MB │ 522.4 MB │ │ +│ 4579920 │ 4579920 │ 4579920 │ 4579920 │ │ +│ 460.2 MB │ 460.2 MB │ 460.2 MB │ 460.2 MB │ │ │ dealloc: │ │ │ │ │ -│ 5262083 │ 5262083 │ 5262083 │ 5262083 │ │ -│ 793.2 MB │ 793.2 MB │ 793.2 MB │ 
793.2 MB │ │ +│ 4637973 │ 4637973 │ 4637973 │ 4637973 │ │ +│ 698.6 MB │ 698.6 MB │ 698.6 MB │ 698.6 MB │ │ │ grow: │ │ │ │ │ -│ 91803 │ 91803 │ 91803 │ 91803 │ │ -│ 265.8 MB │ 265.8 MB │ 265.8 MB │ 265.8 MB │ │ +│ 79600 │ 79600 │ 79600 │ 79600 │ │ +│ 233.4 MB │ 233.4 MB │ 233.4 MB │ 233.4 MB │ │ │ shrink: │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ ╰─ text │ │ │ │ │ ├─ characters │ │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 213 ms │ 218.8 ms │ 214.4 ms │ 214.6 ms │ 100 │ 100 - │ │ │ 768 KB/s │ 747.6 KB/s │ 763 KB/s │ 762.2 KB/s │ │ + │ │ ├─ romeo_and_juliet 207.4 ms │ 209.3 ms │ 207.8 ms │ 207.9 ms │ 100 │ 100 + │ │ │ 788.8 KB/s │ 781.6 KB/s │ 787.1 KB/s │ 786.7 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 18663 │ 18939 │ 18663 │ 18665 │ │ - │ │ │ 33.41 MB │ 33.44 MB │ 33.41 MB │ 33.41 MB │ │ + │ │ │ 11188 │ 11464 │ 11188 │ 11190 │ │ + │ │ │ 32.32 MB │ 32.34 MB │ 32.32 MB │ 32.32 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 18664 │ 18860 │ 18664 │ 18665 │ │ - │ │ │ 123.2 MB │ 123.2 MB │ 123.2 MB │ 123.2 MB │ │ + │ │ │ 11189 │ 11385 │ 11189 │ 11190 │ │ + │ │ │ 121.8 MB │ 121.8 MB │ 121.8 MB │ 121.8 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 34910 │ 34949 │ 34910 │ 34910 │ │ - │ │ │ 89.66 MB │ 89.67 MB │ 89.66 MB │ 89.66 MB │ │ + │ │ │ 33449 │ 33488 │ 33449 │ 33449 │ │ + │ │ │ 89.36 MB │ 89.37 MB │ 89.36 MB │ 89.36 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 0 │ 5 │ 0 │ 0.05 │ │ │ │ │ 0 B │ 2.34 KB │ 0 B │ 23.4 B │ │ - │ │ ╰─ room_with_a_view 161.4 ms │ 166.1 ms │ 161.8 ms │ 162.7 ms │ 100 │ 100 - │ │ 1.869 MB/s │ 1.817 MB/s │ 1.865 MB/s │ 1.855 MB/s │ │ + │ │ ╰─ room_with_a_view 163.1 ms │ 164.1 ms │ 163.4 ms │ 163.4 ms │ 100 │ 100 + │ │ 1.85 MB/s │ 1.839 MB/s │ 1.846 MB/s │ 1.846 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 30805 │ 30805 │ 30805 │ 30805 │ │ - │ │ 29.33 MB │ 29.33 MB │ 29.33 MB │ 29.33 MB │ │ + │ │ 18430 │ 18430 │ 18430 │ 18430 │ │ + │ │ 26.32 MB │ 26.32 MB │ 26.32 MB │ 26.32 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 30806 │ 30806 │ 30806 │ 30806 │ │ - │ │ 
97.49 MB │ 97.49 MB │ 97.49 MB │ 97.49 MB │ │ + │ │ 18431 │ 18431 │ 18431 │ 18431 │ │ + │ │ 92.81 MB │ 92.81 MB │ 92.81 MB │ 92.81 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 56128 │ 56128 │ 56128 │ 56128 │ │ - │ │ 67.85 MB │ 67.85 MB │ 67.85 MB │ 67.85 MB │ │ + │ │ 48815 │ 48815 │ 48815 │ 48815 │ │ + │ │ 66.19 MB │ 66.19 MB │ 66.19 MB │ 66.19 MB │ │ │ ├─ 512 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 24.93 ms │ 25.83 ms │ 25.02 ms │ 25.04 ms │ 100 │ 100 - │ │ │ 6.561 MB/s │ 6.332 MB/s │ 6.538 MB/s │ 6.531 MB/s │ │ + │ │ ├─ romeo_and_juliet 24.72 ms │ 25.54 ms │ 24.87 ms │ 24.87 ms │ 100 │ 100 + │ │ │ 6.616 MB/s │ 6.405 MB/s │ 6.578 MB/s │ 6.577 MB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 1995 │ 1995 │ 1995 │ 1995 │ │ - │ │ │ 3.712 MB │ 3.712 MB │ 3.712 MB │ 3.712 MB │ │ + │ │ │ 1200 │ 1200 │ 1200 │ 1200 │ │ + │ │ │ 3.479 MB │ 3.479 MB │ 3.479 MB │ 3.479 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 1996 │ 1996 │ 1996 │ 1996 │ │ - │ │ │ 13.97 MB │ 13.97 MB │ 13.97 MB │ 13.97 MB │ │ + │ │ │ 1201 │ 1201 │ 1201 │ 1201 │ │ + │ │ │ 13.58 MB │ 13.58 MB │ 13.58 MB │ 13.58 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 4318 │ 4318 │ 4318 │ 4318 │ │ - │ │ │ 10.09 MB │ 10.09 MB │ 10.09 MB │ 10.09 MB │ │ - │ │ ╰─ room_with_a_view 25.41 ms │ 26.28 ms │ 25.51 ms │ 25.53 ms │ 100 │ 100 - │ │ 11.87 MB/s │ 11.48 MB/s │ 11.83 MB/s │ 11.82 MB/s │ │ + │ │ │ 3595 │ 3595 │ 3595 │ 3595 │ │ + │ │ │ 9.941 MB │ 9.941 MB │ 9.941 MB │ 9.941 MB │ │ + │ │ ╰─ room_with_a_view 25.33 ms │ 26.04 ms │ 25.44 ms │ 25.45 ms │ 100 │ 100 + │ │ 11.91 MB/s │ 11.59 MB/s │ 11.86 MB/s │ 11.86 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 3918 │ 3918 │ 3918 │ 3918 │ │ - │ │ 3.647 MB │ 3.647 MB │ 3.647 MB │ 3.647 MB │ │ + │ │ 2349 │ 2349 │ 2349 │ 2349 │ │ + │ │ 3.353 MB │ 3.353 MB │ 3.353 MB │ 3.353 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 3919 │ 3919 │ 3919 │ 3919 │ │ - │ │ 12.6 MB │ 12.6 MB │ 12.6 MB │ 12.6 MB │ │ + │ │ 2350 │ 2350 │ 2350 │ 2350 │ │ + │ │ 12.17 MB │ 12.17 MB │ 12.17 MB │ 12.17 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 7000 │ 7000 │ 7000 │ 7000 │ │ - │ 
│ 8.658 MB │ 8.658 MB │ 8.658 MB │ 8.658 MB │ │ + │ │ 6219 │ 6219 │ 6219 │ 6219 │ │ + │ │ 8.522 MB │ 8.522 MB │ 8.522 MB │ 8.522 MB │ │ │ ├─ 4096 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 3.721 ms │ 4.042 ms │ 3.763 ms │ 3.773 ms │ 100 │ 100 - │ │ │ 43.97 MB/s │ 40.47 MB/s │ 43.47 MB/s │ 43.35 MB/s │ │ + │ │ ├─ romeo_and_juliet 3.675 ms │ 3.99 ms │ 3.767 ms │ 3.769 ms │ 100 │ 100 + │ │ │ 44.52 MB/s │ 41 MB/s │ 43.42 MB/s │ 43.4 MB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 226 │ 226 │ 226 │ 226 │ │ - │ │ │ 483.5 KB │ 483.5 KB │ 483.5 KB │ 483.5 KB │ │ + │ │ │ 141 │ 141 │ 141 │ 141 │ │ + │ │ │ 406.1 KB │ 406.1 KB │ 406.1 KB │ 406.1 KB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 227 │ 227 │ 227 │ 227 │ │ - │ │ │ 2.136 MB │ 2.136 MB │ 2.136 MB │ 2.136 MB │ │ + │ │ │ 142 │ 142 │ 142 │ 142 │ │ + │ │ │ 1.99 MB │ 1.99 MB │ 1.99 MB │ 1.99 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 579 │ 579 │ 579 │ 579 │ │ - │ │ │ 1.489 MB │ 1.489 MB │ 1.489 MB │ 1.489 MB │ │ - │ │ ╰─ room_with_a_view 5.075 ms │ 5.259 ms │ 5.105 ms │ 5.113 ms │ 100 │ 100 - │ │ 59.48 MB/s │ 57.4 MB/s │ 59.13 MB/s │ 59.03 MB/s │ │ + │ │ │ 426 │ 426 │ 426 │ 426 │ │ + │ │ │ 1.42 MB │ 1.42 MB │ 1.42 MB │ 1.42 MB │ │ + │ │ ╰─ room_with_a_view 4.963 ms │ 5.236 ms │ 5.05 ms │ 5.044 ms │ 100 │ 100 + │ │ 60.82 MB/s │ 57.65 MB/s │ 59.78 MB/s │ 59.84 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 499 │ 499 │ 499 │ 499 │ │ - │ │ 543.1 KB │ 543.1 KB │ 543.1 KB │ 543.1 KB │ │ + │ │ 304 │ 304 │ 304 │ 304 │ │ + │ │ 430 KB │ 430 KB │ 430 KB │ 430 KB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 500 │ 500 │ 500 │ 500 │ │ - │ │ 2.095 MB │ 2.095 MB │ 2.095 MB │ 2.095 MB │ │ + │ │ 305 │ 305 │ 305 │ 305 │ │ + │ │ 1.886 MB │ 1.886 MB │ 1.886 MB │ 1.886 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 1068 │ 1068 │ 1068 │ 1068 │ │ - │ │ 1.25 MB │ 1.25 MB │ 1.25 MB │ 1.25 MB │ │ + │ │ 812 │ 812 │ 812 │ 812 │ │ + │ │ 1.154 MB │ 1.154 MB │ 1.154 MB │ 1.154 MB │ │ │ ╰─ 32768 │ │ │ │ │ - │ ├─ romeo_and_juliet 1.539 ms │ 1.677 ms │ 1.564 ms │ 1.572 ms │ 100 │ 100 - │ │ 106.3 MB/s │ 97.51 MB/s │ 104.5 
MB/s │ 104 MB/s │ │ + │ ├─ romeo_and_juliet 1.502 ms │ 1.655 ms │ 1.585 ms │ 1.583 ms │ 100 │ 100 + │ │ 108.8 MB/s │ 98.84 MB/s │ 103.2 MB/s │ 103.3 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 46 │ 46 │ 46 │ 46 │ │ - │ │ 124.6 KB │ 124.6 KB │ 124.6 KB │ 124.6 KB │ │ + │ │ 33 │ 33 │ 33 │ 33 │ │ + │ │ 107 KB │ 107 KB │ 107 KB │ 107 KB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 47 │ 47 │ 47 │ 47 │ │ - │ │ 888.9 KB │ 888.9 KB │ 888.9 KB │ 888.9 KB │ │ + │ │ 34 │ 34 │ 34 │ 34 │ │ + │ │ 868.3 KB │ 868.3 KB │ 868.3 KB │ 868.3 KB │ │ │ │ grow: │ │ │ │ │ - │ │ 112 │ 112 │ 112 │ 112 │ │ - │ │ 600.6 KB │ 600.6 KB │ 600.6 KB │ 600.6 KB │ │ - │ ╰─ room_with_a_view 1.753 ms │ 1.852 ms │ 1.772 ms │ 1.783 ms │ 100 │ 100 - │ 172.2 MB/s │ 162.9 MB/s │ 170.3 MB/s │ 169.2 MB/s │ │ + │ │ 107 │ 107 │ 107 │ 107 │ │ + │ │ 597.6 KB │ 597.6 KB │ 597.6 KB │ 597.6 KB │ │ + │ ╰─ room_with_a_view 1.755 ms │ 1.897 ms │ 1.837 ms │ 1.834 ms │ 100 │ 100 + │ 171.9 MB/s │ 159.1 MB/s │ 164.3 MB/s │ 164.6 MB/s │ │ │ alloc: │ │ │ │ │ - │ 61 │ 61 │ 61 │ 61 │ │ - │ 60.8 KB │ 60.8 KB │ 60.8 KB │ 60.8 KB │ │ + │ 42 │ 42 │ 42 │ 42 │ │ + │ 56.02 KB │ 56.02 KB │ 56.02 KB │ 56.02 KB │ │ │ dealloc: │ │ │ │ │ - │ 62 │ 62 │ 62 │ 62 │ │ - │ 569.6 KB │ 569.6 KB │ 569.6 KB │ 569.6 KB │ │ + │ 43 │ 43 │ 43 │ 43 │ │ + │ 564.3 KB │ 564.3 KB │ 564.3 KB │ 564.3 KB │ │ │ grow: │ │ │ │ │ - │ 117 │ 117 │ 117 │ 117 │ │ - │ 206.9 KB │ 206.9 KB │ 206.9 KB │ 206.9 KB │ │ + │ 113 │ 113 │ 113 │ 113 │ │ + │ 206.4 KB │ 206.4 KB │ 206.4 KB │ 206.4 KB │ │ ├─ tiktoken │ │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 797.7 ms │ 842.3 ms │ 803.4 ms │ 806.3 ms │ 100 │ 100 - │ │ │ 205.1 KB/s │ 194.2 KB/s │ 203.6 KB/s │ 202.9 KB/s │ │ + │ │ ├─ romeo_and_juliet 802.3 ms │ 828.6 ms │ 812.9 ms │ 814.6 ms │ 100 │ 100 + │ │ │ 203.9 KB/s │ 197.4 KB/s │ 201.2 KB/s │ 200.8 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 8759688 │ 8759688 │ 8759688 │ 8759688 │ │ - │ │ │ 416.9 MB │ 416.9 MB │ 416.9 MB │ 416.9 MB │ │ + │ │ │ 8687901 │ 8687901 │ 8687901 │ 8687901 │ │ + │ │ 
│ 413.2 MB │ 413.2 MB │ 413.2 MB │ 413.2 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 9071196 │ 9071196 │ 9071196 │ 9071196 │ │ - │ │ │ 682.6 MB │ 682.6 MB │ 682.6 MB │ 682.6 MB │ │ + │ │ │ 8999409 │ 8999409 │ 8999409 │ 8999409 │ │ + │ │ │ 676.9 MB │ 676.9 MB │ 676.9 MB │ 676.9 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 1817472 │ 1817472 │ 1817472 │ 1817472 │ │ - │ │ │ 247.2 MB │ 247.2 MB │ 247.2 MB │ 247.2 MB │ │ - │ │ ╰─ room_with_a_view 1.086 s │ 1.146 s │ 1.103 s │ 1.103 s │ 100 │ 100 - │ │ 277.8 KB/s │ 263.3 KB/s │ 273.5 KB/s │ 273.6 KB/s │ │ + │ │ │ 1801556 │ 1801556 │ 1801556 │ 1801556 │ │ + │ │ │ 245.2 MB │ 245.2 MB │ 245.2 MB │ 245.2 MB │ │ + │ │ ╰─ room_with_a_view 1.064 s │ 1.153 s │ 1.072 s │ 1.077 s │ 100 │ 100 + │ │ 283.6 KB/s │ 261.8 KB/s │ 281.4 KB/s │ 280.1 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 11927663 │ 11927663 │ 11927663 │ 11927663 │ │ - │ │ 572.5 MB │ 572.5 MB │ 572.5 MB │ 572.5 MB │ │ + │ │ 11500303 │ 11500303 │ 11500303 │ 11500303 │ │ + │ │ 551.9 MB │ 551.9 MB │ 551.9 MB │ 551.9 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 12239171 │ 12239171 │ 12239171 │ 12239171 │ │ - │ │ 974.9 MB │ 974.9 MB │ 974.9 MB │ 974.9 MB │ │ + │ │ 11811811 │ 11811811 │ 11811811 │ 11811811 │ │ + │ │ 941.2 MB │ 941.2 MB │ 941.2 MB │ 941.2 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 2940302 │ 2940302 │ 2940302 │ 2940302 │ │ - │ │ 383.7 MB │ 383.7 MB │ 383.7 MB │ 383.7 MB │ │ + │ │ 2834270 │ 2834270 │ 2834270 │ 2834270 │ │ + │ │ 370.6 MB │ 370.6 MB │ 370.6 MB │ 370.6 MB │ │ │ ├─ 512 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 265.4 ms │ 269.1 ms │ 266.4 ms │ 266.6 ms │ 100 │ 100 - │ │ │ 616.4 KB/s │ 607.8 KB/s │ 614 KB/s │ 613.6 KB/s │ │ + │ │ ├─ romeo_and_juliet 262.6 ms │ 268.1 ms │ 264.2 ms │ 264.4 ms │ 100 │ 100 + │ │ │ 622.8 KB/s │ 610.3 KB/s │ 619.1 KB/s │ 618.8 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 2967096 │ 2967096 │ 2967096 │ 2967096 │ │ - │ │ │ 139.6 MB │ 139.6 MB │ 139.6 MB │ 139.6 MB │ │ + │ │ │ 2921740 │ 2921740 │ 2921740 │ 2921740 │ │ + │ │ │ 137.3 MB │ 137.3 MB │ 137.3 MB │ 137.3 MB │ │ │ │ 
│ dealloc: │ │ │ │ │ - │ │ │ 3278604 │ 3278604 │ 3278604 │ 3278604 │ │ - │ │ │ 237.9 MB │ 237.9 MB │ 237.9 MB │ 237.9 MB │ │ + │ │ │ 3233248 │ 3233248 │ 3233248 │ 3233248 │ │ + │ │ │ 234.4 MB │ 234.4 MB │ 234.4 MB │ 234.4 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 615666 │ 615666 │ 615666 │ 615666 │ │ - │ │ │ 79.86 MB │ 79.86 MB │ 79.86 MB │ 79.86 MB │ │ - │ │ ╰─ room_with_a_view 464.5 ms │ 470.8 ms │ 467.7 ms │ 467.6 ms │ 100 │ 100 - │ │ 649.9 KB/s │ 641.1 KB/s │ 645.3 KB/s │ 645.6 KB/s │ │ + │ │ │ 606069 │ 606069 │ 606069 │ 606069 │ │ + │ │ │ 78.61 MB │ 78.61 MB │ 78.61 MB │ 78.61 MB │ │ + │ │ ╰─ room_with_a_view 438.5 ms │ 460.3 ms │ 443.3 ms │ 444.8 ms │ 100 │ 100 + │ │ 688.4 KB/s │ 655.8 KB/s │ 681 KB/s │ 678.7 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 5146321 │ 5146321 │ 5146321 │ 5146321 │ │ - │ │ 245 MB │ 245 MB │ 245 MB │ 245 MB │ │ + │ │ 4881128 │ 4881128 │ 4881128 │ 4881128 │ │ + │ │ 232.3 MB │ 232.3 MB │ 232.3 MB │ 232.3 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 5457829 │ 5457829 │ 5457829 │ 5457829 │ │ - │ │ 424.8 MB │ 424.8 MB │ 424.8 MB │ 424.8 MB │ │ + │ │ 5192636 │ 5192636 │ 5192636 │ 5192636 │ │ + │ │ 403.9 MB │ 403.9 MB │ 403.9 MB │ 403.9 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 1262554 │ 1262554 │ 1262554 │ 1262554 │ │ - │ │ 161.2 MB │ 161.2 MB │ 161.2 MB │ 161.2 MB │ │ + │ │ 1197369 │ 1197369 │ 1197369 │ 1197369 │ │ + │ │ 152.9 MB │ 152.9 MB │ 152.9 MB │ 152.9 MB │ │ │ ├─ 4096 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 184.2 ms │ 187.1 ms │ 185.2 ms │ 185.2 ms │ 100 │ 100 - │ │ │ 888.1 KB/s │ 874.2 KB/s │ 883.2 KB/s │ 883 KB/s │ │ + │ │ ├─ romeo_and_juliet 178.2 ms │ 182.5 ms │ 179.4 ms │ 179.5 ms │ 100 │ 100 + │ │ │ 917.7 KB/s │ 896.2 KB/s │ 911.6 KB/s │ 911.4 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 2079255 │ 2079255 │ 2079255 │ 2079255 │ │ - │ │ │ 97.41 MB │ 97.41 MB │ 97.41 MB │ 97.41 MB │ │ + │ │ │ 2018346 │ 2018346 │ 2018346 │ 2018346 │ │ + │ │ │ 94.51 MB │ 94.51 MB │ 94.51 MB │ 94.51 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 2390763 │ 2390763 │ 2390763 │ 2390763 │ │ - │ │ 
│ 170.8 MB │ 170.8 MB │ 170.8 MB │ 170.8 MB │ │ + │ │ │ 2329854 │ 2329854 │ 2329854 │ 2329854 │ │ + │ │ │ 166.3 MB │ 166.3 MB │ 166.3 MB │ 166.3 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 431092 │ 431092 │ 431092 │ 431092 │ │ - │ │ │ 55 MB │ 55 MB │ 55 MB │ 55 MB │ │ - │ │ ╰─ room_with_a_view 339.3 ms │ 345.3 ms │ 341.7 ms │ 341.9 ms │ 100 │ 100 - │ │ 889.7 KB/s │ 874.3 KB/s │ 883.4 KB/s │ 882.8 KB/s │ │ + │ │ │ 418451 │ 418451 │ 418451 │ 418451 │ │ + │ │ │ 53.36 MB │ 53.36 MB │ 53.36 MB │ 53.36 MB │ │ + │ │ ╰─ room_with_a_view 319.3 ms │ 326.4 ms │ 323 ms │ 323 ms │ 100 │ 100 + │ │ 945.3 KB/s │ 924.8 KB/s │ 934.4 KB/s │ 934.4 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 3773135 │ 3773135 │ 3773135 │ 3773135 │ │ - │ │ 179.2 MB │ 179.2 MB │ 179.2 MB │ 179.2 MB │ │ + │ │ 3573121 │ 3573121 │ 3573121 │ 3573121 │ │ + │ │ 169.7 MB │ 169.7 MB │ 169.7 MB │ 169.7 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 4084643 │ 4084643 │ 4084643 │ 4084643 │ │ - │ │ 315 MB │ 315 MB │ 315 MB │ 315 MB │ │ + │ │ 3884629 │ 3884629 │ 3884629 │ 3884629 │ │ + │ │ 299.2 MB │ 299.2 MB │ 299.2 MB │ 299.2 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 923647 │ 923647 │ 923647 │ 923647 │ │ - │ │ 117.1 MB │ 117.1 MB │ 117.1 MB │ 117.1 MB │ │ + │ │ 874507 │ 874507 │ 874507 │ 874507 │ │ + │ │ 110.8 MB │ 110.8 MB │ 110.8 MB │ 110.8 MB │ │ │ ╰─ 32768 │ │ │ │ │ - │ ├─ romeo_and_juliet 82.98 ms │ 89.91 ms │ 83.42 ms │ 83.82 ms │ 100 │ 100 - │ │ 1.971 MB/s │ 1.819 MB/s │ 1.961 MB/s │ 1.951 MB/s │ │ + │ ├─ romeo_and_juliet 81.77 ms │ 89.6 ms │ 82.34 ms │ 82.77 ms │ 100 │ 100 + │ │ 2 MB/s │ 1.826 MB/s │ 1.987 MB/s │ 1.976 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 915519 │ 915519 │ 915519 │ 915519 │ │ - │ │ 42.89 MB │ 42.89 MB │ 42.89 MB │ 42.89 MB │ │ + │ │ 914681 │ 914681 │ 914681 │ 914681 │ │ + │ │ 42.86 MB │ 42.86 MB │ 42.86 MB │ 42.86 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 1227027 │ 1227027 │ 1227027 │ 1227027 │ │ - │ │ 85.77 MB │ 85.77 MB │ 85.77 MB │ 85.77 MB │ │ + │ │ 1226189 │ 1226189 │ 1226189 │ 1226189 │ │ + │ │ 85.71 MB │ 85.71 MB │ 85.71 MB 
│ 85.71 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 187921 │ 187921 │ 187921 │ 187921 │ │ - │ │ 24.4 MB │ 24.4 MB │ 24.4 MB │ 24.4 MB │ │ - │ ╰─ room_with_a_view 125.2 ms │ 129.9 ms │ 126.6 ms │ 126.8 ms │ 100 │ 100 - │ 2.409 MB/s │ 2.323 MB/s │ 2.384 MB/s │ 2.38 MB/s │ │ + │ │ 187707 │ 187707 │ 187707 │ 187707 │ │ + │ │ 24.37 MB │ 24.37 MB │ 24.37 MB │ 24.37 MB │ │ + │ ╰─ room_with_a_view 112.1 ms │ 117.4 ms │ 112.9 ms │ 113.3 ms │ 100 │ 100 + │ 2.691 MB/s │ 2.57 MB/s │ 2.672 MB/s │ 2.663 MB/s │ │ │ alloc: │ │ │ │ │ - │ 1370555 │ 1370555 │ 1370555 │ 1370555 │ │ - │ 65.15 MB │ 65.15 MB │ 65.15 MB │ 65.15 MB │ │ + │ 1232443 │ 1232443 │ 1232443 │ 1232443 │ │ + │ 58.6 MB │ 58.6 MB │ 58.6 MB │ 58.6 MB │ │ │ dealloc: │ │ │ │ │ - │ 1682063 │ 1682063 │ 1682063 │ 1682063 │ │ - │ 126.3 MB │ 126.3 MB │ 126.3 MB │ 126.3 MB │ │ + │ 1543951 │ 1543951 │ 1543951 │ 1543951 │ │ + │ 115.4 MB │ 115.4 MB │ 115.4 MB │ 115.4 MB │ │ │ grow: │ │ │ │ │ - │ 334112 │ 334112 │ 334112 │ 334112 │ │ - │ 42.55 MB │ 42.55 MB │ 42.55 MB │ 42.55 MB │ │ + │ 300739 │ 300739 │ 300739 │ 300739 │ │ + │ 38.19 MB │ 38.19 MB │ 38.19 MB │ 38.19 MB │ │ ╰─ tokenizers │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ ├─ romeo_and_juliet 1.409 s │ 1.519 s │ 1.426 s │ 1.43 s │ 100 │ 100 - │ │ 116.1 KB/s │ 107.6 KB/s │ 114.6 KB/s │ 114.3 KB/s │ │ + │ ├─ romeo_and_juliet 1.381 s │ 1.448 s │ 1.404 s │ 1.403 s │ 100 │ 100 + │ │ 118.4 KB/s │ 112.9 KB/s │ 116.4 KB/s │ 116.5 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 29414066 │ 29414066 │ 29414066 │ 29414066 │ │ - │ │ 3.628 GB │ 3.628 GB │ 3.628 GB │ 3.628 GB │ │ + │ │ 29188728 │ 29188728 │ 29188728 │ 29188728 │ │ + │ │ 3.601 GB │ 3.601 GB │ 3.601 GB │ 3.601 GB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 29472119 │ 29472119 │ 29472119 │ 29472119 │ │ - │ │ 5.252 GB │ 5.252 GB │ 5.252 GB │ 5.252 GB │ │ + │ │ 29246781 │ 29246781 │ 29246781 │ 29246781 │ │ + │ │ 5.214 GB │ 5.214 GB │ 5.214 GB │ 5.214 GB │ │ │ │ grow: │ │ │ │ │ - │ │ 474777 │ 474777 │ 474777 │ 474777 │ │ - │ │ 1.619 GB │ 1.619 GB │ 1.619 GB │ 1.619 GB │ │ - 
│ ╰─ room_with_a_view 1.888 s │ 2.313 s │ 1.954 s │ 1.977 s │ 100 │ 100 - │ 159.8 KB/s │ 130.4 KB/s │ 154.4 KB/s │ 152.6 KB/s │ │ + │ │ 463032 │ 463032 │ 463032 │ 463032 │ │ + │ │ 1.608 GB │ 1.608 GB │ 1.608 GB │ 1.608 GB │ │ + │ ╰─ room_with_a_view 1.906 s │ 1.946 s │ 1.92 s │ 1.922 s │ 100 │ 100 + │ 158.3 KB/s │ 155.1 KB/s │ 157.2 KB/s │ 157 KB/s │ │ │ alloc: │ │ │ │ │ - │ 40892031 │ 40892031 │ 40892031 │ 40892031 │ │ - │ 5.348 GB │ 5.348 GB │ 5.348 GB │ 5.348 GB │ │ + │ 39390416 │ 39390416 │ 39390416 │ 39390416 │ │ + │ 5.158 GB │ 5.158 GB │ 5.158 GB │ 5.158 GB │ │ │ dealloc: │ │ │ │ │ - │ 40950084 │ 40950084 │ 40950084 │ 40950084 │ │ - │ 7.688 GB │ 7.688 GB │ 7.688 GB │ 7.688 GB │ │ + │ 39448469 │ 39448469 │ 39448469 │ 39448469 │ │ + │ 7.421 GB │ 7.421 GB │ 7.421 GB │ 7.421 GB │ │ │ grow: │ │ │ │ │ - │ 758413 │ 758413 │ 758413 │ 758413 │ │ - │ 2.335 GB │ 2.335 GB │ 2.335 GB │ 2.335 GB │ │ + │ 681205 │ 681205 │ 681205 │ 681205 │ │ + │ 2.257 GB │ 2.257 GB │ 2.257 GB │ 2.257 GB │ │ ├─ 512 │ │ │ │ │ - │ ├─ romeo_and_juliet 433.3 ms │ 460.4 ms │ 445.9 ms │ 442 ms │ 100 │ 100 - │ │ 377.6 KB/s │ 355.3 KB/s │ 366.9 KB/s │ 370.1 KB/s │ │ + │ ├─ romeo_and_juliet 430.1 ms │ 497.1 ms │ 439.3 ms │ 442.9 ms │ 100 │ 100 + │ │ 380.3 KB/s │ 329.1 KB/s │ 372.4 KB/s │ 369.3 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 9473831 │ 9473831 │ 9473831 │ 9473831 │ │ - │ │ 1.177 GB │ 1.177 GB │ 1.177 GB │ 1.177 GB │ │ + │ │ 9331641 │ 9331641 │ 9331641 │ 9331641 │ │ + │ │ 1.159 GB │ 1.159 GB │ 1.159 GB │ 1.159 GB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 9531884 │ 9531884 │ 9531884 │ 9531884 │ │ - │ │ 1.703 GB │ 1.703 GB │ 1.703 GB │ 1.703 GB │ │ + │ │ 9389694 │ 9389694 │ 9389694 │ 9389694 │ │ + │ │ 1.678 GB │ 1.678 GB │ 1.678 GB │ 1.678 GB │ │ │ │ grow: │ │ │ │ │ - │ │ 101662 │ 101662 │ 101662 │ 101662 │ │ - │ │ 521.2 MB │ 521.2 MB │ 521.2 MB │ 521.2 MB │ │ - │ ╰─ room_with_a_view 818.3 ms │ 841.4 ms │ 824.1 ms │ 824.6 ms │ 100 │ 100 - │ 368.9 KB/s │ 358.8 KB/s │ 366.3 KB/s │ 366.1 KB/s │ │ + │ │ 100059 │ 
100059 │ 100059 │ 100059 │ │ + │ │ 513.7 MB │ 513.7 MB │ 513.7 MB │ 513.7 MB │ │ + │ ╰─ room_with_a_view 786.4 ms │ 882.8 ms │ 791.5 ms │ 796.9 ms │ 100 │ 100 + │ 383.8 KB/s │ 341.9 KB/s │ 381.3 KB/s │ 378.8 KB/s │ │ │ alloc: │ │ │ │ │ - │ 17221349 │ 17221349 │ 17221349 │ 17221349 │ │ - │ 2.27 GB │ 2.27 GB │ 2.27 GB │ 2.27 GB │ │ + │ 16335240 │ 16335240 │ 16335240 │ 16335240 │ │ + │ 2.154 GB │ 2.154 GB │ 2.154 GB │ 2.154 GB │ │ │ dealloc: │ │ │ │ │ - │ 17279402 │ 17279402 │ 17279402 │ 17279402 │ │ - │ 3.271 GB │ 3.271 GB │ 3.271 GB │ 3.271 GB │ │ + │ 16393293 │ 16393293 │ 16393293 │ 16393293 │ │ + │ 3.105 GB │ 3.105 GB │ 3.105 GB │ 3.105 GB │ │ │ grow: │ │ │ │ │ - │ 178717 │ 178717 │ 178717 │ 178717 │ │ - │ 995.5 MB │ 995.5 MB │ 995.5 MB │ 995.5 MB │ │ + │ 165455 │ 165455 │ 165455 │ 165455 │ │ + │ 945.8 MB │ 945.8 MB │ 945.8 MB │ 945.8 MB │ │ ├─ 4096 │ │ │ │ │ - │ ├─ romeo_and_juliet 301.8 ms │ 307.4 ms │ 303.1 ms │ 303.2 ms │ 100 │ 100 - │ │ 542 KB/s │ 532.2 KB/s │ 539.8 KB/s │ 539.5 KB/s │ │ + │ ├─ romeo_and_juliet 294.7 ms │ 301.6 ms │ 295.5 ms │ 296.4 ms │ 100 │ 100 + │ │ 555 KB/s │ 542.3 KB/s │ 553.6 KB/s │ 551.8 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 6629037 │ 6629037 │ 6629037 │ 6629037 │ │ - │ │ 826.9 MB │ 826.9 MB │ 826.9 MB │ 826.9 MB │ │ + │ │ 6432740 │ 6432740 │ 6432740 │ 6432740 │ │ + │ │ 802.5 MB │ 802.5 MB │ 802.5 MB │ 802.5 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 6687090 │ 6687090 │ 6687090 │ 6687090 │ │ - │ │ 1.199 GB │ 1.199 GB │ 1.199 GB │ 1.199 GB │ │ + │ │ 6490793 │ 6490793 │ 6490793 │ 6490793 │ │ + │ │ 1.164 GB │ 1.164 GB │ 1.164 GB │ 1.164 GB │ │ │ │ grow: │ │ │ │ │ - │ │ 35655 │ 35655 │ 35655 │ 35655 │ │ - │ │ 367.7 MB │ 367.7 MB │ 367.7 MB │ 367.7 MB │ │ - │ ╰─ room_with_a_view 583.7 ms │ 595.6 ms │ 588.2 ms │ 588.8 ms │ 100 │ 100 - │ 517.1 KB/s │ 506.8 KB/s │ 513.2 KB/s │ 512.6 KB/s │ │ + │ │ 34250 │ 34250 │ 34250 │ 34250 │ │ + │ │ 356.8 MB │ 356.8 MB │ 356.8 MB │ 356.8 MB │ │ + │ ╰─ room_with_a_view 550.8 ms │ 580.6 ms │ 565.2 ms │ 561.8 ms │ 
100 │ 100 + │ 548.1 KB/s │ 519.9 KB/s │ 534.1 KB/s │ 537.3 KB/s │ │ │ alloc: │ │ │ │ │ - │ 12332767 │ 12332767 │ 12332767 │ 12332767 │ │ - │ 1.632 GB │ 1.632 GB │ 1.632 GB │ 1.632 GB │ │ + │ 11604279 │ 11604279 │ 11604279 │ 11604279 │ │ + │ 1.536 GB │ 1.536 GB │ 1.536 GB │ 1.536 GB │ │ │ dealloc: │ │ │ │ │ - │ 12390820 │ 12390820 │ 12390820 │ 12390820 │ │ - │ 2.353 GB │ 2.353 GB │ 2.353 GB │ 2.353 GB │ │ + │ 11662332 │ 11662332 │ 11662332 │ 11662332 │ │ + │ 2.215 GB │ 2.215 GB │ 2.215 GB │ 2.215 GB │ │ │ grow: │ │ │ │ │ - │ 61498 │ 61498 │ 61498 │ 61498 │ │ - │ 716.5 MB │ 716.5 MB │ 716.5 MB │ 716.5 MB │ │ + │ 55754 │ 55754 │ 55754 │ 55754 │ │ + │ 674.3 MB │ 674.3 MB │ 674.3 MB │ 674.3 MB │ │ ╰─ 32768 │ │ │ │ │ - ├─ romeo_and_juliet 132.7 ms │ 136.2 ms │ 133.1 ms │ 133.4 ms │ 100 │ 100 - │ 1.232 MB/s │ 1.2 MB/s │ 1.228 MB/s │ 1.226 MB/s │ │ + ├─ romeo_and_juliet 130.7 ms │ 133.1 ms │ 131.2 ms │ 131.2 ms │ 100 │ 100 + │ 1.251 MB/s │ 1.228 MB/s │ 1.246 MB/s │ 1.246 MB/s │ │ │ alloc: │ │ │ │ │ - │ 2848170 │ 2848170 │ 2848170 │ 2848170 │ │ - │ 354.1 MB │ 354.1 MB │ 354.1 MB │ 354.1 MB │ │ + │ 2845252 │ 2845252 │ 2845252 │ 2845252 │ │ + │ 353.7 MB │ 353.7 MB │ 353.7 MB │ 353.7 MB │ │ │ dealloc: │ │ │ │ │ - │ 2906223 │ 2906223 │ 2906223 │ 2906223 │ │ - │ 518 MB │ 518 MB │ 518 MB │ 518 MB │ │ + │ 2903305 │ 2903305 │ 2903305 │ 2903305 │ │ + │ 517.5 MB │ 517.5 MB │ 517.5 MB │ 517.5 MB │ │ │ grow: │ │ │ │ │ - │ 9817 │ 9817 │ 9817 │ 9817 │ │ - │ 159 MB │ 159 MB │ 159 MB │ 159 MB │ │ - ╰─ room_with_a_view 220.3 ms │ 232.3 ms │ 221 ms │ 221.5 ms │ 100 │ 100 - 1.369 MB/s │ 1.299 MB/s │ 1.365 MB/s │ 1.362 MB/s │ │ + │ 9650 │ 9650 │ 9650 │ 9650 │ │ + │ 158.8 MB │ 158.8 MB │ 158.8 MB │ 158.8 MB │ │ + ╰─ room_with_a_view 215.9 ms │ 222.8 ms │ 218.8 ms │ 218.4 ms │ 100 │ 100 + 1.398 MB/s │ 1.354 MB/s │ 1.379 MB/s │ 1.382 MB/s │ │ alloc: │ │ │ │ │ - 4491022 │ 4491022 │ 4491022 │ 4491022 │ │ - 594.4 MB │ 594.4 MB │ 594.4 MB │ 594.4 MB │ │ + 4490084 │ 4490084 │ 4490084 │ 4490084 │ │ + 
594.3 MB │ 594.3 MB │ 594.3 MB │ 594.3 MB │ │ dealloc: │ │ │ │ │ - 4549075 │ 4549075 │ 4549075 │ 4549075 │ │ - 861 MB │ 861 MB │ 861 MB │ 861 MB │ │ + 4548137 │ 4548137 │ 4548137 │ 4548137 │ │ + 860.9 MB │ 860.9 MB │ 860.9 MB │ 860.9 MB │ │ grow: │ │ │ │ │ - 14385 │ 14385 │ 14385 │ 14385 │ │ + 14226 │ 14226 │ 14226 │ 14226 │ │ 261.5 MB │ 261.5 MB │ 261.5 MB │ 261.5 MB │ │ diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 4014fc3..6bd15bc 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -2,6 +2,19 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.2" @@ -757,6 +770,7 @@ checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f" name = "text-splitter" version = "0.7.0" dependencies = [ + "ahash", "auto_enums", "either", "itertools 0.12.1", @@ -944,3 +958,23 @@ name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] diff --git a/src/lib.rs b/src/lib.rs index 9493f1d..c9e7c14 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 
+6,7 @@ use std::{ ops::{Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive}, }; +use ahash::AHashMap; use itertools::Itertools; mod characters; @@ -83,6 +84,53 @@ pub trait ChunkSizer { fn chunk_size(&self, chunk: &str, capacity: &impl ChunkCapacity) -> ChunkSize; } +/// A memoized chunk sizer that caches the size of chunks. +/// Very helpful when the same chunk is being validated multiple times, which +/// happens often, and can be expensive to compute, such as with tokenizers. +#[derive(Debug)] +struct MemoizedChunkSizer<'sizer, S> +where + S: ChunkSizer, +{ + /// The sizer we are wrapping + sizer: &'sizer S, + /// Cache of chunk sizes per byte offset range + cache: AHashMap, ChunkSize>, +} + +impl<'sizer, S> MemoizedChunkSizer<'sizer, S> +where + S: ChunkSizer, +{ + /// Wrap any chunk sizer for memoization + fn new(sizer: &'sizer S) -> Self { + Self { + sizer, + cache: AHashMap::default(), + } + } + + /// Determine the size of a given chunk to use for validation, + /// returning a cached value if it exists, and storing the result if not. + fn chunk_size( + &mut self, + offset: usize, + chunk: &str, + capacity: &impl ChunkCapacity, + ) -> ChunkSize { + *self + .cache + .entry(offset..(offset + chunk.len())) + .or_insert_with(|| self.sizer.chunk_size(chunk, capacity)) + } + + /// Clear the cached values. Once we've moved the cursor, + /// we don't need to keep the old values around. + fn clear_cache(&mut self) { + self.cache.clear(); + } +} + /// Describes the largest valid chunk size(s) that can be generated. 
/// /// An `end` size is required, which is the maximum possible chunk size that @@ -276,9 +324,11 @@ where /// Size of the chunks to generate chunk_capacity: C, /// How to validate chunk sizes - chunk_sizer: &'sizer S, + chunk_sizer: MemoizedChunkSizer<'sizer, S>, /// Current byte offset in the `text` cursor: usize, + /// Reusable container for levels in remaining text to avoid extra allocations + levels_in_remaining_text: Vec, /// Reusable container for next sections to avoid extra allocations next_sections: Vec<(usize, &'text str)>, /// Splitter used for determining semantic levels. @@ -301,7 +351,8 @@ where Self { cursor: 0, chunk_capacity, - chunk_sizer, + chunk_sizer: MemoizedChunkSizer::new(chunk_sizer), + levels_in_remaining_text: Vec::new(), next_sections: Vec::new(), semantic_split: Sp::new(text), text, @@ -319,9 +370,11 @@ where } /// Is the given text within the chunk size? - fn check_capacity(&self, offset: usize, chunk: &str) -> ChunkSize { + fn check_capacity(&mut self, offset: usize, chunk: &str) -> ChunkSize { let (offset, chunk) = self.trim_chunk(offset, chunk); - let mut chunk_size = self.chunk_sizer.chunk_size(chunk, &self.chunk_capacity); + let mut chunk_size = self + .chunk_sizer + .chunk_size(offset, chunk, &self.chunk_capacity); if let Some(max_chunk_size_offset) = chunk_size.max_chunk_size_offset.as_mut() { *max_chunk_size_offset += offset; } @@ -332,19 +385,17 @@ where /// Returns final byte offset and str. /// Will return `None` if given an invalid range. 
fn next_chunk(&mut self) -> Option<(usize, &'text str)> { + // Reset caches so we can reuse the memory allocation + self.chunk_sizer.clear_cache(); + self.update_next_sections(); + let start = self.cursor; let mut end = self.cursor; let mut equals_found = false; - - self.update_next_sections(); - let mut sizes = self - .next_sections - .iter() - .map(|_| None) - .collect::>>(); let mut low = 0; let mut high = self.next_sections.len().saturating_sub(1); let mut successful_index = None; + let mut successful_chunk_size = None; while low <= high { let mid = low + (high - low) / 2; @@ -352,7 +403,6 @@ where let text_end = offset + str.len(); let chunk = self.text.get(start..text_end)?; let chunk_size = self.check_capacity(start, chunk); - sizes[mid] = Some(chunk_size); match chunk_size.fits { Ordering::Less => { @@ -360,6 +410,7 @@ where if text_end > end { end = text_end; successful_index = Some(mid); + successful_chunk_size = Some(chunk_size); } } Ordering::Equal => { @@ -367,6 +418,7 @@ where if text_end < end || !equals_found { end = text_end; successful_index = Some(mid); + successful_chunk_size = Some(chunk_size); } equals_found = true; } @@ -375,6 +427,7 @@ where if mid == 0 && start == end { end = text_end; successful_index = Some(mid); + successful_chunk_size = Some(chunk_size); } } }; @@ -391,36 +444,20 @@ where } // Sometimes with tokenization, we can get a bigger chunk for the same amount of tokens. 
- if let Some((successful_index, chunk_size)) = - successful_index.and_then(|successful_index| { - Some((successful_index, sizes.get(successful_index)?.as_ref()?)) - }) + if let (Some(successful_index), Some(chunk_size)) = + (successful_index, successful_chunk_size) { - for (size, (offset, str)) in sizes - .iter() - .zip(self.next_sections.iter()) - .skip(successful_index) - { + for index in successful_index..self.next_sections.len() { + let (offset, str) = self.next_sections[index]; let text_end = offset + str.len(); - match size { - Some(size) if size.size <= chunk_size.size => { - if text_end > end { - end = text_end; - } - } - // We didn't tokenize this section yet - None => { - let chunk = self.text.get(start..text_end)?; - let size = self.check_capacity(start, chunk); - if size.size <= chunk_size.size { - if text_end > end { - end = text_end; - } - } else { - break; - } + let chunk = self.text.get(start..text_end)?; + let size = self.check_capacity(start, chunk); + if size.size <= chunk_size.size { + if text_end > end { + end = text_end; } - _ => break, + } else { + break; } } } @@ -439,11 +476,13 @@ where fn update_next_sections(&mut self) { // First thing, clear out the list, but reuse the allocated memory self.next_sections.clear(); + self.levels_in_remaining_text.clear(); // Next levels to try. Will stop at max level. We check only levels in the next max level // chunk so we don't bypass it if not all levels are present in every chunk. - let mut levels = self.semantic_split.levels_in_remaining_text(self.cursor); + self.levels_in_remaining_text + .extend(self.semantic_split.levels_in_remaining_text(self.cursor)); // Get starting level - let Some(mut semantic_level) = levels.next() else { + let Some(mut semantic_level) = self.levels_in_remaining_text.first().copied() else { return; }; // If we aren't at the highest semantic level, stop iterating sections that go beyond the range of the next level. 
@@ -451,7 +490,8 @@ where let remaining_text = self.text.get(self.cursor..).unwrap(); - for level in levels { + for i in 0..self.levels_in_remaining_text.len() { + let level = self.levels_in_remaining_text[i]; let Some((_, str)) = self .semantic_split .semantic_chunks(self.cursor, remaining_text, level) @@ -512,6 +552,8 @@ where #[cfg(test)] mod tests { + use std::sync::atomic::{self, AtomicUsize}; + use super::*; #[test] @@ -633,4 +675,60 @@ mod tests { chunk_size ); } + + #[derive(Default)] + struct CountingSizer { + calls: AtomicUsize, + } + + impl ChunkSizer for CountingSizer { + // Return character version, but count calls + fn chunk_size(&self, chunk: &str, capacity: &impl ChunkCapacity) -> ChunkSize { + self.calls.fetch_add(1, atomic::Ordering::SeqCst); + Characters.chunk_size(chunk, capacity) + } + } + + #[test] + fn memoized_sizer_only_calculates_once_per_text() { + let sizer = CountingSizer::default(); + let mut memoized_sizer = MemoizedChunkSizer::new(&sizer); + let text = "1234567890"; + for _ in 0..10 { + memoized_sizer.chunk_size(0, text, &10); + } + + assert_eq!(memoized_sizer.sizer.calls.load(atomic::Ordering::SeqCst), 1); + } + + #[test] + fn memoized_sizer_calculates_once_per_different_text() { + let sizer = CountingSizer::default(); + let mut memoized_sizer = MemoizedChunkSizer::new(&sizer); + let text = "1234567890"; + for i in 0..10 { + memoized_sizer.chunk_size(0, text.get(0..i).unwrap(), &10); + } + + assert_eq!( + memoized_sizer.sizer.calls.load(atomic::Ordering::SeqCst), + 10 + ); + } + + #[test] + fn can_clear_cache_on_memoized_sizer() { + let sizer = CountingSizer::default(); + let mut memoized_sizer = MemoizedChunkSizer::new(&sizer); + let text = "1234567890"; + for _ in 0..10 { + memoized_sizer.chunk_size(0, text, &10); + memoized_sizer.clear_cache(); + } + + assert_eq!( + memoized_sizer.sizer.calls.load(atomic::Ordering::SeqCst), + 10 + ); + } } From 8f681cd77b20f805004001b29387bb303296d6e1 Mon Sep 17 00:00:00 2001 From: Ben Brandt 
Date: Sun, 24 Mar 2024 08:58:02 +0100 Subject: [PATCH 3/6] Simplify the level selection logic --- benches/output.txt | 198 ++++++++++++++++++++++----------------------- src/lib.rs | 11 +-- 2 files changed, 103 insertions(+), 106 deletions(-) diff --git a/benches/output.txt b/benches/output.txt index 46c2d47..b10667f 100644 --- a/benches/output.txt +++ b/benches/output.txt @@ -7,23 +7,23 @@ chunk_size fastest │ slowest │ median ├─ markdown │ │ │ │ │ │ ├─ characters │ │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 347.8 ms │ 381.3 ms │ 351.7 ms │ 355.2 ms │ 100 │ 100 -│ │ │ 589.4 KB/s │ 537.5 KB/s │ 582.8 KB/s │ 577.2 KB/s │ │ +│ │ │ ╰─ commonmark_spec 350.7 ms │ 377.3 ms │ 352.4 ms │ 352.7 ms │ 100 │ 100 +│ │ │ 584.5 KB/s │ 543.3 KB/s │ 581.7 KB/s │ 581.1 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 13270 │ 13270 │ 13270 │ 13137 │ │ -│ │ │ 75.99 MB │ 75.99 MB │ 75.99 MB │ 75.23 MB │ │ +│ │ │ 13270 │ 0 │ 13270 │ 13137 │ │ +│ │ │ 75.99 MB │ 0 B │ 75.99 MB │ 75.23 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 13271 │ 13271 │ 13271 │ 13138 │ │ -│ │ │ 281.4 MB │ 281.4 MB │ 281.4 MB │ 278.6 MB │ │ +│ │ │ 13271 │ 0 │ 13271 │ 13138 │ │ +│ │ │ 281.4 MB │ 0 B │ 281.4 MB │ 278.6 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 44735 │ 44735 │ 44735 │ 44287 │ │ -│ │ │ 205.2 MB │ 205.2 MB │ 205.2 MB │ 203.2 MB │ │ +│ │ │ 44735 │ 0 │ 44735 │ 44287 │ │ +│ │ │ 205.2 MB │ 0 B │ 205.2 MB │ 203.2 MB │ │ │ │ │ shrink: │ │ │ │ │ -│ │ │ 13 │ 13 │ 13 │ 12.87 │ │ -│ │ │ 94 B │ 94 B │ 94 B │ 93.06 B │ │ +│ │ │ 13 │ 0 │ 13 │ 12.87 │ │ +│ │ │ 94 B │ 0 B │ 94 B │ 93.06 B │ │ │ │ ├─ 512 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 47.33 ms │ 48.23 ms │ 47.57 ms │ 47.59 ms │ 100 │ 100 -│ │ │ 4.331 MB/s │ 4.25 MB/s │ 4.309 MB/s │ 4.307 MB/s │ │ +│ │ │ ╰─ commonmark_spec 47.85 ms │ 48.93 ms │ 48.15 ms │ 48.17 ms │ 100 │ 100 +│ │ │ 4.284 MB/s │ 4.189 MB/s │ 4.258 MB/s │ 4.256 MB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 1576 │ 1576 │ 1576 │ 1576 │ │ │ │ │ 9.241 MB │ 9.241 MB │ 9.241 MB │ 9.241 MB │ │ @@ -37,8 +37,8 @@ chunk_size 
fastest │ slowest │ median │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 4096 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 8.677 ms │ 9.057 ms │ 8.77 ms │ 8.771 ms │ 100 │ 100 -│ │ │ 23.62 MB/s │ 22.63 MB/s │ 23.37 MB/s │ 23.37 MB/s │ │ +│ │ │ ╰─ commonmark_spec 8.767 ms │ 9.241 ms │ 8.87 ms │ 8.88 ms │ 100 │ 100 +│ │ │ 23.38 MB/s │ 22.18 MB/s │ 23.11 MB/s │ 23.08 MB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 261 │ 261 │ 261 │ 261 │ │ │ │ │ 1.663 MB │ 1.663 MB │ 1.663 MB │ 1.663 MB │ │ @@ -52,8 +52,8 @@ chunk_size fastest │ slowest │ median │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ╰─ 32768 │ │ │ │ │ -│ │ ╰─ commonmark_spec 2.023 ms │ 2.209 ms │ 2.087 ms │ 2.077 ms │ 100 │ 100 -│ │ 101.3 MB/s │ 92.8 MB/s │ 98.21 MB/s │ 98.69 MB/s │ │ +│ │ ╰─ commonmark_spec 2.039 ms │ 2.24 ms │ 2.104 ms │ 2.112 ms │ 100 │ 100 +│ │ 100.5 MB/s │ 91.49 MB/s │ 97.41 MB/s │ 97.07 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 66 │ 66 │ 66 │ 66 │ │ │ │ 528.4 KB │ 528.4 KB │ 528.4 KB │ 528.4 KB │ │ @@ -68,8 +68,8 @@ chunk_size fastest │ slowest │ median │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ tiktoken │ │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 867.9 ms │ 899.5 ms │ 874.4 ms │ 874.7 ms │ 100 │ 100 -│ │ │ 236.2 KB/s │ 227.9 KB/s │ 234.4 KB/s │ 234.3 KB/s │ │ +│ │ │ ╰─ commonmark_spec 863.1 ms │ 876.3 ms │ 866.4 ms │ 866.7 ms │ 100 │ 100 +│ │ │ 237.5 KB/s │ 233.9 KB/s │ 236.6 KB/s │ 236.5 KB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 8103002 │ 8103002 │ 8103002 │ 8103002 │ │ │ │ │ 394.3 MB │ 394.3 MB │ 394.3 MB │ 394.3 MB │ │ @@ -83,8 +83,8 @@ chunk_size fastest │ slowest │ median │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 512 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 288.3 ms │ 294.6 ms │ 289.8 ms │ 290.3 ms │ 100 │ 100 -│ │ │ 711 KB/s │ 695.7 KB/s │ 707.2 KB/s │ 706 KB/s │ │ +│ │ │ ╰─ commonmark_spec 287.8 ms │ 293.5 ms │ 289.3 ms │ 289.3 ms │ 100 │ 100 +│ │ │ 712.3 KB/s │ 698.5 KB/s │ 708.5 KB/s │ 708.5 KB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 
2947758 │ 2947758 │ 2947758 │ 2947758 │ │ │ │ │ 138.2 MB │ 138.2 MB │ 138.2 MB │ 138.2 MB │ │ @@ -98,23 +98,23 @@ chunk_size fastest │ slowest │ median │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 4096 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 157.6 ms │ 166.5 ms │ 160.8 ms │ 160.9 ms │ 100 │ 100 -│ │ │ 1.3 MB/s │ 1.231 MB/s │ 1.274 MB/s │ 1.274 MB/s │ │ +│ │ │ ╰─ commonmark_spec 156.1 ms │ 159.7 ms │ 156.8 ms │ 156.8 ms │ 100 │ 100 +│ │ │ 1.312 MB/s │ 1.283 MB/s │ 1.307 MB/s │ 1.306 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 1658609 │ 1658609 │ 1658609 │ 1658609 │ │ -│ │ │ 76.82 MB │ 76.82 MB │ 76.82 MB │ 76.82 MB │ │ +│ │ │ 1652075 │ 1652075 │ 1652075 │ 1652075 │ │ +│ │ │ 76.51 MB │ 76.51 MB │ 76.51 MB │ 76.51 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 1970117 │ 1970117 │ 1970117 │ 1970117 │ │ -│ │ │ 151.1 MB │ 151.1 MB │ 151.1 MB │ 151.1 MB │ │ +│ │ │ 1963583 │ 1963583 │ 1963583 │ 1963583 │ │ +│ │ │ 150.6 MB │ 150.6 MB │ 150.6 MB │ 150.6 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 309881 │ 309881 │ 309881 │ 309881 │ │ -│ │ │ 55.82 MB │ 55.82 MB │ 55.82 MB │ 55.82 MB │ │ +│ │ │ 308306 │ 308306 │ 308306 │ 308306 │ │ +│ │ │ 55.62 MB │ 55.62 MB │ 55.62 MB │ 55.62 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ╰─ 32768 │ │ │ │ │ -│ │ ╰─ commonmark_spec 73.17 ms │ 76.31 ms │ 74.15 ms │ 74.43 ms │ 100 │ 100 -│ │ 2.801 MB/s │ 2.686 MB/s │ 2.764 MB/s │ 2.754 MB/s │ │ +│ │ ╰─ commonmark_spec 72.37 ms │ 74.02 ms │ 72.62 ms │ 72.65 ms │ 100 │ 100 +│ │ 2.832 MB/s │ 2.769 MB/s │ 2.822 MB/s │ 2.821 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 750072 │ 750072 │ 750072 │ 750072 │ │ │ │ 34.96 MB │ 34.96 MB │ 34.96 MB │ 34.96 MB │ │ @@ -129,8 +129,8 @@ chunk_size fastest │ slowest │ median │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ╰─ tokenizers │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ ╰─ commonmark_spec 1.57 s │ 1.665 s │ 1.59 s │ 1.59 s │ 100 │ 100 -│ │ 130.5 KB/s │ 123 KB/s │ 128.8 KB/s │ 128.8 KB/s │ │ +│ │ ╰─ commonmark_spec 1.577 s │ 1.804 s │ 
1.59 s │ 1.599 s │ 100 │ 100 +│ │ 129.9 KB/s │ 113.6 KB/s │ 128.8 KB/s │ 128.2 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 39101785 │ 39101785 │ 39101785 │ 39101785 │ │ │ │ 3.731 GB │ 3.731 GB │ 3.731 GB │ 3.731 GB │ │ @@ -144,8 +144,8 @@ chunk_size fastest │ slowest │ median │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ 512 │ │ │ │ │ -│ │ ╰─ commonmark_spec 627.5 ms │ 668.4 ms │ 647 ms │ 645.6 ms │ 100 │ 100 -│ │ 326.7 KB/s │ 306.7 KB/s │ 316.8 KB/s │ 317.5 KB/s │ │ +│ │ ╰─ commonmark_spec 619.1 ms │ 701.6 ms │ 623.1 ms │ 627.1 ms │ 100 │ 100 +│ │ 331.1 KB/s │ 292.2 KB/s │ 329 KB/s │ 326.9 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 16074029 │ 16074029 │ 16074029 │ 16074029 │ │ │ │ 1.579 GB │ 1.579 GB │ 1.579 GB │ 1.579 GB │ │ @@ -159,23 +159,23 @@ chunk_size fastest │ slowest │ median │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ 4096 │ │ │ │ │ -│ │ ╰─ commonmark_spec 336.7 ms │ 342.1 ms │ 338.2 ms │ 338.3 ms │ 100 │ 100 -│ │ 608.8 KB/s │ 599.2 KB/s │ 606.2 KB/s │ 605.9 KB/s │ │ +│ │ ╰─ commonmark_spec 325.7 ms │ 339.3 ms │ 327.3 ms │ 327.4 ms │ 100 │ 100 +│ │ 629.4 KB/s │ 604.2 KB/s │ 626.3 KB/s │ 626.1 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 8511827 │ 8511827 │ 8511827 │ 8511827 │ │ -│ │ 845.9 MB │ 845.9 MB │ 845.9 MB │ 845.9 MB │ │ +│ │ 8491159 │ 8491159 │ 8491159 │ 8491159 │ │ +│ │ 843.3 MB │ 843.3 MB │ 843.3 MB │ 843.3 MB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 8569880 │ 8569880 │ 8569880 │ 8569880 │ │ -│ │ 1.296 GB │ 1.296 GB │ 1.296 GB │ 1.296 GB │ │ +│ │ 8549212 │ 8549212 │ 8549212 │ 8549212 │ │ +│ │ 1.292 GB │ 1.292 GB │ 1.292 GB │ 1.292 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 168315 │ 168315 │ 168315 │ 168315 │ │ -│ │ 445.2 MB │ 445.2 MB │ 445.2 MB │ 445.2 MB │ │ +│ │ 168023 │ 168023 │ 168023 │ 168023 │ │ +│ │ 444.1 MB │ 444.1 MB │ 444.1 MB │ 444.1 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ╰─ 32768 │ │ │ │ │ -│ ╰─ commonmark_spec 180.4 ms │ 194 ms │ 188.1 ms │ 188 ms │ 100 │ 100 -│ 1.135 MB/s │ 1.056 MB/s │ 
1.089 MB/s │ 1.09 MB/s │ │ +│ ╰─ commonmark_spec 179.8 ms │ 186 ms │ 181.5 ms │ 182.1 ms │ 100 │ 100 +│ 1.14 MB/s │ 1.101 MB/s │ 1.129 MB/s │ 1.125 MB/s │ │ │ alloc: │ │ │ │ │ │ 4579920 │ 4579920 │ 4579920 │ 4579920 │ │ │ 460.2 MB │ 460.2 MB │ 460.2 MB │ 460.2 MB │ │ @@ -191,22 +191,22 @@ chunk_size fastest │ slowest │ median ╰─ text │ │ │ │ │ ├─ characters │ │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 207.4 ms │ 209.3 ms │ 207.8 ms │ 207.9 ms │ 100 │ 100 - │ │ │ 788.8 KB/s │ 781.6 KB/s │ 787.1 KB/s │ 786.7 KB/s │ │ + │ │ ├─ romeo_and_juliet 207.5 ms │ 209.9 ms │ 207.8 ms │ 207.8 ms │ 100 │ 100 + │ │ │ 788.3 KB/s │ 779.4 KB/s │ 787.4 KB/s │ 787 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 11188 │ 11464 │ 11188 │ 11190 │ │ - │ │ │ 32.32 MB │ 32.34 MB │ 32.32 MB │ 32.32 MB │ │ + │ │ │ 11188 │ 11188 │ 11188 │ 11190 │ │ + │ │ │ 32.32 MB │ 32.32 MB │ 32.32 MB │ 32.32 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 11189 │ 11385 │ 11189 │ 11190 │ │ + │ │ │ 11189 │ 11189 │ 11189 │ 11190 │ │ │ │ │ 121.8 MB │ 121.8 MB │ 121.8 MB │ 121.8 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 33449 │ 33488 │ 33449 │ 33449 │ │ - │ │ │ 89.36 MB │ 89.37 MB │ 89.36 MB │ 89.36 MB │ │ + │ │ │ 33449 │ 33449 │ 33449 │ 33449 │ │ + │ │ │ 89.36 MB │ 89.36 MB │ 89.36 MB │ 89.36 MB │ │ │ │ │ shrink: │ │ │ │ │ - │ │ │ 0 │ 5 │ 0 │ 0.05 │ │ - │ │ │ 0 B │ 2.34 KB │ 0 B │ 23.4 B │ │ - │ │ ╰─ room_with_a_view 163.1 ms │ 164.1 ms │ 163.4 ms │ 163.4 ms │ 100 │ 100 - │ │ 1.85 MB/s │ 1.839 MB/s │ 1.846 MB/s │ 1.846 MB/s │ │ + │ │ │ 0 │ 0 │ 0 │ 0.05 │ │ + │ │ │ 0 B │ 0 B │ 0 B │ 23.4 B │ │ + │ │ ╰─ room_with_a_view 163.1 ms │ 165 ms │ 163.4 ms │ 163.5 ms │ 100 │ 100 + │ │ 1.85 MB/s │ 1.829 MB/s │ 1.846 MB/s │ 1.846 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 18430 │ 18430 │ 18430 │ 18430 │ │ │ │ 26.32 MB │ 26.32 MB │ 26.32 MB │ 26.32 MB │ │ @@ -217,8 +217,8 @@ chunk_size fastest │ slowest │ median │ │ 48815 │ 48815 │ 48815 │ 48815 │ │ │ │ 66.19 MB │ 66.19 MB │ 66.19 MB │ 66.19 MB │ │ │ ├─ 512 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 
24.72 ms │ 25.54 ms │ 24.87 ms │ 24.87 ms │ 100 │ 100 - │ │ │ 6.616 MB/s │ 6.405 MB/s │ 6.578 MB/s │ 6.577 MB/s │ │ + │ │ ├─ romeo_and_juliet 24.71 ms │ 25.77 ms │ 24.87 ms │ 24.89 ms │ 100 │ 100 + │ │ │ 6.619 MB/s │ 6.349 MB/s │ 6.578 MB/s │ 6.572 MB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 1200 │ 1200 │ 1200 │ 1200 │ │ │ │ │ 3.479 MB │ 3.479 MB │ 3.479 MB │ 3.479 MB │ │ @@ -228,8 +228,8 @@ chunk_size fastest │ slowest │ median │ │ │ grow: │ │ │ │ │ │ │ │ 3595 │ 3595 │ 3595 │ 3595 │ │ │ │ │ 9.941 MB │ 9.941 MB │ 9.941 MB │ 9.941 MB │ │ - │ │ ╰─ room_with_a_view 25.33 ms │ 26.04 ms │ 25.44 ms │ 25.45 ms │ 100 │ 100 - │ │ 11.91 MB/s │ 11.59 MB/s │ 11.86 MB/s │ 11.86 MB/s │ │ + │ │ ╰─ room_with_a_view 25.47 ms │ 26.45 ms │ 25.59 ms │ 25.61 ms │ 100 │ 100 + │ │ 11.84 MB/s │ 11.41 MB/s │ 11.79 MB/s │ 11.78 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 2349 │ 2349 │ 2349 │ 2349 │ │ │ │ 3.353 MB │ 3.353 MB │ 3.353 MB │ 3.353 MB │ │ @@ -240,8 +240,8 @@ chunk_size fastest │ slowest │ median │ │ 6219 │ 6219 │ 6219 │ 6219 │ │ │ │ 8.522 MB │ 8.522 MB │ 8.522 MB │ 8.522 MB │ │ │ ├─ 4096 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 3.675 ms │ 3.99 ms │ 3.767 ms │ 3.769 ms │ 100 │ 100 - │ │ │ 44.52 MB/s │ 41 MB/s │ 43.42 MB/s │ 43.4 MB/s │ │ + │ │ ├─ romeo_and_juliet 3.655 ms │ 4.162 ms │ 3.747 ms │ 3.763 ms │ 100 │ 100 + │ │ │ 44.76 MB/s │ 39.31 MB/s │ 43.66 MB/s │ 43.47 MB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 141 │ 141 │ 141 │ 141 │ │ │ │ │ 406.1 KB │ 406.1 KB │ 406.1 KB │ 406.1 KB │ │ @@ -251,8 +251,8 @@ chunk_size fastest │ slowest │ median │ │ │ grow: │ │ │ │ │ │ │ │ 426 │ 426 │ 426 │ 426 │ │ │ │ │ 1.42 MB │ 1.42 MB │ 1.42 MB │ 1.42 MB │ │ - │ │ ╰─ room_with_a_view 4.963 ms │ 5.236 ms │ 5.05 ms │ 5.044 ms │ 100 │ 100 - │ │ 60.82 MB/s │ 57.65 MB/s │ 59.78 MB/s │ 59.84 MB/s │ │ + │ │ ╰─ room_with_a_view 4.938 ms │ 5.272 ms │ 5.015 ms │ 5.019 ms │ 100 │ 100 + │ │ 61.13 MB/s │ 57.26 MB/s │ 60.19 MB/s │ 60.14 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 304 │ 304 │ 304 │ 304 │ │ │ │ 430 KB │ 430 KB │ 430 KB │ 430 KB │ 
│ @@ -263,8 +263,8 @@ chunk_size fastest │ slowest │ median │ │ 812 │ 812 │ 812 │ 812 │ │ │ │ 1.154 MB │ 1.154 MB │ 1.154 MB │ 1.154 MB │ │ │ ╰─ 32768 │ │ │ │ │ - │ ├─ romeo_and_juliet 1.502 ms │ 1.655 ms │ 1.585 ms │ 1.583 ms │ 100 │ 100 - │ │ 108.8 MB/s │ 98.84 MB/s │ 103.2 MB/s │ 103.3 MB/s │ │ + │ ├─ romeo_and_juliet 1.515 ms │ 1.916 ms │ 1.592 ms │ 1.599 ms │ 100 │ 100 + │ │ 107.9 MB/s │ 85.38 MB/s │ 102.7 MB/s │ 102.2 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 33 │ 33 │ 33 │ 33 │ │ │ │ 107 KB │ 107 KB │ 107 KB │ 107 KB │ │ @@ -274,8 +274,8 @@ chunk_size fastest │ slowest │ median │ │ grow: │ │ │ │ │ │ │ 107 │ 107 │ 107 │ 107 │ │ │ │ 597.6 KB │ 597.6 KB │ 597.6 KB │ 597.6 KB │ │ - │ ╰─ room_with_a_view 1.755 ms │ 1.897 ms │ 1.837 ms │ 1.834 ms │ 100 │ 100 - │ 171.9 MB/s │ 159.1 MB/s │ 164.3 MB/s │ 164.6 MB/s │ │ + │ ╰─ room_with_a_view 1.762 ms │ 1.928 ms │ 1.821 ms │ 1.825 ms │ 100 │ 100 + │ 171.3 MB/s │ 156.5 MB/s │ 165.7 MB/s │ 165.3 MB/s │ │ │ alloc: │ │ │ │ │ │ 42 │ 42 │ 42 │ 42 │ │ │ 56.02 KB │ 56.02 KB │ 56.02 KB │ 56.02 KB │ │ @@ -287,8 +287,8 @@ chunk_size fastest │ slowest │ median │ 206.4 KB │ 206.4 KB │ 206.4 KB │ 206.4 KB │ │ ├─ tiktoken │ │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 802.3 ms │ 828.6 ms │ 812.9 ms │ 814.6 ms │ 100 │ 100 - │ │ │ 203.9 KB/s │ 197.4 KB/s │ 201.2 KB/s │ 200.8 KB/s │ │ + │ │ ├─ romeo_and_juliet 794.7 ms │ 823.4 ms │ 802.7 ms │ 803.1 ms │ 100 │ 100 + │ │ │ 205.8 KB/s │ 198.7 KB/s │ 203.8 KB/s │ 203.7 KB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 8687901 │ 8687901 │ 8687901 │ 8687901 │ │ │ │ │ 413.2 MB │ 413.2 MB │ 413.2 MB │ 413.2 MB │ │ @@ -298,8 +298,8 @@ chunk_size fastest │ slowest │ median │ │ │ grow: │ │ │ │ │ │ │ │ 1801556 │ 1801556 │ 1801556 │ 1801556 │ │ │ │ │ 245.2 MB │ 245.2 MB │ 245.2 MB │ 245.2 MB │ │ - │ │ ╰─ room_with_a_view 1.064 s │ 1.153 s │ 1.072 s │ 1.077 s │ 100 │ 100 - │ │ 283.6 KB/s │ 261.8 KB/s │ 281.4 KB/s │ 280.1 KB/s │ │ + │ │ ╰─ room_with_a_view 1.051 s │ 1.381 s │ 1.062 s │ 1.065 s │ 100 │ 100 + │ │ 
287 KB/s │ 218.6 KB/s │ 284.1 KB/s │ 283.3 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 11500303 │ 11500303 │ 11500303 │ 11500303 │ │ │ │ 551.9 MB │ 551.9 MB │ 551.9 MB │ 551.9 MB │ │ @@ -310,8 +310,8 @@ chunk_size fastest │ slowest │ median │ │ 2834270 │ 2834270 │ 2834270 │ 2834270 │ │ │ │ 370.6 MB │ 370.6 MB │ 370.6 MB │ 370.6 MB │ │ │ ├─ 512 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 262.6 ms │ 268.1 ms │ 264.2 ms │ 264.4 ms │ 100 │ 100 - │ │ │ 622.8 KB/s │ 610.3 KB/s │ 619.1 KB/s │ 618.8 KB/s │ │ + │ │ ├─ romeo_and_juliet 258.8 ms │ 266.9 ms │ 260.3 ms │ 260.6 ms │ 100 │ 100 + │ │ │ 632 KB/s │ 612.9 KB/s │ 628.4 KB/s │ 627.8 KB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 2921740 │ 2921740 │ 2921740 │ 2921740 │ │ │ │ │ 137.3 MB │ 137.3 MB │ 137.3 MB │ 137.3 MB │ │ @@ -321,8 +321,8 @@ chunk_size fastest │ slowest │ median │ │ │ grow: │ │ │ │ │ │ │ │ 606069 │ 606069 │ 606069 │ 606069 │ │ │ │ │ 78.61 MB │ 78.61 MB │ 78.61 MB │ 78.61 MB │ │ - │ │ ╰─ room_with_a_view 438.5 ms │ 460.3 ms │ 443.3 ms │ 444.8 ms │ 100 │ 100 - │ │ 688.4 KB/s │ 655.8 KB/s │ 681 KB/s │ 678.7 KB/s │ │ + │ │ ╰─ room_with_a_view 436.4 ms │ 445.7 ms │ 439.1 ms │ 439.3 ms │ 100 │ 100 + │ │ 691.7 KB/s │ 677.3 KB/s │ 687.4 KB/s │ 687.2 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 4881128 │ 4881128 │ 4881128 │ 4881128 │ │ │ │ 232.3 MB │ 232.3 MB │ 232.3 MB │ 232.3 MB │ │ @@ -333,8 +333,8 @@ chunk_size fastest │ slowest │ median │ │ 1197369 │ 1197369 │ 1197369 │ 1197369 │ │ │ │ 152.9 MB │ 152.9 MB │ 152.9 MB │ 152.9 MB │ │ │ ├─ 4096 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 178.2 ms │ 182.5 ms │ 179.4 ms │ 179.5 ms │ 100 │ 100 - │ │ │ 917.7 KB/s │ 896.2 KB/s │ 911.6 KB/s │ 911.4 KB/s │ │ + │ │ ├─ romeo_and_juliet 177 ms │ 180.1 ms │ 178 ms │ 178 ms │ 100 │ 100 + │ │ │ 923.9 KB/s │ 908.1 KB/s │ 918.8 KB/s │ 918.9 KB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 2018346 │ 2018346 │ 2018346 │ 2018346 │ │ │ │ │ 94.51 MB │ 94.51 MB │ 94.51 MB │ 94.51 MB │ │ @@ -344,8 +344,8 @@ chunk_size fastest │ slowest │ median │ │ │ grow: │ │ │ │ │ │ │ │ 418451 │ 418451 
│ 418451 │ 418451 │ │ │ │ │ 53.36 MB │ 53.36 MB │ 53.36 MB │ 53.36 MB │ │ - │ │ ╰─ room_with_a_view 319.3 ms │ 326.4 ms │ 323 ms │ 323 ms │ 100 │ 100 - │ │ 945.3 KB/s │ 924.8 KB/s │ 934.4 KB/s │ 934.4 KB/s │ │ + │ │ ╰─ room_with_a_view 317.7 ms │ 335.2 ms │ 320 ms │ 320.6 ms │ 100 │ 100 + │ │ 950.1 KB/s │ 900.6 KB/s │ 943.2 KB/s │ 941.4 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 3573121 │ 3573121 │ 3573121 │ 3573121 │ │ │ │ 169.7 MB │ 169.7 MB │ 169.7 MB │ 169.7 MB │ │ @@ -356,8 +356,8 @@ chunk_size fastest │ slowest │ median │ │ 874507 │ 874507 │ 874507 │ 874507 │ │ │ │ 110.8 MB │ 110.8 MB │ 110.8 MB │ 110.8 MB │ │ │ ╰─ 32768 │ │ │ │ │ - │ ├─ romeo_and_juliet 81.77 ms │ 89.6 ms │ 82.34 ms │ 82.77 ms │ 100 │ 100 - │ │ 2 MB/s │ 1.826 MB/s │ 1.987 MB/s │ 1.976 MB/s │ │ + │ ├─ romeo_and_juliet 81.79 ms │ 84.17 ms │ 82.27 ms │ 82.31 ms │ 100 │ 100 + │ │ 2 MB/s │ 1.943 MB/s │ 1.988 MB/s │ 1.987 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 914681 │ 914681 │ 914681 │ 914681 │ │ │ │ 42.86 MB │ 42.86 MB │ 42.86 MB │ 42.86 MB │ │ @@ -367,8 +367,8 @@ chunk_size fastest │ slowest │ median │ │ grow: │ │ │ │ │ │ │ 187707 │ 187707 │ 187707 │ 187707 │ │ │ │ 24.37 MB │ 24.37 MB │ 24.37 MB │ 24.37 MB │ │ - │ ╰─ room_with_a_view 112.1 ms │ 117.4 ms │ 112.9 ms │ 113.3 ms │ 100 │ 100 - │ 2.691 MB/s │ 2.57 MB/s │ 2.672 MB/s │ 2.663 MB/s │ │ + │ ╰─ room_with_a_view 111.9 ms │ 119.1 ms │ 112.8 ms │ 112.9 ms │ 100 │ 100 + │ 2.696 MB/s │ 2.533 MB/s │ 2.675 MB/s │ 2.671 MB/s │ │ │ alloc: │ │ │ │ │ │ 1232443 │ 1232443 │ 1232443 │ 1232443 │ │ │ 58.6 MB │ 58.6 MB │ 58.6 MB │ 58.6 MB │ │ @@ -380,8 +380,8 @@ chunk_size fastest │ slowest │ median │ 38.19 MB │ 38.19 MB │ 38.19 MB │ 38.19 MB │ │ ╰─ tokenizers │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ ├─ romeo_and_juliet 1.381 s │ 1.448 s │ 1.404 s │ 1.403 s │ 100 │ 100 - │ │ 118.4 KB/s │ 112.9 KB/s │ 116.4 KB/s │ 116.5 KB/s │ │ + │ ├─ romeo_and_juliet 1.39 s │ 1.502 s │ 1.406 s │ 1.41 s │ 100 │ 100 + │ │ 117.6 KB/s │ 108.8 KB/s │ 116.3 KB/s │ 116 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 
29188728 │ 29188728 │ 29188728 │ 29188728 │ │ │ │ 3.601 GB │ 3.601 GB │ 3.601 GB │ 3.601 GB │ │ @@ -391,8 +391,8 @@ chunk_size fastest │ slowest │ median │ │ grow: │ │ │ │ │ │ │ 463032 │ 463032 │ 463032 │ 463032 │ │ │ │ 1.608 GB │ 1.608 GB │ 1.608 GB │ 1.608 GB │ │ - │ ╰─ room_with_a_view 1.906 s │ 1.946 s │ 1.92 s │ 1.922 s │ 100 │ 100 - │ 158.3 KB/s │ 155.1 KB/s │ 157.2 KB/s │ 157 KB/s │ │ + │ ╰─ room_with_a_view 1.933 s │ 2.107 s │ 1.943 s │ 1.946 s │ 100 │ 100 + │ 156.1 KB/s │ 143.2 KB/s │ 155.3 KB/s │ 155 KB/s │ │ │ alloc: │ │ │ │ │ │ 39390416 │ 39390416 │ 39390416 │ 39390416 │ │ │ 5.158 GB │ 5.158 GB │ 5.158 GB │ 5.158 GB │ │ @@ -403,8 +403,8 @@ chunk_size fastest │ slowest │ median │ 681205 │ 681205 │ 681205 │ 681205 │ │ │ 2.257 GB │ 2.257 GB │ 2.257 GB │ 2.257 GB │ │ ├─ 512 │ │ │ │ │ - │ ├─ romeo_and_juliet 430.1 ms │ 497.1 ms │ 439.3 ms │ 442.9 ms │ 100 │ 100 - │ │ 380.3 KB/s │ 329.1 KB/s │ 372.4 KB/s │ 369.3 KB/s │ │ + │ ├─ romeo_and_juliet 432.2 ms │ 443 ms │ 437.7 ms │ 437.4 ms │ 100 │ 100 + │ │ 378.5 KB/s │ 369.2 KB/s │ 373.8 KB/s │ 374 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 9331641 │ 9331641 │ 9331641 │ 9331641 │ │ │ │ 1.159 GB │ 1.159 GB │ 1.159 GB │ 1.159 GB │ │ @@ -414,8 +414,8 @@ chunk_size fastest │ slowest │ median │ │ grow: │ │ │ │ │ │ │ 100059 │ 100059 │ 100059 │ 100059 │ │ │ │ 513.7 MB │ 513.7 MB │ 513.7 MB │ 513.7 MB │ │ - │ ╰─ room_with_a_view 786.4 ms │ 882.8 ms │ 791.5 ms │ 796.9 ms │ 100 │ 100 - │ 383.8 KB/s │ 341.9 KB/s │ 381.3 KB/s │ 378.8 KB/s │ │ + │ ╰─ room_with_a_view 793.5 ms │ 831.4 ms │ 798.5 ms │ 800.2 ms │ 100 │ 100 + │ 380.4 KB/s │ 363.1 KB/s │ 378 KB/s │ 377.2 KB/s │ │ │ alloc: │ │ │ │ │ │ 16335240 │ 16335240 │ 16335240 │ 16335240 │ │ │ 2.154 GB │ 2.154 GB │ 2.154 GB │ 2.154 GB │ │ @@ -426,8 +426,8 @@ chunk_size fastest │ slowest │ median │ 165455 │ 165455 │ 165455 │ 165455 │ │ │ 945.8 MB │ 945.8 MB │ 945.8 MB │ 945.8 MB │ │ ├─ 4096 │ │ │ │ │ - │ ├─ romeo_and_juliet 294.7 ms │ 301.6 ms │ 295.5 ms │ 296.4 ms │ 100 │ 100 - │ │ 555 
KB/s │ 542.3 KB/s │ 553.6 KB/s │ 551.8 KB/s │ │ + │ ├─ romeo_and_juliet 297.7 ms │ 300.7 ms │ 298.6 ms │ 298.7 ms │ 100 │ 100 + │ │ 549.5 KB/s │ 544 KB/s │ 547.9 KB/s │ 547.6 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 6432740 │ 6432740 │ 6432740 │ 6432740 │ │ │ │ 802.5 MB │ 802.5 MB │ 802.5 MB │ 802.5 MB │ │ @@ -437,8 +437,8 @@ chunk_size fastest │ slowest │ median │ │ grow: │ │ │ │ │ │ │ 34250 │ 34250 │ 34250 │ 34250 │ │ │ │ 356.8 MB │ 356.8 MB │ 356.8 MB │ 356.8 MB │ │ - │ ╰─ room_with_a_view 550.8 ms │ 580.6 ms │ 565.2 ms │ 561.8 ms │ 100 │ 100 - │ 548.1 KB/s │ 519.9 KB/s │ 534.1 KB/s │ 537.3 KB/s │ │ + │ ╰─ room_with_a_view 560.3 ms │ 575.2 ms │ 565.8 ms │ 565.8 ms │ 100 │ 100 + │ 538.8 KB/s │ 524.8 KB/s │ 533.5 KB/s │ 533.5 KB/s │ │ │ alloc: │ │ │ │ │ │ 11604279 │ 11604279 │ 11604279 │ 11604279 │ │ │ 1.536 GB │ 1.536 GB │ 1.536 GB │ 1.536 GB │ │ @@ -449,8 +449,8 @@ chunk_size fastest │ slowest │ median │ 55754 │ 55754 │ 55754 │ 55754 │ │ │ 674.3 MB │ 674.3 MB │ 674.3 MB │ 674.3 MB │ │ ╰─ 32768 │ │ │ │ │ - ├─ romeo_and_juliet 130.7 ms │ 133.1 ms │ 131.2 ms │ 131.2 ms │ 100 │ 100 - │ 1.251 MB/s │ 1.228 MB/s │ 1.246 MB/s │ 1.246 MB/s │ │ + ├─ romeo_and_juliet 131 ms │ 134.2 ms │ 131.6 ms │ 131.7 ms │ 100 │ 100 + │ 1.248 MB/s │ 1.218 MB/s │ 1.242 MB/s │ 1.241 MB/s │ │ │ alloc: │ │ │ │ │ │ 2845252 │ 2845252 │ 2845252 │ 2845252 │ │ │ 353.7 MB │ 353.7 MB │ 353.7 MB │ 353.7 MB │ │ @@ -460,8 +460,8 @@ chunk_size fastest │ slowest │ median │ grow: │ │ │ │ │ │ 9650 │ 9650 │ 9650 │ 9650 │ │ │ 158.8 MB │ 158.8 MB │ 158.8 MB │ 158.8 MB │ │ - ╰─ room_with_a_view 215.9 ms │ 222.8 ms │ 218.8 ms │ 218.4 ms │ 100 │ 100 - 1.398 MB/s │ 1.354 MB/s │ 1.379 MB/s │ 1.382 MB/s │ │ + ╰─ room_with_a_view 218.2 ms │ 230.2 ms │ 219.1 ms │ 219.4 ms │ 100 │ 100 + 1.383 MB/s │ 1.311 MB/s │ 1.377 MB/s │ 1.375 MB/s │ │ alloc: │ │ │ │ │ 4490084 │ 4490084 │ 4490084 │ 4490084 │ │ 594.3 MB │ 594.3 MB │ 594.3 MB │ 594.3 MB │ │ diff --git a/src/lib.rs b/src/lib.rs index c9e7c14..f94b988 100644 --- 
a/src/lib.rs +++ b/src/lib.rs @@ -482,15 +482,13 @@ where self.levels_in_remaining_text .extend(self.semantic_split.levels_in_remaining_text(self.cursor)); // Get starting level - let Some(mut semantic_level) = self.levels_in_remaining_text.first().copied() else { - return; - }; + let mut semantic_level = self.levels_in_remaining_text[0]; // If we aren't at the highest semantic level, stop iterating sections that go beyond the range of the next level. let mut max_encoded_offset = None; let remaining_text = self.text.get(self.cursor..).unwrap(); - for i in 0..self.levels_in_remaining_text.len() { + for i in 1..self.levels_in_remaining_text.len() { let level = self.levels_in_remaining_text[i]; let Some((_, str)) = self .semantic_split @@ -500,9 +498,8 @@ where return; }; let chunk_size = self.check_capacity(self.cursor, str); - // If this no longer fits, we use the level we are at. Or if we already - // have the rest of the string - if chunk_size.fits.is_gt() || remaining_text == str { + // If this no longer fits, we use the level we are at. 
+ if chunk_size.fits.is_gt() { max_encoded_offset = chunk_size.max_chunk_size_offset; break; } From 957eb53f77e9a3af2cbaa534aff7c399d38fcbb1 Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Mon, 25 Mar 2024 13:32:22 +0100 Subject: [PATCH 4/6] Remove extra allocation for ranges --- benches/output.txt | 376 ++++++++++++++++++++++----------------------- src/lib.rs | 109 +++++++------ 2 files changed, 241 insertions(+), 244 deletions(-) diff --git a/benches/output.txt b/benches/output.txt index b10667f..87173d0 100644 --- a/benches/output.txt +++ b/benches/output.txt @@ -7,183 +7,183 @@ chunk_size fastest │ slowest │ median ├─ markdown │ │ │ │ │ │ ├─ characters │ │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 350.7 ms │ 377.3 ms │ 352.4 ms │ 352.7 ms │ 100 │ 100 -│ │ │ 584.5 KB/s │ 543.3 KB/s │ 581.7 KB/s │ 581.1 KB/s │ │ +│ │ │ ╰─ commonmark_spec 329.8 ms │ 360.1 ms │ 332.6 ms │ 333.2 ms │ 100 │ 100 +│ │ │ 621.5 KB/s │ 569.2 KB/s │ 616.3 KB/s │ 615.2 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 13270 │ 0 │ 13270 │ 13137 │ │ +│ │ │ 13269 │ 0 │ 13269 │ 13136 │ │ │ │ │ 75.99 MB │ 0 B │ 75.99 MB │ 75.23 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 13271 │ 0 │ 13271 │ 13138 │ │ +│ │ │ 13270 │ 0 │ 13270 │ 13137 │ │ │ │ │ 281.4 MB │ 0 B │ 281.4 MB │ 278.6 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 44735 │ 0 │ 44735 │ 44287 │ │ +│ │ │ 44733 │ 0 │ 44733 │ 44285 │ │ │ │ │ 205.2 MB │ 0 B │ 205.2 MB │ 203.2 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 0 │ 13 │ 12.87 │ │ │ │ │ 94 B │ 0 B │ 94 B │ 93.06 B │ │ │ │ ├─ 512 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 47.85 ms │ 48.93 ms │ 48.15 ms │ 48.17 ms │ 100 │ 100 -│ │ │ 4.284 MB/s │ 4.189 MB/s │ 4.258 MB/s │ 4.256 MB/s │ │ +│ │ │ ╰─ commonmark_spec 45.92 ms │ 46.7 ms │ 46.12 ms │ 46.15 ms │ 100 │ 100 +│ │ │ 4.464 MB/s │ 4.389 MB/s │ 4.444 MB/s │ 4.441 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 1576 │ 1576 │ 1576 │ 1576 │ │ +│ │ │ 1575 │ 1575 │ 1575 │ 1575 │ │ │ │ │ 9.241 MB │ 9.241 MB │ 9.241 MB │ 9.241 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 1577 │ 1577 │ 
1577 │ 1577 │ │ +│ │ │ 1576 │ 1576 │ 1576 │ 1576 │ │ │ │ │ 34.52 MB │ 34.52 MB │ 34.52 MB │ 34.52 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 5255 │ 5255 │ 5255 │ 5255 │ │ +│ │ │ 5253 │ 5253 │ 5253 │ 5253 │ │ │ │ │ 25.08 MB │ 25.08 MB │ 25.08 MB │ 25.08 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 4096 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 8.767 ms │ 9.241 ms │ 8.87 ms │ 8.88 ms │ 100 │ 100 -│ │ │ 23.38 MB/s │ 22.18 MB/s │ 23.11 MB/s │ 23.08 MB/s │ │ +│ │ │ ╰─ commonmark_spec 8.379 ms │ 8.773 ms │ 8.426 ms │ 8.433 ms │ 100 │ 100 +│ │ │ 24.46 MB/s │ 23.36 MB/s │ 24.33 MB/s │ 24.31 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 261 │ 261 │ 261 │ 261 │ │ +│ │ │ 260 │ 260 │ 260 │ 260 │ │ │ │ │ 1.663 MB │ 1.663 MB │ 1.663 MB │ 1.663 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 262 │ 262 │ 262 │ 262 │ │ +│ │ │ 261 │ 261 │ 261 │ 261 │ │ │ │ │ 6.346 MB │ 6.346 MB │ 6.346 MB │ 6.346 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 814 │ 814 │ 814 │ 814 │ │ +│ │ │ 812 │ 812 │ 812 │ 812 │ │ │ │ │ 4.478 MB │ 4.478 MB │ 4.478 MB │ 4.478 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ╰─ 32768 │ │ │ │ │ -│ │ ╰─ commonmark_spec 2.039 ms │ 2.24 ms │ 2.104 ms │ 2.112 ms │ 100 │ 100 -│ │ 100.5 MB/s │ 91.49 MB/s │ 97.41 MB/s │ 97.07 MB/s │ │ +│ │ ╰─ commonmark_spec 1.946 ms │ 2.138 ms │ 1.956 ms │ 1.965 ms │ 100 │ 100 +│ │ 105.3 MB/s │ 95.87 MB/s │ 104.7 MB/s │ 104.3 MB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 66 │ 66 │ 66 │ 66 │ │ +│ │ 65 │ 65 │ 65 │ 65 │ │ │ │ 528.4 KB │ 528.4 KB │ 528.4 KB │ 528.4 KB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 67 │ 67 │ 67 │ 67 │ │ +│ │ 66 │ 66 │ 66 │ 66 │ │ │ │ 2.122 MB │ 2.122 MB │ 2.122 MB │ 2.122 MB │ │ │ │ grow: │ │ │ │ │ -│ │ 150 │ 150 │ 150 │ 150 │ │ +│ │ 148 │ 148 │ 148 │ 148 │ │ │ │ 1.388 MB │ 1.388 MB │ 1.388 MB │ 1.388 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ tiktoken │ │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 863.1 ms │ 
876.3 ms │ 866.4 ms │ 866.7 ms │ 100 │ 100 -│ │ │ 237.5 KB/s │ 233.9 KB/s │ 236.6 KB/s │ 236.5 KB/s │ │ +│ │ │ ╰─ commonmark_spec 832.7 ms │ 927.9 ms │ 868.5 ms │ 864.8 ms │ 100 │ 100 +│ │ │ 246.2 KB/s │ 220.9 KB/s │ 236 KB/s │ 237 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 8103002 │ 8103002 │ 8103002 │ 8103002 │ │ +│ │ │ 8103001 │ 8103001 │ 8103001 │ 8103001 │ │ │ │ │ 394.3 MB │ 394.3 MB │ 394.3 MB │ 394.3 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 8414510 │ 8414510 │ 8414510 │ 8414510 │ │ +│ │ │ 8414509 │ 8414509 │ 8414509 │ 8414509 │ │ │ │ │ 745 MB │ 745 MB │ 745 MB │ 745 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 1466110 │ 1466110 │ 1466110 │ 1466110 │ │ +│ │ │ 1466108 │ 1466108 │ 1466108 │ 1466108 │ │ │ │ │ 332.1 MB │ 332.1 MB │ 332.1 MB │ 332.1 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 512 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 287.8 ms │ 293.5 ms │ 289.3 ms │ 289.3 ms │ 100 │ 100 -│ │ │ 712.3 KB/s │ 698.5 KB/s │ 708.5 KB/s │ 708.5 KB/s │ │ +│ │ │ ╰─ commonmark_spec 290.2 ms │ 292.4 ms │ 291 ms │ 291 ms │ 100 │ 100 +│ │ │ 706.4 KB/s │ 700.9 KB/s │ 704.5 KB/s │ 704.4 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 2947758 │ 2947758 │ 2947758 │ 2947758 │ │ +│ │ │ 2947757 │ 2947757 │ 2947757 │ 2947757 │ │ │ │ │ 138.2 MB │ 138.2 MB │ 138.2 MB │ 138.2 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 3259266 │ 3259266 │ 3259266 │ 3259266 │ │ +│ │ │ 3259265 │ 3259265 │ 3259265 │ 3259265 │ │ │ │ │ 260.4 MB │ 260.4 MB │ 260.4 MB │ 260.4 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 549666 │ 549666 │ 549666 │ 549666 │ │ +│ │ │ 549664 │ 549664 │ 549664 │ 549664 │ │ │ │ │ 103.6 MB │ 103.6 MB │ 103.6 MB │ 103.6 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 4096 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 156.1 ms │ 159.7 ms │ 156.8 ms │ 156.8 ms │ 100 │ 100 -│ │ │ 1.312 MB/s │ 1.283 MB/s │ 1.307 MB/s │ 1.306 MB/s │ │ +│ │ │ ╰─ commonmark_spec 157.4 ms │ 165.7 ms │ 158.3 ms │ 158.7 ms │ 100 │ 100 +│ │ │ 1.302 
MB/s │ 1.236 MB/s │ 1.294 MB/s │ 1.291 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 1652075 │ 1652075 │ 1652075 │ 1652075 │ │ +│ │ │ 1652074 │ 1652074 │ 1652074 │ 1652074 │ │ │ │ │ 76.51 MB │ 76.51 MB │ 76.51 MB │ 76.51 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 1963583 │ 1963583 │ 1963583 │ 1963583 │ │ +│ │ │ 1963582 │ 1963582 │ 1963582 │ 1963582 │ │ │ │ │ 150.6 MB │ 150.6 MB │ 150.6 MB │ 150.6 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 308306 │ 308306 │ 308306 │ 308306 │ │ +│ │ │ 308304 │ 308304 │ 308304 │ 308304 │ │ │ │ │ 55.62 MB │ 55.62 MB │ 55.62 MB │ 55.62 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ╰─ 32768 │ │ │ │ │ -│ │ ╰─ commonmark_spec 72.37 ms │ 74.02 ms │ 72.62 ms │ 72.65 ms │ 100 │ 100 -│ │ 2.832 MB/s │ 2.769 MB/s │ 2.822 MB/s │ 2.821 MB/s │ │ +│ │ ╰─ commonmark_spec 72.99 ms │ 81.69 ms │ 73.33 ms │ 73.45 ms │ 100 │ 100 +│ │ 2.808 MB/s │ 2.509 MB/s │ 2.795 MB/s │ 2.791 MB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 750072 │ 750072 │ 750072 │ 750072 │ │ +│ │ 750071 │ 750071 │ 750071 │ 750071 │ │ │ │ 34.96 MB │ 34.96 MB │ 34.96 MB │ 34.96 MB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 1061580 │ 1061580 │ 1061580 │ 1061580 │ │ +│ │ 1061579 │ 1061579 │ 1061579 │ 1061579 │ │ │ │ 78.87 MB │ 78.87 MB │ 78.87 MB │ 78.87 MB │ │ │ │ grow: │ │ │ │ │ -│ │ 141697 │ 141697 │ 141697 │ 141697 │ │ +│ │ 141695 │ 141695 │ 141695 │ 141695 │ │ │ │ 25.39 MB │ 25.39 MB │ 25.39 MB │ 25.39 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ╰─ tokenizers │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ ╰─ commonmark_spec 1.577 s │ 1.804 s │ 1.59 s │ 1.599 s │ 100 │ 100 -│ │ 129.9 KB/s │ 113.6 KB/s │ 128.8 KB/s │ 128.2 KB/s │ │ +│ │ ╰─ commonmark_spec 1.57 s │ 1.712 s │ 1.586 s │ 1.59 s │ 100 │ 100 +│ │ 130.5 KB/s │ 119.7 KB/s │ 129.2 KB/s │ 128.9 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 39101785 │ 39101785 │ 39101785 │ 39101785 │ │ +│ │ 39101784 │ 39101784 │ 39101784 │ 39101784 │ │ │ │ 3.731 GB │ 3.731 GB │ 3.731 GB │ 3.731 GB │ │ │ │ dealloc: 
│ │ │ │ │ -│ │ 39159838 │ 39159838 │ 39159838 │ 39159838 │ │ +│ │ 39159837 │ 39159837 │ 39159837 │ 39159837 │ │ │ │ 5.853 GB │ 5.853 GB │ 5.853 GB │ 5.853 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 1329521 │ 1329521 │ 1329521 │ 1329521 │ │ +│ │ 1329519 │ 1329519 │ 1329519 │ 1329519 │ │ │ │ 2.117 GB │ 2.117 GB │ 2.117 GB │ 2.117 GB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ 512 │ │ │ │ │ -│ │ ╰─ commonmark_spec 619.1 ms │ 701.6 ms │ 623.1 ms │ 627.1 ms │ 100 │ 100 -│ │ 331.1 KB/s │ 292.2 KB/s │ 329 KB/s │ 326.9 KB/s │ │ +│ │ ╰─ commonmark_spec 621.9 ms │ 652.9 ms │ 630.6 ms │ 630.4 ms │ 100 │ 100 +│ │ 329.6 KB/s │ 314 KB/s │ 325.1 KB/s │ 325.1 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 16074029 │ 16074029 │ 16074029 │ 16074029 │ │ +│ │ 16074028 │ 16074028 │ 16074028 │ 16074028 │ │ │ │ 1.579 GB │ 1.579 GB │ 1.579 GB │ 1.579 GB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 16132082 │ 16132082 │ 16132082 │ 16132082 │ │ +│ │ 16132081 │ 16132081 │ 16132081 │ 16132081 │ │ │ │ 2.438 GB │ 2.438 GB │ 2.438 GB │ 2.438 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 392881 │ 392881 │ 392881 │ 392881 │ │ +│ │ 392879 │ 392879 │ 392879 │ 392879 │ │ │ │ 854.6 MB │ 854.6 MB │ 854.6 MB │ 854.6 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ 4096 │ │ │ │ │ -│ │ ╰─ commonmark_spec 325.7 ms │ 339.3 ms │ 327.3 ms │ 327.4 ms │ 100 │ 100 -│ │ 629.4 KB/s │ 604.2 KB/s │ 626.3 KB/s │ 626.1 KB/s │ │ +│ │ ╰─ commonmark_spec 322.5 ms │ 329.3 ms │ 325 ms │ 325 ms │ 100 │ 100 +│ │ 635.6 KB/s │ 622.5 KB/s │ 630.6 KB/s │ 630.6 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 8491159 │ 8491159 │ 8491159 │ 8491159 │ │ +│ │ 8491158 │ 8491158 │ 8491158 │ 8491158 │ │ │ │ 843.3 MB │ 843.3 MB │ 843.3 MB │ 843.3 MB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 8549212 │ 8549212 │ 8549212 │ 8549212 │ │ +│ │ 8549211 │ 8549211 │ 8549211 │ 8549211 │ │ │ │ 1.292 GB │ 1.292 GB │ 1.292 GB │ 1.292 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 168023 │ 168023 │ 168023 │ 168023 │ │ +│ │ 168021 │ 168021 │ 168021 │ 
168021 │ │ │ │ 444.1 MB │ 444.1 MB │ 444.1 MB │ 444.1 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ╰─ 32768 │ │ │ │ │ -│ ╰─ commonmark_spec 179.8 ms │ 186 ms │ 181.5 ms │ 182.1 ms │ 100 │ 100 -│ 1.14 MB/s │ 1.101 MB/s │ 1.129 MB/s │ 1.125 MB/s │ │ +│ ╰─ commonmark_spec 177.1 ms │ 184.9 ms │ 178.2 ms │ 179.3 ms │ 100 │ 100 +│ 1.157 MB/s │ 1.108 MB/s │ 1.149 MB/s │ 1.143 MB/s │ │ │ alloc: │ │ │ │ │ -│ 4579920 │ 4579920 │ 4579920 │ 4579920 │ │ +│ 4579919 │ 4579919 │ 4579919 │ 4579919 │ │ │ 460.2 MB │ 460.2 MB │ 460.2 MB │ 460.2 MB │ │ │ dealloc: │ │ │ │ │ -│ 4637973 │ 4637973 │ 4637973 │ 4637973 │ │ +│ 4637972 │ 4637972 │ 4637972 │ 4637972 │ │ │ 698.6 MB │ 698.6 MB │ 698.6 MB │ 698.6 MB │ │ │ grow: │ │ │ │ │ -│ 79600 │ 79600 │ 79600 │ 79600 │ │ +│ 79598 │ 79598 │ 79598 │ 79598 │ │ │ 233.4 MB │ 233.4 MB │ 233.4 MB │ 233.4 MB │ │ │ shrink: │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ @@ -191,284 +191,284 @@ chunk_size fastest │ slowest │ median ╰─ text │ │ │ │ │ ├─ characters │ │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 207.5 ms │ 209.9 ms │ 207.8 ms │ 207.8 ms │ 100 │ 100 - │ │ │ 788.3 KB/s │ 779.4 KB/s │ 787.4 KB/s │ 787 KB/s │ │ + │ │ ├─ romeo_and_juliet 208.2 ms │ 222 ms │ 208.9 ms │ 209.4 ms │ 100 │ 100 + │ │ │ 785.6 KB/s │ 736.9 KB/s │ 783 KB/s │ 781.3 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 11188 │ 11188 │ 11188 │ 11190 │ │ + │ │ │ 11187 │ 11187 │ 11187 │ 11189 │ │ │ │ │ 32.32 MB │ 32.32 MB │ 32.32 MB │ 32.32 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 11189 │ 11189 │ 11189 │ 11190 │ │ + │ │ │ 11188 │ 11188 │ 11188 │ 11189 │ │ │ │ │ 121.8 MB │ 121.8 MB │ 121.8 MB │ 121.8 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 33449 │ 33449 │ 33449 │ 33449 │ │ + │ │ │ 33447 │ 33447 │ 33447 │ 33447 │ │ │ │ │ 89.36 MB │ 89.36 MB │ 89.36 MB │ 89.36 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 0 │ 0 │ 0 │ 0.05 │ │ │ │ │ 0 B │ 0 B │ 0 B │ 23.4 B │ │ - │ │ ╰─ room_with_a_view 163.1 ms │ 165 ms │ 163.4 ms │ 163.5 ms │ 100 │ 100 - │ │ 1.85 MB/s │ 1.829 MB/s │ 
1.846 MB/s │ 1.846 MB/s │ │ + │ │ ╰─ room_with_a_view 162.1 ms │ 165 ms │ 162.4 ms │ 162.6 ms │ 100 │ 100 + │ │ 1.861 MB/s │ 1.828 MB/s │ 1.857 MB/s │ 1.856 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 18430 │ 18430 │ 18430 │ 18430 │ │ + │ │ 18429 │ 18429 │ 18429 │ 18429 │ │ │ │ 26.32 MB │ 26.32 MB │ 26.32 MB │ 26.32 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 18431 │ 18431 │ 18431 │ 18431 │ │ + │ │ 18430 │ 18430 │ 18430 │ 18430 │ │ │ │ 92.81 MB │ 92.81 MB │ 92.81 MB │ 92.81 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 48815 │ 48815 │ 48815 │ 48815 │ │ + │ │ 48813 │ 48813 │ 48813 │ 48813 │ │ │ │ 66.19 MB │ 66.19 MB │ 66.19 MB │ 66.19 MB │ │ │ ├─ 512 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 24.71 ms │ 25.77 ms │ 24.87 ms │ 24.89 ms │ 100 │ 100 - │ │ │ 6.619 MB/s │ 6.349 MB/s │ 6.578 MB/s │ 6.572 MB/s │ │ + │ │ ├─ romeo_and_juliet 24.83 ms │ 25.61 ms │ 24.92 ms │ 24.94 ms │ 100 │ 100 + │ │ │ 6.587 MB/s │ 6.387 MB/s │ 6.563 MB/s │ 6.56 MB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 1200 │ 1200 │ 1200 │ 1200 │ │ + │ │ │ 1199 │ 1199 │ 1199 │ 1199 │ │ │ │ │ 3.479 MB │ 3.479 MB │ 3.479 MB │ 3.479 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 1201 │ 1201 │ 1201 │ 1201 │ │ + │ │ │ 1200 │ 1200 │ 1200 │ 1200 │ │ │ │ │ 13.58 MB │ 13.58 MB │ 13.58 MB │ 13.58 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 3595 │ 3595 │ 3595 │ 3595 │ │ + │ │ │ 3593 │ 3593 │ 3593 │ 3593 │ │ │ │ │ 9.941 MB │ 9.941 MB │ 9.941 MB │ 9.941 MB │ │ - │ │ ╰─ room_with_a_view 25.47 ms │ 26.45 ms │ 25.59 ms │ 25.61 ms │ 100 │ 100 - │ │ 11.84 MB/s │ 11.41 MB/s │ 11.79 MB/s │ 11.78 MB/s │ │ + │ │ ╰─ room_with_a_view 25.42 ms │ 26.42 ms │ 25.49 ms │ 25.51 ms │ 100 │ 100 + │ │ 11.87 MB/s │ 11.42 MB/s │ 11.84 MB/s │ 11.83 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 2349 │ 2349 │ 2349 │ 2349 │ │ + │ │ 2348 │ 2348 │ 2348 │ 2348 │ │ │ │ 3.353 MB │ 3.353 MB │ 3.353 MB │ 3.353 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 2350 │ 2350 │ 2350 │ 2350 │ │ + │ │ 2349 │ 2349 │ 2349 │ 2349 │ │ │ │ 12.17 MB │ 12.17 MB │ 12.17 MB │ 12.17 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 6219 │ 6219 │ 6219 │ 6219 │ │ 
+ │ │ 6217 │ 6217 │ 6217 │ 6217 │ │ │ │ 8.522 MB │ 8.522 MB │ 8.522 MB │ 8.522 MB │ │ │ ├─ 4096 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 3.655 ms │ 4.162 ms │ 3.747 ms │ 3.763 ms │ 100 │ 100 - │ │ │ 44.76 MB/s │ 39.31 MB/s │ 43.66 MB/s │ 43.47 MB/s │ │ + │ │ ├─ romeo_and_juliet 3.707 ms │ 4.07 ms │ 3.804 ms │ 3.808 ms │ 100 │ 100 + │ │ │ 44.12 MB/s │ 40.2 MB/s │ 43 MB/s │ 42.96 MB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 141 │ 141 │ 141 │ 141 │ │ + │ │ │ 140 │ 140 │ 140 │ 140 │ │ │ │ │ 406.1 KB │ 406.1 KB │ 406.1 KB │ 406.1 KB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 142 │ 142 │ 142 │ 142 │ │ - │ │ │ 1.99 MB │ 1.99 MB │ 1.99 MB │ 1.99 MB │ │ + │ │ │ 141 │ 141 │ 141 │ 141 │ │ + │ │ │ 1.989 MB │ 1.989 MB │ 1.989 MB │ 1.989 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 426 │ 426 │ 426 │ 426 │ │ + │ │ │ 424 │ 424 │ 424 │ 424 │ │ │ │ │ 1.42 MB │ 1.42 MB │ 1.42 MB │ 1.42 MB │ │ - │ │ ╰─ room_with_a_view 4.938 ms │ 5.272 ms │ 5.015 ms │ 5.019 ms │ 100 │ 100 - │ │ 61.13 MB/s │ 57.26 MB/s │ 60.19 MB/s │ 60.14 MB/s │ │ + │ │ ╰─ room_with_a_view 4.971 ms │ 5.284 ms │ 5.041 ms │ 5.047 ms │ 100 │ 100 + │ │ 60.72 MB/s │ 57.12 MB/s │ 59.88 MB/s │ 59.81 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 304 │ 304 │ 304 │ 304 │ │ + │ │ 303 │ 303 │ 303 │ 303 │ │ │ │ 430 KB │ 430 KB │ 430 KB │ 430 KB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 305 │ 305 │ 305 │ 305 │ │ + │ │ 304 │ 304 │ 304 │ 304 │ │ │ │ 1.886 MB │ 1.886 MB │ 1.886 MB │ 1.886 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 812 │ 812 │ 812 │ 812 │ │ + │ │ 810 │ 810 │ 810 │ 810 │ │ │ │ 1.154 MB │ 1.154 MB │ 1.154 MB │ 1.154 MB │ │ │ ╰─ 32768 │ │ │ │ │ - │ ├─ romeo_and_juliet 1.515 ms │ 1.916 ms │ 1.592 ms │ 1.599 ms │ 100 │ 100 - │ │ 107.9 MB/s │ 85.38 MB/s │ 102.7 MB/s │ 102.2 MB/s │ │ + │ ├─ romeo_and_juliet 1.519 ms │ 1.767 ms │ 1.6 ms │ 1.606 ms │ 100 │ 100 + │ │ 107.7 MB/s │ 92.57 MB/s │ 102.2 MB/s │ 101.8 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 33 │ 33 │ 33 │ 33 │ │ - │ │ 107 KB │ 107 KB │ 107 KB │ 107 KB │ │ + │ │ 32 │ 32 │ 32 │ 32 │ │ + │ │ 106.9 KB │ 106.9 KB │ 106.9 KB │ 
106.9 KB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 34 │ 34 │ 34 │ 34 │ │ - │ │ 868.3 KB │ 868.3 KB │ 868.3 KB │ 868.3 KB │ │ + │ │ 33 │ 33 │ 33 │ 33 │ │ + │ │ 868 KB │ 868 KB │ 868 KB │ 868 KB │ │ │ │ grow: │ │ │ │ │ - │ │ 107 │ 107 │ 107 │ 107 │ │ - │ │ 597.6 KB │ 597.6 KB │ 597.6 KB │ 597.6 KB │ │ - │ ╰─ room_with_a_view 1.762 ms │ 1.928 ms │ 1.821 ms │ 1.825 ms │ 100 │ 100 - │ 171.3 MB/s │ 156.5 MB/s │ 165.7 MB/s │ 165.3 MB/s │ │ + │ │ 105 │ 105 │ 105 │ 105 │ │ + │ │ 597.5 KB │ 597.5 KB │ 597.5 KB │ 597.5 KB │ │ + │ ╰─ room_with_a_view 1.768 ms │ 1.947 ms │ 1.82 ms │ 1.827 ms │ 100 │ 100 + │ 170.7 MB/s │ 154.9 MB/s │ 165.8 MB/s │ 165.2 MB/s │ │ │ alloc: │ │ │ │ │ - │ 42 │ 42 │ 42 │ 42 │ │ - │ 56.02 KB │ 56.02 KB │ 56.02 KB │ 56.02 KB │ │ + │ 41 │ 41 │ 41 │ 41 │ │ + │ 55.95 KB │ 55.95 KB │ 55.95 KB │ 55.95 KB │ │ │ dealloc: │ │ │ │ │ - │ 43 │ 43 │ 43 │ 43 │ │ - │ 564.3 KB │ 564.3 KB │ 564.3 KB │ 564.3 KB │ │ + │ 42 │ 42 │ 42 │ 42 │ │ + │ 564.1 KB │ 564.1 KB │ 564.1 KB │ 564.1 KB │ │ │ grow: │ │ │ │ │ - │ 113 │ 113 │ 113 │ 113 │ │ - │ 206.4 KB │ 206.4 KB │ 206.4 KB │ 206.4 KB │ │ + │ 111 │ 111 │ 111 │ 111 │ │ + │ 206.2 KB │ 206.2 KB │ 206.2 KB │ 206.2 KB │ │ ├─ tiktoken │ │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 794.7 ms │ 823.4 ms │ 802.7 ms │ 803.1 ms │ 100 │ 100 - │ │ │ 205.8 KB/s │ 198.7 KB/s │ 203.8 KB/s │ 203.7 KB/s │ │ + │ │ ├─ romeo_and_juliet 804.4 ms │ 950.7 ms │ 810.7 ms │ 814.4 ms │ 100 │ 100 + │ │ │ 203.3 KB/s │ 172 KB/s │ 201.8 KB/s │ 200.9 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 8687901 │ 8687901 │ 8687901 │ 8687901 │ │ + │ │ │ 8687900 │ 8687900 │ 8687900 │ 8687900 │ │ │ │ │ 413.2 MB │ 413.2 MB │ 413.2 MB │ 413.2 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 8999409 │ 8999409 │ 8999409 │ 8999409 │ │ + │ │ │ 8999408 │ 8999408 │ 8999408 │ 8999408 │ │ │ │ │ 676.9 MB │ 676.9 MB │ 676.9 MB │ 676.9 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 1801556 │ 1801556 │ 1801556 │ 1801556 │ │ + │ │ │ 1801554 │ 1801554 │ 1801554 │ 1801554 │ │ │ │ │ 245.2 MB │ 245.2 MB │ 245.2 
MB │ 245.2 MB │ │ - │ │ ╰─ room_with_a_view 1.051 s │ 1.381 s │ 1.062 s │ 1.065 s │ 100 │ 100 - │ │ 287 KB/s │ 218.6 KB/s │ 284.1 KB/s │ 283.3 KB/s │ │ + │ │ ╰─ room_with_a_view 1.068 s │ 1.265 s │ 1.104 s │ 1.116 s │ 100 │ 100 + │ │ 282.6 KB/s │ 238.6 KB/s │ 273.4 KB/s │ 270.3 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 11500303 │ 11500303 │ 11500303 │ 11500303 │ │ + │ │ 11500302 │ 11500302 │ 11500302 │ 11500302 │ │ │ │ 551.9 MB │ 551.9 MB │ 551.9 MB │ 551.9 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 11811811 │ 11811811 │ 11811811 │ 11811811 │ │ + │ │ 11811810 │ 11811810 │ 11811810 │ 11811810 │ │ │ │ 941.2 MB │ 941.2 MB │ 941.2 MB │ 941.2 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 2834270 │ 2834270 │ 2834270 │ 2834270 │ │ + │ │ 2834268 │ 2834268 │ 2834268 │ 2834268 │ │ │ │ 370.6 MB │ 370.6 MB │ 370.6 MB │ 370.6 MB │ │ │ ├─ 512 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 258.8 ms │ 266.9 ms │ 260.3 ms │ 260.6 ms │ 100 │ 100 - │ │ │ 632 KB/s │ 612.9 KB/s │ 628.4 KB/s │ 627.8 KB/s │ │ + │ │ ├─ romeo_and_juliet 281.6 ms │ 303.8 ms │ 284.9 ms │ 288.1 ms │ 100 │ 100 + │ │ │ 580.9 KB/s │ 538.4 KB/s │ 574.1 KB/s │ 567.8 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 2921740 │ 2921740 │ 2921740 │ 2921740 │ │ + │ │ │ 2921739 │ 2921739 │ 2921739 │ 2921739 │ │ │ │ │ 137.3 MB │ 137.3 MB │ 137.3 MB │ 137.3 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 3233248 │ 3233248 │ 3233248 │ 3233248 │ │ + │ │ │ 3233247 │ 3233247 │ 3233247 │ 3233247 │ │ │ │ │ 234.4 MB │ 234.4 MB │ 234.4 MB │ 234.4 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 606069 │ 606069 │ 606069 │ 606069 │ │ + │ │ │ 606067 │ 606067 │ 606067 │ 606067 │ │ │ │ │ 78.61 MB │ 78.61 MB │ 78.61 MB │ 78.61 MB │ │ - │ │ ╰─ room_with_a_view 436.4 ms │ 445.7 ms │ 439.1 ms │ 439.3 ms │ 100 │ 100 - │ │ 691.7 KB/s │ 677.3 KB/s │ 687.4 KB/s │ 687.2 KB/s │ │ + │ │ ╰─ room_with_a_view 479.2 ms │ 494.8 ms │ 484.3 ms │ 484.4 ms │ 100 │ 100 + │ │ 629.9 KB/s │ 610.1 KB/s │ 623.3 KB/s │ 623.2 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 4881128 │ 4881128 │ 4881128 │ 4881128 │ │ + │ │ 4881127 │ 4881127 │ 
4881127 │ 4881127 │ │ │ │ 232.3 MB │ 232.3 MB │ 232.3 MB │ 232.3 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 5192636 │ 5192636 │ 5192636 │ 5192636 │ │ + │ │ 5192635 │ 5192635 │ 5192635 │ 5192635 │ │ │ │ 403.9 MB │ 403.9 MB │ 403.9 MB │ 403.9 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 1197369 │ 1197369 │ 1197369 │ 1197369 │ │ + │ │ 1197367 │ 1197367 │ 1197367 │ 1197367 │ │ │ │ 152.9 MB │ 152.9 MB │ 152.9 MB │ 152.9 MB │ │ │ ├─ 4096 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 177 ms │ 180.1 ms │ 178 ms │ 178 ms │ 100 │ 100 - │ │ │ 923.9 KB/s │ 908.1 KB/s │ 918.8 KB/s │ 918.9 KB/s │ │ + │ │ ├─ romeo_and_juliet 193.2 ms │ 205.3 ms │ 196.1 ms │ 196.3 ms │ 100 │ 100 + │ │ │ 846.4 KB/s │ 796.8 KB/s │ 834.1 KB/s │ 833.4 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 2018346 │ 2018346 │ 2018346 │ 2018346 │ │ + │ │ │ 2018345 │ 2018345 │ 2018345 │ 2018345 │ │ │ │ │ 94.51 MB │ 94.51 MB │ 94.51 MB │ 94.51 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 2329854 │ 2329854 │ 2329854 │ 2329854 │ │ + │ │ │ 2329853 │ 2329853 │ 2329853 │ 2329853 │ │ │ │ │ 166.3 MB │ 166.3 MB │ 166.3 MB │ 166.3 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 418451 │ 418451 │ 418451 │ 418451 │ │ + │ │ │ 418449 │ 418449 │ 418449 │ 418449 │ │ │ │ │ 53.36 MB │ 53.36 MB │ 53.36 MB │ 53.36 MB │ │ - │ │ ╰─ room_with_a_view 317.7 ms │ 335.2 ms │ 320 ms │ 320.6 ms │ 100 │ 100 - │ │ 950.1 KB/s │ 900.6 KB/s │ 943.2 KB/s │ 941.4 KB/s │ │ + │ │ ╰─ room_with_a_view 352 ms │ 361.4 ms │ 355 ms │ 355.4 ms │ 100 │ 100 + │ │ 857.5 KB/s │ 835.2 KB/s │ 850.2 KB/s │ 849.2 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 3573121 │ 3573121 │ 3573121 │ 3573121 │ │ + │ │ 3573120 │ 3573120 │ 3573120 │ 3573120 │ │ │ │ 169.7 MB │ 169.7 MB │ 169.7 MB │ 169.7 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 3884629 │ 3884629 │ 3884629 │ 3884629 │ │ + │ │ 3884628 │ 3884628 │ 3884628 │ 3884628 │ │ │ │ 299.2 MB │ 299.2 MB │ 299.2 MB │ 299.2 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 874507 │ 874507 │ 874507 │ 874507 │ │ + │ │ 874505 │ 874505 │ 874505 │ 874505 │ │ │ │ 110.8 MB │ 110.8 MB │ 110.8 MB │ 110.8 MB │ │ │ ╰─ 
32768 │ │ │ │ │ - │ ├─ romeo_and_juliet 81.79 ms │ 84.17 ms │ 82.27 ms │ 82.31 ms │ 100 │ 100 - │ │ 2 MB/s │ 1.943 MB/s │ 1.988 MB/s │ 1.987 MB/s │ │ + │ ├─ romeo_and_juliet 90.55 ms │ 94.8 ms │ 92.07 ms │ 92.26 ms │ 100 │ 100 + │ │ 1.806 MB/s │ 1.725 MB/s │ 1.777 MB/s │ 1.773 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 914681 │ 914681 │ 914681 │ 914681 │ │ - │ │ 42.86 MB │ 42.86 MB │ 42.86 MB │ 42.86 MB │ │ + │ │ 914680 │ 914680 │ 914680 │ 914680 │ │ + │ │ 42.85 MB │ 42.85 MB │ 42.85 MB │ 42.85 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 1226189 │ 1226189 │ 1226189 │ 1226189 │ │ + │ │ 1226188 │ 1226188 │ 1226188 │ 1226188 │ │ │ │ 85.71 MB │ 85.71 MB │ 85.71 MB │ 85.71 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 187707 │ 187707 │ 187707 │ 187707 │ │ + │ │ 187705 │ 187705 │ 187705 │ 187705 │ │ │ │ 24.37 MB │ 24.37 MB │ 24.37 MB │ 24.37 MB │ │ - │ ╰─ room_with_a_view 111.9 ms │ 119.1 ms │ 112.8 ms │ 112.9 ms │ 100 │ 100 - │ 2.696 MB/s │ 2.533 MB/s │ 2.675 MB/s │ 2.671 MB/s │ │ + │ ╰─ room_with_a_view 124.3 ms │ 130.2 ms │ 126.9 ms │ 126.9 ms │ 100 │ 100 + │ 2.427 MB/s │ 2.318 MB/s │ 2.378 MB/s │ 2.378 MB/s │ │ │ alloc: │ │ │ │ │ - │ 1232443 │ 1232443 │ 1232443 │ 1232443 │ │ + │ 1232442 │ 1232442 │ 1232442 │ 1232442 │ │ │ 58.6 MB │ 58.6 MB │ 58.6 MB │ 58.6 MB │ │ │ dealloc: │ │ │ │ │ - │ 1543951 │ 1543951 │ 1543951 │ 1543951 │ │ + │ 1543950 │ 1543950 │ 1543950 │ 1543950 │ │ │ 115.4 MB │ 115.4 MB │ 115.4 MB │ 115.4 MB │ │ │ grow: │ │ │ │ │ - │ 300739 │ 300739 │ 300739 │ 300739 │ │ + │ 300737 │ 300737 │ 300737 │ 300737 │ │ │ 38.19 MB │ 38.19 MB │ 38.19 MB │ 38.19 MB │ │ ╰─ tokenizers │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ ├─ romeo_and_juliet 1.39 s │ 1.502 s │ 1.406 s │ 1.41 s │ 100 │ 100 - │ │ 117.6 KB/s │ 108.8 KB/s │ 116.3 KB/s │ 116 KB/s │ │ + │ ├─ romeo_and_juliet 1.479 s │ 1.58 s │ 1.51 s │ 1.514 s │ 100 │ 100 + │ │ 110.5 KB/s │ 103.5 KB/s │ 108.3 KB/s │ 108 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 29188728 │ 29188728 │ 29188728 │ 29188728 │ │ + │ │ 29188727 │ 29188727 │ 29188727 │ 29188727 │ │ │ │ 3.601 
GB │ 3.601 GB │ 3.601 GB │ 3.601 GB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 29246781 │ 29246781 │ 29246781 │ 29246781 │ │ + │ │ 29246780 │ 29246780 │ 29246780 │ 29246780 │ │ │ │ 5.214 GB │ 5.214 GB │ 5.214 GB │ 5.214 GB │ │ │ │ grow: │ │ │ │ │ - │ │ 463032 │ 463032 │ 463032 │ 463032 │ │ + │ │ 463030 │ 463030 │ 463030 │ 463030 │ │ │ │ 1.608 GB │ 1.608 GB │ 1.608 GB │ 1.608 GB │ │ - │ ╰─ room_with_a_view 1.933 s │ 2.107 s │ 1.943 s │ 1.946 s │ 100 │ 100 - │ 156.1 KB/s │ 143.2 KB/s │ 155.3 KB/s │ 155 KB/s │ │ + │ ╰─ room_with_a_view 2.063 s │ 2.089 s │ 2.074 s │ 2.075 s │ 100 │ 100 + │ 146.3 KB/s │ 144.4 KB/s │ 145.5 KB/s │ 145.4 KB/s │ │ │ alloc: │ │ │ │ │ - │ 39390416 │ 39390416 │ 39390416 │ 39390416 │ │ + │ 39390415 │ 39390415 │ 39390415 │ 39390415 │ │ │ 5.158 GB │ 5.158 GB │ 5.158 GB │ 5.158 GB │ │ │ dealloc: │ │ │ │ │ - │ 39448469 │ 39448469 │ 39448469 │ 39448469 │ │ + │ 39448468 │ 39448468 │ 39448468 │ 39448468 │ │ │ 7.421 GB │ 7.421 GB │ 7.421 GB │ 7.421 GB │ │ │ grow: │ │ │ │ │ - │ 681205 │ 681205 │ 681205 │ 681205 │ │ + │ 681203 │ 681203 │ 681203 │ 681203 │ │ │ 2.257 GB │ 2.257 GB │ 2.257 GB │ 2.257 GB │ │ ├─ 512 │ │ │ │ │ - │ ├─ romeo_and_juliet 432.2 ms │ 443 ms │ 437.7 ms │ 437.4 ms │ 100 │ 100 - │ │ 378.5 KB/s │ 369.2 KB/s │ 373.8 KB/s │ 374 KB/s │ │ + │ ├─ romeo_and_juliet 465.3 ms │ 476.5 ms │ 469.4 ms │ 469.4 ms │ 100 │ 100 + │ │ 351.6 KB/s │ 343.3 KB/s │ 348.5 KB/s │ 348.5 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 9331641 │ 9331641 │ 9331641 │ 9331641 │ │ + │ │ 9331640 │ 9331640 │ 9331640 │ 9331640 │ │ │ │ 1.159 GB │ 1.159 GB │ 1.159 GB │ 1.159 GB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 9389694 │ 9389694 │ 9389694 │ 9389694 │ │ + │ │ 9389693 │ 9389693 │ 9389693 │ 9389693 │ │ │ │ 1.678 GB │ 1.678 GB │ 1.678 GB │ 1.678 GB │ │ │ │ grow: │ │ │ │ │ - │ │ 100059 │ 100059 │ 100059 │ 100059 │ │ + │ │ 100057 │ 100057 │ 100057 │ 100057 │ │ │ │ 513.7 MB │ 513.7 MB │ 513.7 MB │ 513.7 MB │ │ - │ ╰─ room_with_a_view 793.5 ms │ 831.4 ms │ 798.5 ms │ 800.2 ms │ 100 │ 100 - │ 380.4 KB/s 
│ 363.1 KB/s │ 378 KB/s │ 377.2 KB/s │ │ + │ ╰─ room_with_a_view 853.3 ms │ 907.1 ms │ 861.3 ms │ 865.8 ms │ 100 │ 100 + │ 353.7 KB/s │ 332.8 KB/s │ 350.5 KB/s │ 348.6 KB/s │ │ │ alloc: │ │ │ │ │ - │ 16335240 │ 16335240 │ 16335240 │ 16335240 │ │ + │ 16335239 │ 16335239 │ 16335239 │ 16335239 │ │ │ 2.154 GB │ 2.154 GB │ 2.154 GB │ 2.154 GB │ │ │ dealloc: │ │ │ │ │ - │ 16393293 │ 16393293 │ 16393293 │ 16393293 │ │ + │ 16393292 │ 16393292 │ 16393292 │ 16393292 │ │ │ 3.105 GB │ 3.105 GB │ 3.105 GB │ 3.105 GB │ │ │ grow: │ │ │ │ │ - │ 165455 │ 165455 │ 165455 │ 165455 │ │ + │ 165453 │ 165453 │ 165453 │ 165453 │ │ │ 945.8 MB │ 945.8 MB │ 945.8 MB │ 945.8 MB │ │ ├─ 4096 │ │ │ │ │ - │ ├─ romeo_and_juliet 297.7 ms │ 300.7 ms │ 298.6 ms │ 298.7 ms │ 100 │ 100 - │ │ 549.5 KB/s │ 544 KB/s │ 547.9 KB/s │ 547.6 KB/s │ │ + │ ├─ romeo_and_juliet 318.5 ms │ 349.5 ms │ 321.8 ms │ 322.6 ms │ 100 │ 100 + │ │ 513.6 KB/s │ 468 KB/s │ 508.3 KB/s │ 507.1 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 6432740 │ 6432740 │ 6432740 │ 6432740 │ │ + │ │ 6432739 │ 6432739 │ 6432739 │ 6432739 │ │ │ │ 802.5 MB │ 802.5 MB │ 802.5 MB │ 802.5 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 6490793 │ 6490793 │ 6490793 │ 6490793 │ │ + │ │ 6490792 │ 6490792 │ 6490792 │ 6490792 │ │ │ │ 1.164 GB │ 1.164 GB │ 1.164 GB │ 1.164 GB │ │ │ │ grow: │ │ │ │ │ - │ │ 34250 │ 34250 │ 34250 │ 34250 │ │ + │ │ 34248 │ 34248 │ 34248 │ 34248 │ │ │ │ 356.8 MB │ 356.8 MB │ 356.8 MB │ 356.8 MB │ │ - │ ╰─ room_with_a_view 560.3 ms │ 575.2 ms │ 565.8 ms │ 565.8 ms │ 100 │ 100 - │ 538.8 KB/s │ 524.8 KB/s │ 533.5 KB/s │ 533.5 KB/s │ │ + │ ╰─ room_with_a_view 607.1 ms │ 618.4 ms │ 612.7 ms │ 612.6 ms │ 100 │ 100 + │ 497.2 KB/s │ 488.2 KB/s │ 492.7 KB/s │ 492.8 KB/s │ │ │ alloc: │ │ │ │ │ - │ 11604279 │ 11604279 │ 11604279 │ 11604279 │ │ + │ 11604278 │ 11604278 │ 11604278 │ 11604278 │ │ │ 1.536 GB │ 1.536 GB │ 1.536 GB │ 1.536 GB │ │ │ dealloc: │ │ │ │ │ - │ 11662332 │ 11662332 │ 11662332 │ 11662332 │ │ + │ 11662331 │ 11662331 │ 11662331 │ 11662331 │ │ 
│ 2.215 GB │ 2.215 GB │ 2.215 GB │ 2.215 GB │ │ │ grow: │ │ │ │ │ - │ 55754 │ 55754 │ 55754 │ 55754 │ │ + │ 55752 │ 55752 │ 55752 │ 55752 │ │ │ 674.3 MB │ 674.3 MB │ 674.3 MB │ 674.3 MB │ │ ╰─ 32768 │ │ │ │ │ - ├─ romeo_and_juliet 131 ms │ 134.2 ms │ 131.6 ms │ 131.7 ms │ 100 │ 100 - │ 1.248 MB/s │ 1.218 MB/s │ 1.242 MB/s │ 1.241 MB/s │ │ + ├─ romeo_and_juliet 143.6 ms │ 160.8 ms │ 146 ms │ 146.3 ms │ 100 │ 100 + │ 1.138 MB/s │ 1.017 MB/s │ 1.12 MB/s │ 1.118 MB/s │ │ │ alloc: │ │ │ │ │ - │ 2845252 │ 2845252 │ 2845252 │ 2845252 │ │ + │ 2845251 │ 2845251 │ 2845251 │ 2845251 │ │ │ 353.7 MB │ 353.7 MB │ 353.7 MB │ 353.7 MB │ │ │ dealloc: │ │ │ │ │ - │ 2903305 │ 2903305 │ 2903305 │ 2903305 │ │ + │ 2903304 │ 2903304 │ 2903304 │ 2903304 │ │ │ 517.5 MB │ 517.5 MB │ 517.5 MB │ 517.5 MB │ │ │ grow: │ │ │ │ │ - │ 9650 │ 9650 │ 9650 │ 9650 │ │ + │ 9648 │ 9648 │ 9648 │ 9648 │ │ │ 158.8 MB │ 158.8 MB │ 158.8 MB │ 158.8 MB │ │ - ╰─ room_with_a_view 218.2 ms │ 230.2 ms │ 219.1 ms │ 219.4 ms │ 100 │ 100 - 1.383 MB/s │ 1.311 MB/s │ 1.377 MB/s │ 1.375 MB/s │ │ + ╰─ room_with_a_view 238.4 ms │ 264.2 ms │ 242.4 ms │ 243 ms │ 100 │ 100 + 1.266 MB/s │ 1.142 MB/s │ 1.245 MB/s │ 1.242 MB/s │ │ alloc: │ │ │ │ │ - 4490084 │ 4490084 │ 4490084 │ 4490084 │ │ + 4490083 │ 4490083 │ 4490083 │ 4490083 │ │ 594.3 MB │ 594.3 MB │ 594.3 MB │ 594.3 MB │ │ dealloc: │ │ │ │ │ - 4548137 │ 4548137 │ 4548137 │ 4548137 │ │ + 4548136 │ 4548136 │ 4548136 │ 4548136 │ │ 860.9 MB │ 860.9 MB │ 860.9 MB │ 860.9 MB │ │ grow: │ │ │ │ │ - 14226 │ 14226 │ 14226 │ 14226 │ │ + 14224 │ 14224 │ 14224 │ 14224 │ │ 261.5 MB │ 261.5 MB │ 261.5 MB │ 261.5 MB │ │ diff --git a/src/lib.rs b/src/lib.rs index f94b988..98147e6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -88,40 +88,49 @@ pub trait ChunkSizer { /// Very helpful when the same chunk is being validated multiple times, which /// happens often, and can be expensive to compute, such as with tokenizers. 
#[derive(Debug)] -struct MemoizedChunkSizer<'sizer, S> +struct MemoizedChunkSizer<'sizer, C, S> where + C: ChunkCapacity, S: ChunkSizer, { - /// The sizer we are wrapping - sizer: &'sizer S, /// Cache of chunk sizes per byte offset range cache: AHashMap, ChunkSize>, + /// How big can each chunk be + chunk_capacity: C, + /// The sizer we are wrapping + sizer: &'sizer S, } -impl<'sizer, S> MemoizedChunkSizer<'sizer, S> +impl<'sizer, C, S> MemoizedChunkSizer<'sizer, C, S> where + C: ChunkCapacity, S: ChunkSizer, { /// Wrap any chunk sizer for memoization - fn new(sizer: &'sizer S) -> Self { + fn new(chunk_capacity: C, sizer: &'sizer S) -> Self { Self { + cache: AHashMap::new(), + chunk_capacity, sizer, - cache: AHashMap::default(), } } /// Determine the size of a given chunk to use for validation, /// returning a cached value if it exists, and storing the result if not. - fn chunk_size( - &mut self, - offset: usize, - chunk: &str, - capacity: &impl ChunkCapacity, - ) -> ChunkSize { + fn chunk_size(&mut self, offset: usize, chunk: &str) -> ChunkSize { *self .cache .entry(offset..(offset + chunk.len())) - .or_insert_with(|| self.sizer.chunk_size(chunk, capacity)) + .or_insert_with(|| self.sizer.chunk_size(chunk, &self.chunk_capacity)) + } + + /// Check if the chunk is within the capacity. Chunk should be trimmed if necessary beforehand. + fn check_capacity(&mut self, (offset, chunk): (usize, &str)) -> ChunkSize { + let mut chunk_size = self.chunk_size(offset, chunk); + if let Some(max_chunk_size_offset) = chunk_size.max_chunk_size_offset.as_mut() { + *max_chunk_size_offset += offset; + } + chunk_size } /// Clear the cached values. 
Once we've moved the cursor, @@ -321,14 +330,10 @@ where S: ChunkSizer, Sp: SemanticSplit, { - /// Size of the chunks to generate - chunk_capacity: C, /// How to validate chunk sizes - chunk_sizer: MemoizedChunkSizer<'sizer, S>, + chunk_sizer: MemoizedChunkSizer<'sizer, C, S>, /// Current byte offset in the `text` cursor: usize, - /// Reusable container for levels in remaining text to avoid extra allocations - levels_in_remaining_text: Vec, /// Reusable container for next sections to avoid extra allocations next_sections: Vec<(usize, &'text str)>, /// Splitter used for determining semantic levels. @@ -350,9 +355,7 @@ where fn new(chunk_capacity: C, chunk_sizer: &'sizer S, text: &'text str, trim_chunks: bool) -> Self { Self { cursor: 0, - chunk_capacity, - chunk_sizer: MemoizedChunkSizer::new(chunk_sizer), - levels_in_remaining_text: Vec::new(), + chunk_sizer: MemoizedChunkSizer::new(chunk_capacity, chunk_sizer), next_sections: Vec::new(), semantic_split: Sp::new(text), text, @@ -369,18 +372,6 @@ where } } - /// Is the given text within the chunk size? - fn check_capacity(&mut self, offset: usize, chunk: &str) -> ChunkSize { - let (offset, chunk) = self.trim_chunk(offset, chunk); - let mut chunk_size = self - .chunk_sizer - .chunk_size(offset, chunk, &self.chunk_capacity); - if let Some(max_chunk_size_offset) = chunk_size.max_chunk_size_offset.as_mut() { - *max_chunk_size_offset += offset; - } - chunk_size - } - /// Generate the next chunk, applying trimming settings. /// Returns final byte offset and str. /// Will return `None` if given an invalid range. 
@@ -402,7 +393,9 @@ where let (offset, str) = self.next_sections[mid]; let text_end = offset + str.len(); let chunk = self.text.get(start..text_end)?; - let chunk_size = self.check_capacity(start, chunk); + let chunk_size = self + .chunk_sizer + .check_capacity(self.trim_chunk(start, chunk)); match chunk_size.fits { Ordering::Less => { @@ -447,11 +440,17 @@ where if let (Some(successful_index), Some(chunk_size)) = (successful_index, successful_chunk_size) { - for index in successful_index..self.next_sections.len() { + let mut range = successful_index..self.next_sections.len(); + // We've already checked the successful index + range.next(); + + for index in range { let (offset, str) = self.next_sections[index]; let text_end = offset + str.len(); let chunk = self.text.get(start..text_end)?; - let size = self.check_capacity(start, chunk); + let size = self + .chunk_sizer + .check_capacity(self.trim_chunk(start, chunk)); if size.size <= chunk_size.size { if text_end > end { end = text_end; @@ -476,28 +475,26 @@ where fn update_next_sections(&mut self) { // First thing, clear out the list, but reuse the allocated memory self.next_sections.clear(); - self.levels_in_remaining_text.clear(); - // Next levels to try. Will stop at max level. We check only levels in the next max level - // chunk so we don't bypass it if not all levels are present in every chunk. - self.levels_in_remaining_text - .extend(self.semantic_split.levels_in_remaining_text(self.cursor)); // Get starting level - let mut semantic_level = self.levels_in_remaining_text[0]; + let mut levels_in_remaining_text = + self.semantic_split.levels_in_remaining_text(self.cursor); + let mut semantic_level = levels_in_remaining_text + .next() + .expect("Need at least one level to progress"); // If we aren't at the highest semantic level, stop iterating sections that go beyond the range of the next level. 
let mut max_encoded_offset = None; let remaining_text = self.text.get(self.cursor..).unwrap(); - for i in 1..self.levels_in_remaining_text.len() { - let level = self.levels_in_remaining_text[i]; - let Some((_, str)) = self - .semantic_split + for (level, str) in levels_in_remaining_text.filter_map(|level| { + self.semantic_split .semantic_chunks(self.cursor, remaining_text, level) .next() - else { - return; - }; - let chunk_size = self.check_capacity(self.cursor, str); + .map(|(_, str)| (level, str)) + }) { + let chunk_size = self + .chunk_sizer + .check_capacity(self.trim_chunk(self.cursor, str)); // If this no longer fits, we use the level we are at. if chunk_size.fits.is_gt() { max_encoded_offset = chunk_size.max_chunk_size_offset; @@ -689,10 +686,10 @@ mod tests { #[test] fn memoized_sizer_only_calculates_once_per_text() { let sizer = CountingSizer::default(); - let mut memoized_sizer = MemoizedChunkSizer::new(&sizer); + let mut memoized_sizer = MemoizedChunkSizer::new(10, &sizer); let text = "1234567890"; for _ in 0..10 { - memoized_sizer.chunk_size(0, text, &10); + memoized_sizer.chunk_size(0, text); } assert_eq!(memoized_sizer.sizer.calls.load(atomic::Ordering::SeqCst), 1); @@ -701,10 +698,10 @@ mod tests { #[test] fn memoized_sizer_calculates_once_per_different_text() { let sizer = CountingSizer::default(); - let mut memoized_sizer = MemoizedChunkSizer::new(&sizer); + let mut memoized_sizer = MemoizedChunkSizer::new(10, &sizer); let text = "1234567890"; for i in 0..10 { - memoized_sizer.chunk_size(0, text.get(0..i).unwrap(), &10); + memoized_sizer.chunk_size(0, text.get(0..i).unwrap()); } assert_eq!( @@ -716,10 +713,10 @@ mod tests { #[test] fn can_clear_cache_on_memoized_sizer() { let sizer = CountingSizer::default(); - let mut memoized_sizer = MemoizedChunkSizer::new(&sizer); + let mut memoized_sizer = MemoizedChunkSizer::new(10, &sizer); let text = "1234567890"; for _ in 0..10 { - memoized_sizer.chunk_size(0, text, &10); + memoized_sizer.chunk_size(0, 
text); memoized_sizer.clear_cache(); } From 25e08ed13d5907d7916465540b4daa109ef861fe Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Mon, 25 Mar 2024 14:19:35 +0100 Subject: [PATCH 5/6] Fix bug with ranges after offset Only affects markdown ranges, but it wasn't correctly filtering out the correct ranges because it was finding the first item of a level in all ranges, not after the offset --- benches/output.txt | 252 ++-- src/lib.rs | 21 +- src/markdown.rs | 56 +- src/text.rs | 11 +- ...ingface_markdown@commonmark_spec.md-2.snap | 38 +- ...ggingface_markdown@commonmark_spec.md.snap | 1044 ++++++------- ...ggingface_markdown@github_flavored.md.snap | 47 +- ...ce_markdown_trim@commonmark_spec.md-2.snap | 38 +- ...face_markdown_trim@commonmark_spec.md.snap | 739 +++++----- ...face_markdown_trim@github_flavored.md.snap | 39 +- ...pshots__markdown@commonmark_spec.md-2.snap | 463 +++--- ...napshots__markdown@commonmark_spec.md.snap | 599 ++++---- ...napshots__markdown@github_flavored.md.snap | 21 +- ...s__markdown_trim@commonmark_spec.md-2.snap | 431 +++--- ...ots__markdown_trim@commonmark_spec.md.snap | 241 ++- ...ots__markdown_trim@github_flavored.md.snap | 9 +- ...iktoken_markdown@commonmark_spec.md-2.snap | 59 +- ..._tiktoken_markdown@commonmark_spec.md.snap | 1298 +++++++++-------- ..._tiktoken_markdown@github_flavored.md.snap | 46 +- ...en_markdown_trim@commonmark_spec.md-2.snap | 59 +- ...oken_markdown_trim@commonmark_spec.md.snap | 874 +++++------ ...oken_markdown_trim@github_flavored.md.snap | 42 +- 22 files changed, 3343 insertions(+), 3084 deletions(-) diff --git a/benches/output.txt b/benches/output.txt index 87173d0..42e76d3 100644 --- a/benches/output.txt +++ b/benches/output.txt @@ -7,38 +7,38 @@ chunk_size fastest │ slowest │ median ├─ markdown │ │ │ │ │ │ ├─ characters │ │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 329.8 ms │ 360.1 ms │ 332.6 ms │ 333.2 ms │ 100 │ 100 -│ │ │ 621.5 KB/s │ 569.2 KB/s │ 616.3 KB/s │ 615.2 KB/s │ │ +│ │ │ ╰─ 
commonmark_spec 324 ms │ 336.2 ms │ 331.4 ms │ 331.4 ms │ 100 │ 100 +│ │ │ 632.6 KB/s │ 609.7 KB/s │ 618.4 KB/s │ 618.6 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 13269 │ 0 │ 13269 │ 13136 │ │ -│ │ │ 75.99 MB │ 0 B │ 75.99 MB │ 75.23 MB │ │ +│ │ │ 0 │ 13479 │ 13479 │ 13344 │ │ +│ │ │ 0 B │ 77.19 MB │ 77.19 MB │ 76.42 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 13270 │ 0 │ 13270 │ 13137 │ │ -│ │ │ 281.4 MB │ 0 B │ 281.4 MB │ 278.6 MB │ │ +│ │ │ 0 │ 13480 │ 13480 │ 13345 │ │ +│ │ │ 0 B │ 285.8 MB │ 285.8 MB │ 283 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 44733 │ 0 │ 44733 │ 44285 │ │ -│ │ │ 205.2 MB │ 0 B │ 205.2 MB │ 203.2 MB │ │ +│ │ │ 0 │ 45433 │ 45433 │ 44978 │ │ +│ │ │ 0 B │ 208.4 MB │ 208.4 MB │ 206.3 MB │ │ │ │ │ shrink: │ │ │ │ │ -│ │ │ 13 │ 0 │ 13 │ 12.87 │ │ -│ │ │ 94 B │ 0 B │ 94 B │ 93.06 B │ │ +│ │ │ 0 │ 13 │ 13 │ 12.87 │ │ +│ │ │ 0 B │ 94 B │ 94 B │ 93.06 B │ │ │ │ ├─ 512 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 45.92 ms │ 46.7 ms │ 46.12 ms │ 46.15 ms │ 100 │ 100 -│ │ │ 4.464 MB/s │ 4.389 MB/s │ 4.444 MB/s │ 4.441 MB/s │ │ +│ │ │ ╰─ commonmark_spec 47.13 ms │ 47.96 ms │ 47.33 ms │ 47.35 ms │ 100 │ 100 +│ │ │ 4.349 MB/s │ 4.274 MB/s │ 4.331 MB/s │ 4.329 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 1575 │ 1575 │ 1575 │ 1575 │ │ -│ │ │ 9.241 MB │ 9.241 MB │ 9.241 MB │ 9.241 MB │ │ +│ │ │ 1617 │ 1617 │ 1617 │ 1617 │ │ +│ │ │ 9.47 MB │ 9.47 MB │ 9.47 MB │ 9.47 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 1576 │ 1576 │ 1576 │ 1576 │ │ -│ │ │ 34.52 MB │ 34.52 MB │ 34.52 MB │ 34.52 MB │ │ +│ │ │ 1618 │ 1618 │ 1618 │ 1618 │ │ +│ │ │ 35.45 MB │ 35.45 MB │ 35.45 MB │ 35.45 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 5253 │ 5253 │ 5253 │ 5253 │ │ -│ │ │ 25.08 MB │ 25.08 MB │ 25.08 MB │ 25.08 MB │ │ +│ │ │ 5391 │ 5391 │ 5391 │ 5391 │ │ +│ │ │ 25.77 MB │ 25.77 MB │ 25.77 MB │ 25.77 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 4096 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 8.379 ms │ 8.773 ms │ 8.426 ms │ 8.433 ms │ 100 │ 100 -│ │ │ 24.46 MB/s │ 23.36 MB/s 
│ 24.33 MB/s │ 24.31 MB/s │ │ +│ │ │ ╰─ commonmark_spec 8.49 ms │ 8.972 ms │ 8.594 ms │ 8.595 ms │ 100 │ 100 +│ │ │ 24.14 MB/s │ 22.85 MB/s │ 23.85 MB/s │ 23.85 MB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 260 │ 260 │ 260 │ 260 │ │ │ │ │ 1.663 MB │ 1.663 MB │ 1.663 MB │ 1.663 MB │ │ @@ -52,8 +52,8 @@ chunk_size fastest │ slowest │ median │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ╰─ 32768 │ │ │ │ │ -│ │ ╰─ commonmark_spec 1.946 ms │ 2.138 ms │ 1.956 ms │ 1.965 ms │ 100 │ 100 -│ │ 105.3 MB/s │ 95.87 MB/s │ 104.7 MB/s │ 104.3 MB/s │ │ +│ │ ╰─ commonmark_spec 2.008 ms │ 2.2 ms │ 2.082 ms │ 2.082 ms │ 100 │ 100 +│ │ 102 MB/s │ 93.16 MB/s │ 98.47 MB/s │ 98.47 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 65 │ 65 │ 65 │ 65 │ │ │ │ 528.4 KB │ 528.4 KB │ 528.4 KB │ 528.4 KB │ │ @@ -68,119 +68,119 @@ chunk_size fastest │ slowest │ median │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ tiktoken │ │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 832.7 ms │ 927.9 ms │ 868.5 ms │ 864.8 ms │ 100 │ 100 -│ │ │ 246.2 KB/s │ 220.9 KB/s │ 236 KB/s │ 237 KB/s │ │ +│ │ │ ╰─ commonmark_spec 866 ms │ 956.2 ms │ 872.7 ms │ 874.2 ms │ 100 │ 100 +│ │ │ 236.7 KB/s │ 214.3 KB/s │ 234.9 KB/s │ 234.5 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 8103001 │ 8103001 │ 8103001 │ 8103001 │ │ -│ │ │ 394.3 MB │ 394.3 MB │ 394.3 MB │ 394.3 MB │ │ +│ │ │ 8147015 │ 8147015 │ 8147015 │ 8147015 │ │ +│ │ │ 397 MB │ 397 MB │ 397 MB │ 397 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 8414509 │ 8414509 │ 8414509 │ 8414509 │ │ -│ │ │ 745 MB │ 745 MB │ 745 MB │ 745 MB │ │ +│ │ │ 8458523 │ 8458523 │ 8458523 │ 8458523 │ │ +│ │ │ 750.9 MB │ 750.9 MB │ 750.9 MB │ 750.9 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 1466108 │ 1466108 │ 1466108 │ 1466108 │ │ -│ │ │ 332.1 MB │ 332.1 MB │ 332.1 MB │ 332.1 MB │ │ +│ │ │ 1478494 │ 1478494 │ 1478494 │ 1478494 │ │ +│ │ │ 335.3 MB │ 335.3 MB │ 335.3 MB │ 335.3 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 512 │ │ │ │ │ -│ │ │ ╰─ 
commonmark_spec 290.2 ms │ 292.4 ms │ 291 ms │ 291 ms │ 100 │ 100 -│ │ │ 706.4 KB/s │ 700.9 KB/s │ 704.5 KB/s │ 704.4 KB/s │ │ +│ │ │ ╰─ commonmark_spec 289.4 ms │ 295 ms │ 290.6 ms │ 291.1 ms │ 100 │ 100 +│ │ │ 708.4 KB/s │ 694.7 KB/s │ 705.2 KB/s │ 704.1 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 2947757 │ 2947757 │ 2947757 │ 2947757 │ │ -│ │ │ 138.2 MB │ 138.2 MB │ 138.2 MB │ 138.2 MB │ │ +│ │ │ 2960718 │ 2960718 │ 2960718 │ 2960718 │ │ +│ │ │ 138.8 MB │ 138.8 MB │ 138.8 MB │ 138.8 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 3259265 │ 3259265 │ 3259265 │ 3259265 │ │ -│ │ │ 260.4 MB │ 260.4 MB │ 260.4 MB │ 260.4 MB │ │ +│ │ │ 3272226 │ 3272226 │ 3272226 │ 3272226 │ │ +│ │ │ 261.6 MB │ 261.6 MB │ 261.6 MB │ 261.6 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 549664 │ 549664 │ 549664 │ 549664 │ │ -│ │ │ 103.6 MB │ 103.6 MB │ 103.6 MB │ 103.6 MB │ │ +│ │ │ 552129 │ 552129 │ 552129 │ 552129 │ │ +│ │ │ 104.2 MB │ 104.2 MB │ 104.2 MB │ 104.2 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 4096 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 157.4 ms │ 165.7 ms │ 158.3 ms │ 158.7 ms │ 100 │ 100 -│ │ │ 1.302 MB/s │ 1.236 MB/s │ 1.294 MB/s │ 1.291 MB/s │ │ +│ │ │ ╰─ commonmark_spec 156.7 ms │ 159.7 ms │ 157.4 ms │ 157.4 ms │ 100 │ 100 +│ │ │ 1.308 MB/s │ 1.283 MB/s │ 1.302 MB/s │ 1.302 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 1652074 │ 1652074 │ 1652074 │ 1652074 │ │ -│ │ │ 76.51 MB │ 76.51 MB │ 76.51 MB │ 76.51 MB │ │ +│ │ │ 1652427 │ 1652427 │ 1652427 │ 1652427 │ │ +│ │ │ 76.53 MB │ 76.53 MB │ 76.53 MB │ 76.53 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 1963582 │ 1963582 │ 1963582 │ 1963582 │ │ +│ │ │ 1963935 │ 1963935 │ 1963935 │ 1963935 │ │ │ │ │ 150.6 MB │ 150.6 MB │ 150.6 MB │ 150.6 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 308304 │ 308304 │ 308304 │ 308304 │ │ -│ │ │ 55.62 MB │ 55.62 MB │ 55.62 MB │ 55.62 MB │ │ +│ │ │ 308404 │ 308404 │ 308404 │ 308404 │ │ +│ │ │ 55.63 MB │ 55.63 MB │ 55.63 MB │ 55.63 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 
│ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ╰─ 32768 │ │ │ │ │ -│ │ ╰─ commonmark_spec 72.99 ms │ 81.69 ms │ 73.33 ms │ 73.45 ms │ 100 │ 100 -│ │ 2.808 MB/s │ 2.509 MB/s │ 2.795 MB/s │ 2.791 MB/s │ │ +│ │ ╰─ commonmark_spec 72.69 ms │ 74.22 ms │ 73.04 ms │ 73.14 ms │ 100 │ 100 +│ │ 2.82 MB/s │ 2.762 MB/s │ 2.807 MB/s │ 2.802 MB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 750071 │ 750071 │ 750071 │ 750071 │ │ +│ │ 750087 │ 750087 │ 750087 │ 750087 │ │ │ │ 34.96 MB │ 34.96 MB │ 34.96 MB │ 34.96 MB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 1061579 │ 1061579 │ 1061579 │ 1061579 │ │ +│ │ 1061595 │ 1061595 │ 1061595 │ 1061595 │ │ │ │ 78.87 MB │ 78.87 MB │ 78.87 MB │ 78.87 MB │ │ │ │ grow: │ │ │ │ │ -│ │ 141695 │ 141695 │ 141695 │ 141695 │ │ +│ │ 141696 │ 141696 │ 141696 │ 141696 │ │ │ │ 25.39 MB │ 25.39 MB │ 25.39 MB │ 25.39 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ╰─ tokenizers │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ ╰─ commonmark_spec 1.57 s │ 1.712 s │ 1.586 s │ 1.59 s │ 100 │ 100 -│ │ 130.5 KB/s │ 119.7 KB/s │ 129.2 KB/s │ 128.9 KB/s │ │ +│ │ ╰─ commonmark_spec 1.367 s │ 1.415 s │ 1.379 s │ 1.381 s │ 100 │ 100 +│ │ 149.8 KB/s │ 144.8 KB/s │ 148.6 KB/s │ 148.4 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 39101784 │ 39101784 │ 39101784 │ 39101784 │ │ -│ │ 3.731 GB │ 3.731 GB │ 3.731 GB │ 3.731 GB │ │ +│ │ 32934478 │ 32934478 │ 32934478 │ 32934478 │ │ +│ │ 3.172 GB │ 3.172 GB │ 3.172 GB │ 3.172 GB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 39159837 │ 39159837 │ 39159837 │ 39159837 │ │ -│ │ 5.853 GB │ 5.853 GB │ 5.853 GB │ 5.853 GB │ │ +│ │ 32992531 │ 32992531 │ 32992531 │ 32992531 │ │ +│ │ 4.959 GB │ 4.959 GB │ 4.959 GB │ 4.959 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 1329519 │ 1329519 │ 1329519 │ 1329519 │ │ -│ │ 2.117 GB │ 2.117 GB │ 2.117 GB │ 2.117 GB │ │ +│ │ 1210155 │ 1210155 │ 1210155 │ 1210155 │ │ +│ │ 1.782 GB │ 1.782 GB │ 1.782 GB │ 1.782 GB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ 512 │ │ │ │ │ -│ │ ╰─ commonmark_spec 
621.9 ms │ 652.9 ms │ 630.6 ms │ 630.4 ms │ 100 │ 100 -│ │ 329.6 KB/s │ 314 KB/s │ 325.1 KB/s │ 325.1 KB/s │ │ +│ │ ╰─ commonmark_spec 621.5 ms │ 657.1 ms │ 630 ms │ 630.5 ms │ 100 │ 100 +│ │ 329.8 KB/s │ 311.9 KB/s │ 325.3 KB/s │ 325.1 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 16074028 │ 16074028 │ 16074028 │ 16074028 │ │ -│ │ 1.579 GB │ 1.579 GB │ 1.579 GB │ 1.579 GB │ │ +│ │ 16110533 │ 16110533 │ 16110533 │ 16110533 │ │ +│ │ 1.581 GB │ 1.581 GB │ 1.581 GB │ 1.581 GB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 16132081 │ 16132081 │ 16132081 │ 16132081 │ │ -│ │ 2.438 GB │ 2.438 GB │ 2.438 GB │ 2.438 GB │ │ +│ │ 16168586 │ 16168586 │ 16168586 │ 16168586 │ │ +│ │ 2.442 GB │ 2.442 GB │ 2.442 GB │ 2.442 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 392879 │ 392879 │ 392879 │ 392879 │ │ -│ │ 854.6 MB │ 854.6 MB │ 854.6 MB │ 854.6 MB │ │ +│ │ 401309 │ 401309 │ 401309 │ 401309 │ │ +│ │ 856 MB │ 856 MB │ 856 MB │ 856 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ 4096 │ │ │ │ │ -│ │ ╰─ commonmark_spec 322.5 ms │ 329.3 ms │ 325 ms │ 325 ms │ 100 │ 100 -│ │ 635.6 KB/s │ 622.5 KB/s │ 630.6 KB/s │ 630.6 KB/s │ │ +│ │ ╰─ commonmark_spec 324 ms │ 329.6 ms │ 325.8 ms │ 326.3 ms │ 100 │ 100 +│ │ 632.7 KB/s │ 622 KB/s │ 629.1 KB/s │ 628.3 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 8491158 │ 8491158 │ 8491158 │ 8491158 │ │ -│ │ 843.3 MB │ 843.3 MB │ 843.3 MB │ 843.3 MB │ │ +│ │ 8494397 │ 8494397 │ 8494397 │ 8494397 │ │ +│ │ 843.6 MB │ 843.6 MB │ 843.6 MB │ 843.6 MB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 8549211 │ 8549211 │ 8549211 │ 8549211 │ │ +│ │ 8552450 │ 8552450 │ 8552450 │ 8552450 │ │ │ │ 1.292 GB │ 1.292 GB │ 1.292 GB │ 1.292 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 168021 │ 168021 │ 168021 │ 168021 │ │ -│ │ 444.1 MB │ 444.1 MB │ 444.1 MB │ 444.1 MB │ │ +│ │ 168534 │ 168534 │ 168534 │ 168534 │ │ +│ │ 444.3 MB │ 444.3 MB │ 444.3 MB │ 444.3 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ╰─ 32768 │ │ │ │ │ -│ ╰─ commonmark_spec 177.1 ms │ 184.9 
ms │ 178.2 ms │ 179.3 ms │ 100 │ 100 -│ 1.157 MB/s │ 1.108 MB/s │ 1.149 MB/s │ 1.143 MB/s │ │ +│ ╰─ commonmark_spec 177.4 ms │ 186.2 ms │ 181.3 ms │ 180.9 ms │ 100 │ 100 +│ 1.155 MB/s │ 1.1 MB/s │ 1.13 MB/s │ 1.132 MB/s │ │ │ alloc: │ │ │ │ │ -│ 4579919 │ 4579919 │ 4579919 │ 4579919 │ │ -│ 460.2 MB │ 460.2 MB │ 460.2 MB │ 460.2 MB │ │ +│ 4580000 │ 4580000 │ 4580000 │ 4580000 │ │ +│ 460.3 MB │ 460.3 MB │ 460.3 MB │ 460.3 MB │ │ │ dealloc: │ │ │ │ │ -│ 4637972 │ 4637972 │ 4637972 │ 4637972 │ │ +│ 4638053 │ 4638053 │ 4638053 │ 4638053 │ │ │ 698.6 MB │ 698.6 MB │ 698.6 MB │ 698.6 MB │ │ │ grow: │ │ │ │ │ │ 79598 │ 79598 │ 79598 │ 79598 │ │ @@ -191,8 +191,8 @@ chunk_size fastest │ slowest │ median ╰─ text │ │ │ │ │ ├─ characters │ │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 208.2 ms │ 222 ms │ 208.9 ms │ 209.4 ms │ 100 │ 100 - │ │ │ 785.6 KB/s │ 736.9 KB/s │ 783 KB/s │ 781.3 KB/s │ │ + │ │ ├─ romeo_and_juliet 222.8 ms │ 226 ms │ 223.2 ms │ 223.4 ms │ 100 │ 100 + │ │ │ 734.3 KB/s │ 723.7 KB/s │ 732.7 KB/s │ 732.3 KB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 11187 │ 11187 │ 11187 │ 11189 │ │ │ │ │ 32.32 MB │ 32.32 MB │ 32.32 MB │ 32.32 MB │ │ @@ -205,8 +205,8 @@ chunk_size fastest │ slowest │ median │ │ │ shrink: │ │ │ │ │ │ │ │ 0 │ 0 │ 0 │ 0.05 │ │ │ │ │ 0 B │ 0 B │ 0 B │ 23.4 B │ │ - │ │ ╰─ room_with_a_view 162.1 ms │ 165 ms │ 162.4 ms │ 162.6 ms │ 100 │ 100 - │ │ 1.861 MB/s │ 1.828 MB/s │ 1.857 MB/s │ 1.856 MB/s │ │ + │ │ ╰─ room_with_a_view 173.7 ms │ 176.9 ms │ 174.1 ms │ 174.2 ms │ 100 │ 100 + │ │ 1.737 MB/s │ 1.706 MB/s │ 1.733 MB/s │ 1.732 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 18429 │ 18429 │ 18429 │ 18429 │ │ │ │ 26.32 MB │ 26.32 MB │ 26.32 MB │ 26.32 MB │ │ @@ -217,8 +217,8 @@ chunk_size fastest │ slowest │ median │ │ 48813 │ 48813 │ 48813 │ 48813 │ │ │ │ 66.19 MB │ 66.19 MB │ 66.19 MB │ 66.19 MB │ │ │ ├─ 512 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 24.83 ms │ 25.61 ms │ 24.92 ms │ 24.94 ms │ 100 │ 100 - │ │ │ 6.587 MB/s │ 6.387 MB/s │ 6.563 MB/s │ 6.56 MB/s │ │ + │ │ 
├─ romeo_and_juliet 24.97 ms │ 27.09 ms │ 25.24 ms │ 25.26 ms │ 100 │ 100 + │ │ │ 6.55 MB/s │ 6.037 MB/s │ 6.481 MB/s │ 6.476 MB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 1199 │ 1199 │ 1199 │ 1199 │ │ │ │ │ 3.479 MB │ 3.479 MB │ 3.479 MB │ 3.479 MB │ │ @@ -228,8 +228,8 @@ chunk_size fastest │ slowest │ median │ │ │ grow: │ │ │ │ │ │ │ │ 3593 │ 3593 │ 3593 │ 3593 │ │ │ │ │ 9.941 MB │ 9.941 MB │ 9.941 MB │ 9.941 MB │ │ - │ │ ╰─ room_with_a_view 25.42 ms │ 26.42 ms │ 25.49 ms │ 25.51 ms │ 100 │ 100 - │ │ 11.87 MB/s │ 11.42 MB/s │ 11.84 MB/s │ 11.83 MB/s │ │ + │ │ ╰─ room_with_a_view 25.95 ms │ 26.93 ms │ 26.14 ms │ 26.16 ms │ 100 │ 100 + │ │ 11.63 MB/s │ 11.2 MB/s │ 11.54 MB/s │ 11.53 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 2348 │ 2348 │ 2348 │ 2348 │ │ │ │ 3.353 MB │ 3.353 MB │ 3.353 MB │ 3.353 MB │ │ @@ -240,8 +240,8 @@ chunk_size fastest │ slowest │ median │ │ 6217 │ 6217 │ 6217 │ 6217 │ │ │ │ 8.522 MB │ 8.522 MB │ 8.522 MB │ 8.522 MB │ │ │ ├─ 4096 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 3.707 ms │ 4.07 ms │ 3.804 ms │ 3.808 ms │ 100 │ 100 - │ │ │ 44.12 MB/s │ 40.2 MB/s │ 43 MB/s │ 42.96 MB/s │ │ + │ │ ├─ romeo_and_juliet 3.745 ms │ 4.251 ms │ 3.89 ms │ 3.896 ms │ 100 │ 100 + │ │ │ 43.68 MB/s │ 38.48 MB/s │ 42.05 MB/s │ 41.98 MB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 140 │ 140 │ 140 │ 140 │ │ │ │ │ 406.1 KB │ 406.1 KB │ 406.1 KB │ 406.1 KB │ │ @@ -251,8 +251,8 @@ chunk_size fastest │ slowest │ median │ │ │ grow: │ │ │ │ │ │ │ │ 424 │ 424 │ 424 │ 424 │ │ │ │ │ 1.42 MB │ 1.42 MB │ 1.42 MB │ 1.42 MB │ │ - │ │ ╰─ room_with_a_view 4.971 ms │ 5.284 ms │ 5.041 ms │ 5.047 ms │ 100 │ 100 - │ │ 60.72 MB/s │ 57.12 MB/s │ 59.88 MB/s │ 59.81 MB/s │ │ + │ │ ╰─ room_with_a_view 5.155 ms │ 5.47 ms │ 5.237 ms │ 5.244 ms │ 100 │ 100 + │ │ 58.56 MB/s │ 55.19 MB/s │ 57.64 MB/s │ 57.57 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 303 │ 303 │ 303 │ 303 │ │ │ │ 430 KB │ 430 KB │ 430 KB │ 430 KB │ │ @@ -263,8 +263,8 @@ chunk_size fastest │ slowest │ median │ │ 810 │ 810 │ 810 │ 810 │ │ │ │ 1.154 MB │ 1.154 MB │ 1.154 MB 
│ 1.154 MB │ │ │ ╰─ 32768 │ │ │ │ │ - │ ├─ romeo_and_juliet 1.519 ms │ 1.767 ms │ 1.6 ms │ 1.606 ms │ 100 │ 100 - │ │ 107.7 MB/s │ 92.57 MB/s │ 102.2 MB/s │ 101.8 MB/s │ │ + │ ├─ romeo_and_juliet 1.584 ms │ 1.774 ms │ 1.66 ms │ 1.659 ms │ 100 │ 100 + │ │ 103.2 MB/s │ 92.2 MB/s │ 98.51 MB/s │ 98.58 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 32 │ 32 │ 32 │ 32 │ │ │ │ 106.9 KB │ 106.9 KB │ 106.9 KB │ 106.9 KB │ │ @@ -274,8 +274,8 @@ chunk_size fastest │ slowest │ median │ │ grow: │ │ │ │ │ │ │ 105 │ 105 │ 105 │ 105 │ │ │ │ 597.5 KB │ 597.5 KB │ 597.5 KB │ 597.5 KB │ │ - │ ╰─ room_with_a_view 1.768 ms │ 1.947 ms │ 1.82 ms │ 1.827 ms │ 100 │ 100 - │ 170.7 MB/s │ 154.9 MB/s │ 165.8 MB/s │ 165.2 MB/s │ │ + │ ╰─ room_with_a_view 1.883 ms │ 2.02 ms │ 1.897 ms │ 1.902 ms │ 100 │ 100 + │ 160.2 MB/s │ 149.4 MB/s │ 159.1 MB/s │ 158.7 MB/s │ │ │ alloc: │ │ │ │ │ │ 41 │ 41 │ 41 │ 41 │ │ │ 55.95 KB │ 55.95 KB │ 55.95 KB │ 55.95 KB │ │ @@ -287,8 +287,8 @@ chunk_size fastest │ slowest │ median │ 206.2 KB │ 206.2 KB │ 206.2 KB │ 206.2 KB │ │ ├─ tiktoken │ │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 804.4 ms │ 950.7 ms │ 810.7 ms │ 814.4 ms │ 100 │ 100 - │ │ │ 203.3 KB/s │ 172 KB/s │ 201.8 KB/s │ 200.9 KB/s │ │ + │ │ ├─ romeo_and_juliet 798 ms │ 870.9 ms │ 801.1 ms │ 806.8 ms │ 100 │ 100 + │ │ │ 205 KB/s │ 187.8 KB/s │ 204.2 KB/s │ 202.7 KB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 8687900 │ 8687900 │ 8687900 │ 8687900 │ │ │ │ │ 413.2 MB │ 413.2 MB │ 413.2 MB │ 413.2 MB │ │ @@ -298,8 +298,8 @@ chunk_size fastest │ slowest │ median │ │ │ grow: │ │ │ │ │ │ │ │ 1801554 │ 1801554 │ 1801554 │ 1801554 │ │ │ │ │ 245.2 MB │ 245.2 MB │ 245.2 MB │ 245.2 MB │ │ - │ │ ╰─ room_with_a_view 1.068 s │ 1.265 s │ 1.104 s │ 1.116 s │ 100 │ 100 - │ │ 282.6 KB/s │ 238.6 KB/s │ 273.4 KB/s │ 270.3 KB/s │ │ + │ │ ╰─ room_with_a_view 1.057 s │ 1.096 s │ 1.062 s │ 1.063 s │ 100 │ 100 + │ │ 285.6 KB/s │ 275.2 KB/s │ 284 KB/s │ 284 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 11500302 │ 11500302 │ 11500302 │ 11500302 │ │ │ │ 551.9 
MB │ 551.9 MB │ 551.9 MB │ 551.9 MB │ │ @@ -310,8 +310,8 @@ chunk_size fastest │ slowest │ median │ │ 2834268 │ 2834268 │ 2834268 │ 2834268 │ │ │ │ 370.6 MB │ 370.6 MB │ 370.6 MB │ 370.6 MB │ │ │ ├─ 512 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 281.6 ms │ 303.8 ms │ 284.9 ms │ 288.1 ms │ 100 │ 100 - │ │ │ 580.9 KB/s │ 538.4 KB/s │ 574.1 KB/s │ 567.8 KB/s │ │ + │ │ ├─ romeo_and_juliet 260.3 ms │ 263.4 ms │ 261.2 ms │ 261.3 ms │ 100 │ 100 + │ │ │ 628.4 KB/s │ 621.1 KB/s │ 626.2 KB/s │ 626 KB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 2921739 │ 2921739 │ 2921739 │ 2921739 │ │ │ │ │ 137.3 MB │ 137.3 MB │ 137.3 MB │ 137.3 MB │ │ @@ -321,8 +321,8 @@ chunk_size fastest │ slowest │ median │ │ │ grow: │ │ │ │ │ │ │ │ 606067 │ 606067 │ 606067 │ 606067 │ │ │ │ │ 78.61 MB │ 78.61 MB │ 78.61 MB │ 78.61 MB │ │ - │ │ ╰─ room_with_a_view 479.2 ms │ 494.8 ms │ 484.3 ms │ 484.4 ms │ 100 │ 100 - │ │ 629.9 KB/s │ 610.1 KB/s │ 623.3 KB/s │ 623.2 KB/s │ │ + │ │ ╰─ room_with_a_view 438.9 ms │ 444.8 ms │ 440.6 ms │ 440.7 ms │ 100 │ 100 + │ │ 687.8 KB/s │ 678.7 KB/s │ 685.1 KB/s │ 684.9 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 4881127 │ 4881127 │ 4881127 │ 4881127 │ │ │ │ 232.3 MB │ 232.3 MB │ 232.3 MB │ 232.3 MB │ │ @@ -333,8 +333,8 @@ chunk_size fastest │ slowest │ median │ │ 1197367 │ 1197367 │ 1197367 │ 1197367 │ │ │ │ 152.9 MB │ 152.9 MB │ 152.9 MB │ 152.9 MB │ │ │ ├─ 4096 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 193.2 ms │ 205.3 ms │ 196.1 ms │ 196.3 ms │ 100 │ 100 - │ │ │ 846.4 KB/s │ 796.8 KB/s │ 834.1 KB/s │ 833.4 KB/s │ │ + │ │ ├─ romeo_and_juliet 177.8 ms │ 181 ms │ 178.7 ms │ 178.8 ms │ 100 │ 100 + │ │ │ 919.8 KB/s │ 903.6 KB/s │ 915.3 KB/s │ 914.7 KB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 2018345 │ 2018345 │ 2018345 │ 2018345 │ │ │ │ │ 94.51 MB │ 94.51 MB │ 94.51 MB │ 94.51 MB │ │ @@ -344,8 +344,8 @@ chunk_size fastest │ slowest │ median │ │ │ grow: │ │ │ │ │ │ │ │ 418449 │ 418449 │ 418449 │ 418449 │ │ │ │ │ 53.36 MB │ 53.36 MB │ 53.36 MB │ 53.36 MB │ │ - │ │ ╰─ room_with_a_view 352 ms │ 361.4 ms │ 355 ms 
│ 355.4 ms │ 100 │ 100 - │ │ 857.5 KB/s │ 835.2 KB/s │ 850.2 KB/s │ 849.2 KB/s │ │ + │ │ ╰─ room_with_a_view 320 ms │ 324.7 ms │ 321.5 ms │ 321.5 ms │ 100 │ 100 + │ │ 943.3 KB/s │ 929.6 KB/s │ 939 KB/s │ 938.8 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 3573120 │ 3573120 │ 3573120 │ 3573120 │ │ │ │ 169.7 MB │ 169.7 MB │ 169.7 MB │ 169.7 MB │ │ @@ -356,8 +356,8 @@ chunk_size fastest │ slowest │ median │ │ 874505 │ 874505 │ 874505 │ 874505 │ │ │ │ 110.8 MB │ 110.8 MB │ 110.8 MB │ 110.8 MB │ │ │ ╰─ 32768 │ │ │ │ │ - │ ├─ romeo_and_juliet 90.55 ms │ 94.8 ms │ 92.07 ms │ 92.26 ms │ 100 │ 100 - │ │ 1.806 MB/s │ 1.725 MB/s │ 1.777 MB/s │ 1.773 MB/s │ │ + │ ├─ romeo_and_juliet 81.97 ms │ 84.55 ms │ 82.41 ms │ 82.43 ms │ 100 │ 100 + │ │ 1.995 MB/s │ 1.935 MB/s │ 1.985 MB/s │ 1.984 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 914680 │ 914680 │ 914680 │ 914680 │ │ │ │ 42.85 MB │ 42.85 MB │ 42.85 MB │ 42.85 MB │ │ @@ -367,8 +367,8 @@ chunk_size fastest │ slowest │ median │ │ grow: │ │ │ │ │ │ │ 187705 │ 187705 │ 187705 │ 187705 │ │ │ │ 24.37 MB │ 24.37 MB │ 24.37 MB │ 24.37 MB │ │ - │ ╰─ room_with_a_view 124.3 ms │ 130.2 ms │ 126.9 ms │ 126.9 ms │ 100 │ 100 - │ 2.427 MB/s │ 2.318 MB/s │ 2.378 MB/s │ 2.378 MB/s │ │ + │ ╰─ room_with_a_view 112.1 ms │ 115.5 ms │ 112.8 ms │ 112.9 ms │ 100 │ 100 + │ 2.692 MB/s │ 2.611 MB/s │ 2.675 MB/s │ 2.673 MB/s │ │ │ alloc: │ │ │ │ │ │ 1232442 │ 1232442 │ 1232442 │ 1232442 │ │ │ 58.6 MB │ 58.6 MB │ 58.6 MB │ 58.6 MB │ │ @@ -380,8 +380,8 @@ chunk_size fastest │ slowest │ median │ 38.19 MB │ 38.19 MB │ 38.19 MB │ 38.19 MB │ │ ╰─ tokenizers │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ ├─ romeo_and_juliet 1.479 s │ 1.58 s │ 1.51 s │ 1.514 s │ 100 │ 100 - │ │ 110.5 KB/s │ 103.5 KB/s │ 108.3 KB/s │ 108 KB/s │ │ + │ ├─ romeo_and_juliet 1.407 s │ 1.457 s │ 1.411 s │ 1.413 s │ 100 │ 100 + │ │ 116.2 KB/s │ 112.2 KB/s │ 115.9 KB/s │ 115.7 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 29188727 │ 29188727 │ 29188727 │ 29188727 │ │ │ │ 3.601 GB │ 3.601 GB │ 3.601 GB │ 3.601 GB │ │ @@ -391,8 +391,8 @@ 
chunk_size fastest │ slowest │ median │ │ grow: │ │ │ │ │ │ │ 463030 │ 463030 │ 463030 │ 463030 │ │ │ │ 1.608 GB │ 1.608 GB │ 1.608 GB │ 1.608 GB │ │ - │ ╰─ room_with_a_view 2.063 s │ 2.089 s │ 2.074 s │ 2.075 s │ 100 │ 100 - │ 146.3 KB/s │ 144.4 KB/s │ 145.5 KB/s │ 145.4 KB/s │ │ + │ ╰─ room_with_a_view 1.944 s │ 2.176 s │ 1.952 s │ 1.965 s │ 100 │ 100 + │ 155.2 KB/s │ 138.6 KB/s │ 154.6 KB/s │ 153.5 KB/s │ │ │ alloc: │ │ │ │ │ │ 39390415 │ 39390415 │ 39390415 │ 39390415 │ │ │ 5.158 GB │ 5.158 GB │ 5.158 GB │ 5.158 GB │ │ @@ -403,8 +403,8 @@ chunk_size fastest │ slowest │ median │ 681203 │ 681203 │ 681203 │ 681203 │ │ │ 2.257 GB │ 2.257 GB │ 2.257 GB │ 2.257 GB │ │ ├─ 512 │ │ │ │ │ - │ ├─ romeo_and_juliet 465.3 ms │ 476.5 ms │ 469.4 ms │ 469.4 ms │ 100 │ 100 - │ │ 351.6 KB/s │ 343.3 KB/s │ 348.5 KB/s │ 348.5 KB/s │ │ + │ ├─ romeo_and_juliet 439.6 ms │ 478.3 ms │ 441.5 ms │ 443.1 ms │ 100 │ 100 + │ │ 372.1 KB/s │ 342 KB/s │ 370.5 KB/s │ 369.2 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 9331640 │ 9331640 │ 9331640 │ 9331640 │ │ │ │ 1.159 GB │ 1.159 GB │ 1.159 GB │ 1.159 GB │ │ @@ -414,8 +414,8 @@ chunk_size fastest │ slowest │ median │ │ grow: │ │ │ │ │ │ │ 100057 │ 100057 │ 100057 │ 100057 │ │ │ │ 513.7 MB │ 513.7 MB │ 513.7 MB │ 513.7 MB │ │ - │ ╰─ room_with_a_view 853.3 ms │ 907.1 ms │ 861.3 ms │ 865.8 ms │ 100 │ 100 - │ 353.7 KB/s │ 332.8 KB/s │ 350.5 KB/s │ 348.6 KB/s │ │ + │ ╰─ room_with_a_view 800.8 ms │ 812.7 ms │ 803.8 ms │ 804 ms │ 100 │ 100 + │ 377 KB/s │ 371.4 KB/s │ 375.5 KB/s │ 375.5 KB/s │ │ │ alloc: │ │ │ │ │ │ 16335239 │ 16335239 │ 16335239 │ 16335239 │ │ │ 2.154 GB │ 2.154 GB │ 2.154 GB │ 2.154 GB │ │ @@ -426,8 +426,8 @@ chunk_size fastest │ slowest │ median │ 165453 │ 165453 │ 165453 │ 165453 │ │ │ 945.8 MB │ 945.8 MB │ 945.8 MB │ 945.8 MB │ │ ├─ 4096 │ │ │ │ │ - │ ├─ romeo_and_juliet 318.5 ms │ 349.5 ms │ 321.8 ms │ 322.6 ms │ 100 │ 100 - │ │ 513.6 KB/s │ 468 KB/s │ 508.3 KB/s │ 507.1 KB/s │ │ + │ ├─ romeo_and_juliet 299.2 ms │ 305.3 ms │ 300.4 ms │ 300.6 
ms │ 100 │ 100 + │ │ 546.7 KB/s │ 535.8 KB/s │ 544.5 KB/s │ 544.2 KB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 6432739 │ 6432739 │ 6432739 │ 6432739 │ │ │ │ 802.5 MB │ 802.5 MB │ 802.5 MB │ 802.5 MB │ │ @@ -437,8 +437,8 @@ chunk_size fastest │ slowest │ median │ │ grow: │ │ │ │ │ │ │ 34248 │ 34248 │ 34248 │ 34248 │ │ │ │ 356.8 MB │ 356.8 MB │ 356.8 MB │ 356.8 MB │ │ - │ ╰─ room_with_a_view 607.1 ms │ 618.4 ms │ 612.7 ms │ 612.6 ms │ 100 │ 100 - │ 497.2 KB/s │ 488.2 KB/s │ 492.7 KB/s │ 492.8 KB/s │ │ + │ ╰─ room_with_a_view 566.8 ms │ 656.5 ms │ 569.7 ms │ 571.6 ms │ 100 │ 100 + │ 532.6 KB/s │ 459.8 KB/s │ 529.8 KB/s │ 528.1 KB/s │ │ │ alloc: │ │ │ │ │ │ 11604278 │ 11604278 │ 11604278 │ 11604278 │ │ │ 1.536 GB │ 1.536 GB │ 1.536 GB │ 1.536 GB │ │ @@ -449,8 +449,8 @@ chunk_size fastest │ slowest │ median │ 55752 │ 55752 │ 55752 │ 55752 │ │ │ 674.3 MB │ 674.3 MB │ 674.3 MB │ 674.3 MB │ │ ╰─ 32768 │ │ │ │ │ - ├─ romeo_and_juliet 143.6 ms │ 160.8 ms │ 146 ms │ 146.3 ms │ 100 │ 100 - │ 1.138 MB/s │ 1.017 MB/s │ 1.12 MB/s │ 1.118 MB/s │ │ + ├─ romeo_and_juliet 133 ms │ 135.5 ms │ 133.6 ms │ 133.7 ms │ 100 │ 100 + │ 1.23 MB/s │ 1.207 MB/s │ 1.224 MB/s │ 1.223 MB/s │ │ │ alloc: │ │ │ │ │ │ 2845251 │ 2845251 │ 2845251 │ 2845251 │ │ │ 353.7 MB │ 353.7 MB │ 353.7 MB │ 353.7 MB │ │ @@ -460,8 +460,8 @@ chunk_size fastest │ slowest │ median │ grow: │ │ │ │ │ │ 9648 │ 9648 │ 9648 │ 9648 │ │ │ 158.8 MB │ 158.8 MB │ 158.8 MB │ 158.8 MB │ │ - ╰─ room_with_a_view 238.4 ms │ 264.2 ms │ 242.4 ms │ 243 ms │ 100 │ 100 - 1.266 MB/s │ 1.142 MB/s │ 1.245 MB/s │ 1.242 MB/s │ │ + ╰─ room_with_a_view 219.7 ms │ 223.5 ms │ 220.2 ms │ 220.4 ms │ 100 │ 100 + 1.373 MB/s │ 1.35 MB/s │ 1.37 MB/s │ 1.369 MB/s │ │ alloc: │ │ │ │ │ 4490083 │ 4490083 │ 4490083 │ 4490083 │ │ 594.3 MB │ 594.3 MB │ 594.3 MB │ 594.3 MB │ │ diff --git a/src/lib.rs b/src/lib.rs index 98147e6..be2a517 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -267,18 +267,21 @@ trait SemanticSplit { /// Generate a new instance from a given text. 
fn new(text: &str) -> Self; - /// Retrieve ranges for each semantic level in the entire text - fn ranges(&self) -> impl Iterator)> + '_; + /// Retrieve ranges for each semantic level in the entire text that appear after a given offset + fn ranges_after_offset( + &self, + offset: usize, + ) -> impl Iterator)> + '_; /// Retrieve ranges for all sections of a given level after an offset - fn ranges_after_offset( + fn level_ranges_after_offset( &self, offset: usize, level: Self::Level, ) -> impl Iterator)> + '_ { - let first_item = self.ranges().find(|(l, _)| l == &level); - self.ranges() - .filter(move |(l, sep)| l >= &level && sep.start >= offset) + let first_item = self.ranges_after_offset(offset).find(|(l, _)| l == &level); + self.ranges_after_offset(offset) + .filter(move |(l, _)| l >= &level) .skip_while(move |(l, r)| { first_item.is_some_and(|(_, fir)| l > &level && r.contains(&fir.start)) }) @@ -287,11 +290,7 @@ trait SemanticSplit { /// Return a unique, sorted list of all line break levels present before the next max level, added /// to all of the base semantic levels, in order from smallest to largest fn levels_in_remaining_text(&self, offset: usize) -> impl Iterator + '_ { - let existing_levels = self - .ranges() - // Only start taking them from the offset - .filter(|(_, sep)| sep.start >= offset) - .map(|(l, _)| l); + let existing_levels = self.ranges_after_offset(offset).map(|(l, _)| l); Self::PERSISTENT_LEVELS .iter() diff --git a/src/markdown.rs b/src/markdown.rs index b3bec9b..f4fa376 100644 --- a/src/markdown.rs +++ b/src/markdown.rs @@ -409,8 +409,13 @@ impl SemanticSplit for Markdown { Self { ranges } } - fn ranges(&self) -> impl Iterator)> + '_ { - self.ranges.iter() + fn ranges_after_offset( + &self, + offset: usize, + ) -> impl Iterator)> + '_ { + self.ranges + .iter() + .filter(move |(_, sep)| sep.start >= offset) } /// Split a given text into iterator over each semantic chunk @@ -447,7 +452,7 @@ impl SemanticSplit for Markdown { | 
SemanticLevel::Rule | SemanticLevel::Metadata => Self::split_str_by_separator( text, - self.ranges_after_offset(offset, semantic_level) + self.level_ranges_after_offset(offset, semantic_level) .map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)), ) .map(move |(i, str)| (offset + i, str)), @@ -656,7 +661,7 @@ mod tests { &(SemanticLevel::Block, 0..41), &(SemanticLevel::Text, 0..41) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -686,7 +691,7 @@ mod tests { ), &(SemanticLevel::Text, 28..42), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -703,7 +708,7 @@ mod tests { 8..12 ), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -719,7 +724,7 @@ mod tests { 0..6 ) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -736,7 +741,7 @@ mod tests { ), &(SemanticLevel::Text, 1..9), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -753,7 +758,7 @@ mod tests { ), &(SemanticLevel::Text, 2..10), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -770,7 +775,7 @@ mod tests { ), &(SemanticLevel::Text, 2..10), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -787,7 +792,7 @@ mod tests { ), &(SemanticLevel::Text, 1..5), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -804,7 +809,7 @@ mod tests { ), &(SemanticLevel::Text, 2..6), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -825,7 +830,7 @@ mod tests { 15..22 ), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -841,7 +846,7 @@ mod tests { ), &(SemanticLevel::Block, 0..20) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -880,7 +885,7 @@ mod tests { ), 
&(SemanticLevel::Text, 49..55) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -895,7 +900,7 @@ mod tests { &(SemanticLevel::SoftBreak, 9..10), &(SemanticLevel::Text, 10..26) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -913,7 +918,7 @@ mod tests { ), &(SemanticLevel::Text, 11..27) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -927,7 +932,7 @@ mod tests { &(SemanticLevel::Block, 10..18), &(SemanticLevel::Text, 10..18) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -937,7 +942,7 @@ mod tests { assert_eq!( vec![&(SemanticLevel::Block, 0..12), &(SemanticLevel::Text, 4..9)], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -954,7 +959,7 @@ mod tests { &(SemanticLevel::Block, 2..7), &(SemanticLevel::Text, 2..7) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -970,7 +975,7 @@ mod tests { &(SemanticLevel::Block, 16..27), &(SemanticLevel::Text, 16..27) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -994,7 +999,7 @@ mod tests { &(SemanticLevel::Heading(level), 0..9 + index), &(SemanticLevel::Text, 2 + index..9 + index) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } } @@ -1006,7 +1011,7 @@ mod tests { assert_eq!( vec![&(SemanticLevel::MetaContainer, 0..42),], markdown - .ranges_after_offset(0, SemanticLevel::MetaContainer) + .level_ranges_after_offset(0, SemanticLevel::MetaContainer) .collect::>() ); } @@ -1027,7 +1032,10 @@ mod tests { ), ], markdown - .ranges_after_offset(0, SemanticLevel::ContainerBlock(SemanticSplitPosition::Own)) + .level_ranges_after_offset( + 0, + SemanticLevel::ContainerBlock(SemanticSplitPosition::Own) + ) .collect::>() ); } diff --git a/src/text.rs b/src/text.rs index a953afb..14cc1e5 100644 --- a/src/text.rs 
+++ b/src/text.rs @@ -254,8 +254,13 @@ impl SemanticSplit for LineBreaks { } /// Retrieve ranges for all sections of a given level after an offset - fn ranges(&self) -> impl Iterator)> + '_ { - self.line_breaks.iter() + fn ranges_after_offset( + &self, + offset: usize, + ) -> impl Iterator)> + '_ { + self.line_breaks + .iter() + .filter(move |(_, sep)| sep.start >= offset) } /// Split a given text into iterator over each semantic chunk @@ -284,7 +289,7 @@ impl SemanticSplit for LineBreaks { .map(move |(i, str)| (offset + i, str)), SemanticLevel::LineBreak(_) => Self::split_str_by_separator( text, - self.ranges_after_offset(offset, semantic_level) + self.level_ranges_after_offset(offset, semantic_level) .map(move |(_, sep)| sep.start - offset..sep.end - offset), ) .map(move |(i, str)| (offset + i, str)), diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md-2.snap index ba351de..9a85d53 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md-2.snap @@ -11,11 +11,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nThe point can be illustrated by comparing a sample of\n[AsciiDoc](https://asciidoc.org/) with\nan equivalent sample of Markdown. Here is a sample of\nAsciiDoc from the AsciiDoc manual:\n\n```\n1. List item one.\n+\nList item one continued with a second paragraph followed by an\nIndented block.\n+\n.................\n$ ls *.sh\n$ mv *.sh ~/tmp\n.................\n+\nList item continued with a third paragraph.\n\n2. List item two continued with an open block.\n+\n--\nThis paragraph is part of the preceding list item.\n\na. This list is nested and does not require explicit item\ncontinuation.\n+\nThis paragraph is part of the preceding list item.\n\nb. 
List item b.\n\nThis paragraph belongs to item two of the outer list.\n--\n```\n\nAnd here is the equivalent in Markdown:\n" - "```\n1. List item one.\n\n List item one continued with a second paragraph followed by an\n Indented block.\n\n $ ls *.sh\n $ mv *.sh ~/tmp\n\n List item continued with a third paragraph.\n\n2. List item two continued with an open block.\n\n This paragraph is part of the preceding list item.\n\n 1. This list is nested and does not require explicit item continuation.\n\n This paragraph is part of the preceding list item.\n\n 2. List item b.\n\n This paragraph belongs to item two of the outer list.\n```\n\nThe AsciiDoc version is, arguably, easier to write. You don't need\nto worry about indentation. But the Markdown version is much easier\nto read. The nesting of list items is apparent to the eye in the\nsource, not just in the processed document.\n\n" - "## Why is a spec needed?\n\nJohn Gruber's [canonical description of Markdown's\nsyntax](https://daringfireball.net/projects/markdown/syntax)\ndoes not specify the syntax unambiguously. Here are some examples of\nquestions it does not answer:\n\n" -- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See [this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)\n\n2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. 
However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank\n" -- " lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```\n\n4. What is the exact rule for determining when list items get\n wrapped in `

` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)\n\n" -- "5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```\n\n7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```\n\n" -- "9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```\n\n11. Can list items include section headings? (`Markdown.pl` does not\n allow this, but does allow blockquotes to include headings.)\n\n ``` markdown\n - # Heading\n ```\n\n12. Can list items be empty?\n\n ``` markdown\n * a\n *\n * b\n ```\n\n13. 
Can link references be defined inside block quotes or list items?\n\n ``` markdown\n > Blockquote [foo].\n >\n > [foo]: /url\n ```\n\n" -- "14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n ``` markdown\n [foo]: /url1\n [foo]: /url2\n\n [foo][]\n ```\n\nIn the absence of a spec, early implementers consulted `Markdown.pl`\nto resolve these ambiguities. But `Markdown.pl` was quite buggy, and\ngave manifestly bad results in many cases, so it was not a\nsatisfactory replacement for a spec.\n\nBecause there is no unambiguous spec, implementations have diverged\nconsiderably. As a result, users are often surprised to find that\na document that renders one way on one system (say, a GitHub wiki)\nrenders differently on another (say, converting to docbook using\npandoc). To make matters worse, because nothing in Markdown counts\nas a \"syntax error,\" the divergence often isn't discovered right away.\n\n" +- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See [this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)\n\n" +- "2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. 
However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank\n lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```\n\n" +- "4. What is the exact rule for determining when list items get\n wrapped in `

` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)\n\n5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```\n\n" +- "7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```\n\n9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```\n\n" +- "11. Can list items include section headings? (`Markdown.pl` does not\n allow this, but does allow blockquotes to include headings.)\n\n ``` markdown\n - # Heading\n ```\n\n12. Can list items be empty?\n\n ``` markdown\n * a\n *\n * b\n ```\n\n13. 
Can link references be defined inside block quotes or list items?\n\n ``` markdown\n > Blockquote [foo].\n >\n > [foo]: /url\n ```\n\n14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n ``` markdown\n [foo]: /url1\n [foo]: /url2\n\n [foo][]\n ```\n\n" +- "In the absence of a spec, early implementers consulted `Markdown.pl`\nto resolve these ambiguities. But `Markdown.pl` was quite buggy, and\ngave manifestly bad results in many cases, so it was not a\nsatisfactory replacement for a spec.\n\nBecause there is no unambiguous spec, implementations have diverged\nconsiderably. As a result, users are often surprised to find that\na document that renders one way on one system (say, a GitHub wiki)\nrenders differently on another (say, converting to docbook using\npandoc). To make matters worse, because nothing in Markdown counts\nas a \"syntax error,\" the divergence often isn't discovered right away.\n\n" - "## About this document\n\nThis document attempts to specify Markdown syntax unambiguously.\nIt contains many examples with side-by-side Markdown and\nHTML. These are intended to double as conformance tests. An\naccompanying script `spec_tests.py` can be used to run the tests\nagainst any Markdown program:\n\n python test/spec_tests.py --spec spec.txt --program PROGRAM\n\nSince this document describes how Markdown is to be parsed into\nan abstract syntax tree, it would have made sense to use an abstract\nrepresentation of the syntax tree instead of HTML. But HTML is capable\nof representing the structural distinctions we need to make, and the\nchoice of HTML for the tests makes it possible to run the tests against\nan implementation without writing an abstract syntax tree renderer.\n" - "\nNote that not every feature of the HTML samples is mandated by\nthe spec. For example, the spec says what counts as a link\ndestination, but it doesn't mandate that non-ASCII characters in\nthe URL be percent-encoded. 
To use the automatic tests,\nimplementers will need to provide a renderer that conforms to\nthe expectations of the spec examples (percent-encoding\nnon-ASCII characters in URLs). But a conforming implementation\ncan use a different renderer and may choose not to\npercent-encode non-ASCII characters in URLs.\n\nThis document is generated from a text file, `spec.txt`, written\nin Markdown with a small extension for the side-by-side tests.\nThe script `tools/makespec.py` can be used to convert `spec.txt` into\nHTML or CommonMark (which can then be converted into other formats).\n\nIn the examples, the `→` character is used to represent tabs.\n\n" - "# Preliminaries\n\n" @@ -118,10 +119,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n``` aa ```\nfoo\n.\n

aa\nfoo

\n````````````````````````````````\n\n\n[Info strings] for tilde code blocks can contain backticks and tildes:\n\n```````````````````````````````` example\n~~~ aa ``` ~~~\nfoo\n~~~\n.\n
foo\n
\n````````````````````````````````\n\n\nClosing code fences cannot have [info strings]:\n" - "\n```````````````````````````````` example\n```\n``` aaa\n```\n.\n
``` aaa\n
\n````````````````````````````````\n\n\n\n" - "## HTML blocks\n\nAn [HTML block](@) is a group of lines that is treated\nas raw HTML (and will not be escaped in HTML output).\n\nThere are seven kinds of [HTML block], which can be defined by their\nstart and end conditions. The block begins with a line that meets a\n[start condition](@) (after up to three optional spaces of indentation).\nIt ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks) containing the current HTML\nblock, if no line is encountered that meets the [end condition]. If\nthe first line meets both the [start condition] and the [end\ncondition], the block will contain just that line.\n\n" -- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n``, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n5. **Start condition:** line begins with the string\n" -- "``.\n\n" -- "6. **Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:** line is followed by a [blank line].\n\n7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line].\n\n" +- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n``, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. 
**Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n" +- "5. **Start condition:** line begins with the string\n``.\n\n" +- "6. " +- "**Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:** line is followed by a [blank line].\n\n7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line].\n\n" - "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\nblock** that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state.\n\nFor instance, `
` within an HTML block started by `` will not affect\nthe parser state; as the HTML block was started in by start condition 6, it\nwill end at any blank line. This can be surprising:\n"
 - "\n```````````````````````````````` example\n
\n
\n**Hello**,\n\n_world_.\n
\n
\n.\n
\n
\n**Hello**,\n

world.\n

\n
\n````````````````````````````````\n\nIn this case, the HTML block is terminated by the blank line — the `**Hello**`\ntext remains verbatim — and regular parsing resumes, with a paragraph,\nemphasised `world` and inline and block HTML following.\n" - "\nAll types of [HTML blocks] except type 7 may interrupt\na paragraph. Blocks of type 7 may not interrupt a paragraph.\n(This restriction is intended to prevent unwanted interpretation\nof long tags inside a wrapped paragraph as starting HTML blocks.)\n\nSome simple examples follow. Here are some basic HTML blocks\nof type 6:\n\n```````````````````````````````` example\n\n \n \n \n
\n hi\n
\n\nokay.\n.\n\n \n \n \n
\n hi\n
\n

okay.

\n````````````````````````````````" @@ -194,8 +196,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n>>> foo\n> bar\n>>baz\n.\n
\n
\n
\n

foo\nbar\nbaz

\n
\n
\n
\n````````````````````````````````\n\n\nWhen including an indented code block in a block quote,\nremember that the [block quote marker] includes\nboth the `>` and a following space of indentation. So *five spaces* are needed\nafter the `>`:\n" - "\n```````````````````````````````` example\n> code\n\n> not code\n.\n
\n
code\n
\n
\n
\n

not code

\n
\n````````````````````````````````\n\n\n\n" - "## List items\n\nA [list marker](@) is a\n[bullet list marker] or an [ordered list marker].\n\nA [bullet list marker](@)\nis a `-`, `+`, or `*` character.\n\nAn [ordered list marker](@)\nis a sequence of 1--9 arabic digits (`0-9`), followed by either a\n`.` character or a `)` character. (The reason for the length\nlimit is that with 10 digits we start seeing integer overflows\nin some browsers.)\n\nThe following rules define [list items]:\n\n" -- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n\n 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2. " -- "If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\nFor example, let *Ls* be the lines\n\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

A paragraph\nwith two lines.

\n
indented code\n
\n
\n

A block quote.

\n
\n````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:\n" +- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n" +- "\n 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\n" +- "For example, let *Ls* be the lines\n\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

A paragraph\nwith two lines.

\n
indented code\n
\n
\n

A block quote.

\n
\n````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:\n" - "\n```````````````````````````````` example\n1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
    \n
  1. \n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    \n
  2. \n
\n````````````````````````````````\n\n\nThe most important thing to notice is that the position of\nthe text after the list marker determines how much indentation\nis needed in subsequent blocks in the list item. If the list\nmarker takes up two spaces of indentation, and there are three spaces between\nthe list marker and the next character other than a space or tab, then blocks\nmust be indented five spaces in order to fall under the list\nitem.\n" - "\nHere are some examples showing how far content must be indented to be\nput under the list item:\n\n```````````````````````````````` example\n- one\n\n two\n.\n
    \n
  • one
  • \n
\n

two

\n````````````````````````````````\n\n\n```````````````````````````````` example\n- one\n\n two\n.\n
    \n
  • \n

    one

    \n

    two

    \n
  • \n
\n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n - one\n\n two\n.\n
    \n
  • one
  • \n
\n
 two\n
\n````````````````````````````````\n\n\n```````````````````````````````` example\n - one\n\n two\n.\n
    \n
  • \n

    one

    \n

    two

    \n
  • \n
\n````````````````````````````````" @@ -284,12 +287,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nA [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n" - " - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n - Both left and right-flanking:\n\n ```\n abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```\n\n" - "(The idea of distinguishing left-flanking and right-flanking\ndelimiter runs based on the character before and the character\nafter comes from Roopesh Chander's\n[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\" and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:\n\n" -- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. 
A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n" -- " iff it is part of a [left-flanking delimiter run].\n\n6. A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n" +- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n" +- "5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n6. 
A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n" - "9. Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are\n multiples of 3.\n\n10. Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing\n delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3.\n\n" - "11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n12. 
A literal `_` character cannot occur at the beginning or end of\n `_`-delimited emphasis or `__`-delimited strong emphasis, unless it\n is backslash-escaped.\n\nWhere rules 1--12 above are compatible with multiple parsings,\nthe following principles resolve ambiguity:\n\n" -- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n" -- " `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`.\n\n" +- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n" +- "16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. 
Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`.\n\n" - "These rules can be illustrated through a series of examples.\n\nRule 1:\n\n```````````````````````````````` example\n*foo bar*\n.\n

foo bar

\n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na * foo bar*\n.\n

a * foo bar*

\n````````````````````````````````" - "\n\n\nThis is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na*\"foo\"*\n.\n

a*"foo"*

\n````````````````````````````````\n\n\nUnicode nonbreaking spaces count as whitespace, too:\n\n```````````````````````````````` example\n* a *\n.\n

* a *

\n````````````````````````````````\n\n\nUnicode symbols count as punctuation, too:\n" - "\n```````````````````````````````` example\n*$*alpha.\n\n*£*bravo.\n\n*€*charlie.\n.\n

*$*alpha.

\n

*£*bravo.

\n

*€*charlie.

\n````````````````````````````````\n\n\nIntraword emphasis with `*` is permitted:\n\n```````````````````````````````` example\nfoo*bar*\n.\n

foobar

\n````````````````````````````````" @@ -491,5 +494,6 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "#### *look for link or image*\n\nStarting at the top of the delimiter stack, we look backwards\nthrough the stack for an opening `[` or `![` delimiter.\n\n" - "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`.\n\n- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image.\n\n + If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`.\n\n + If we do, then\n\n * We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)\n\n" - "#### *process emphasis*\n\nParameter `stack_bottom` sets a lower bound to how far we\ndescend in the [delimiter stack]. If it is NULL, we can\ngo all the way to the bottom. Otherwise, we stop before\nvisiting `stack_bottom`.\n\nLet `current_position` point to the element on the [delimiter stack]\njust above `stack_bottom` (or the first element if `stack_bottom`\nis NULL).\n\nWe keep track of the `openers_bottom` for each delimiter\ntype (`*`, `_`), indexed to the length of the closing delimiter run\n(modulo 3) and to whether the closing delimiter can also be an\nopener. 
Initialize this to `stack_bottom`.\n\nThen we repeat the following until we run out of potential\nclosers:\n\n" -- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. 
If the closing node is removed, reset\n" -- " `current_position` to the next element in the stack.\n\n- If none is found:\n\n + Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack.\n" +- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n" +- "- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. 
If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:\n\n " +- "+ Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack.\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md.snap index cd84420..3597de6 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md.snap @@ -30,9 +30,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "articles, slide shows, letters, and lecture notes.\n\n" - "What distinguishes Markdown from many other lightweight markup\n" - "syntaxes, which are often easier to write, is its readability.\n" -- "As Gruber writes:\n\n" -- "> The overriding design goal for Markdown's formatting syntax is\n" -- "> to make it as readable as possible. The idea is that a\n> " +- "As Gruber writes:" +- "\n\n> The overriding design goal for Markdown's formatting syntax is" +- "\n> to make it as readable as possible. The idea is that a\n> " - "Markdown-formatted document should be publishable as-is, as\n> " - "plain text, without looking like it's been marked up with tags\n> " - "or formatting instructions.\n> (" @@ -73,12 +73,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "to read. 
The nesting of list items is apparent to the eye in the\n" - "source, not just in the processed document.\n\n" - "## Why is a spec needed?\n\n" -- "John Gruber's [canonical description of Markdown's\n" -- "syntax](https://daringfireball.net/projects/" -- "markdown/syntax)\n" +- "John Gruber's " +- "[canonical description of Markdown's\nsyntax" +- "](https://daringfireball.net/projects/markdown" +- "/syntax)\n" - "does not specify the syntax unambiguously. Here are some examples of\n" -- "questions it does not answer:\n\n" -- "1. How much indentation is needed for a sublist? " +- "questions it does not answer:" +- "\n\n1. " +- "How much indentation is needed for a sublist? " - "The spec says that\n " - "continuation paragraphs need to be indented four spaces, but is\n " - "not fully explicit about sublists. It is natural to think that\n " @@ -87,36 +89,38 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "corner case,\" and divergences\n " - "between implementations on this issue often lead to surprises for\n " - "users in real documents. (See " -- "[this comment by John\n" -- " Gruber](https://web.archive.org/web" -- "/20170611172104/http://" +- "[this comment by John\n Gruber" +- "](https://web.archive.org/web/" +- "20170611172104/http://" - article.gmane.org/ -- "gmane.text.markdown.general/1997).)\n\n" -- "2. Is a blank line needed before a block quote or heading?\n" -- " Most implementations do not require the blank line. However,\n " +- gmane.text.markdown.general/1997).) +- "\n\n2. Is a blank line needed before a block quote or heading?\n " +- "Most implementations do not require the blank line. 
However,\n " - "this can lead to unexpected results in hard-wrapped text, and\n " - "also to ambiguities in parsing (note that some implementations\n " - "put the heading inside the blockquote, while others do not).\n " - "(John Gruber has also spoken " -- "[in favor of requiring the blank\n" -- " lines](https://web.archive.org/web/" +- "[in favor of requiring the blank\n lines" +- "](https://web.archive.org/web/" - "20170611172104/http://" - article.gmane.org/ -- "gmane.text.markdown.general/2146).)\n\n" -- "3. Is a blank line needed before an indented code block?\n" -- " (`Markdown.pl` requires it, but this is not mentioned " -- "in the\n documentation, and some implementations do not require it.)\n\n " +- gmane.text.markdown.general/2146).) +- "\n\n3. " +- "Is a blank line needed before an indented code block?\n (" +- "`Markdown.pl` requires it, but this is not mentioned in " +- "the\n documentation, and some implementations do not require it.)\n\n " - "``` markdown\n paragraph\n code?\n ```\n\n" -- "4. What is the exact rule for determining when list items get\n" -- " wrapped in `

`" +- "4. " +- "What is the exact rule for determining when list items get\n wrapped in " +- "`

`" - " tags? Can a list be partially \"loose\" and partially\n \"tight\"" - "? What should we do with a list like this?\n\n " - "``` markdown\n 1. one\n\n 2. two\n 3. three\n" - " ```\n\n Or this?\n" - "\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n" - " ```\n\n " -- "(There are some relevant comments by John Gruber\n" -- " [here](https://web.archive.org/web/" +- "(There are some relevant comments by John Gruber\n " +- "[here](https://web.archive.org/web/" - "20170611172104/http://" - article.gmane.org/ - "gmane.text.markdown.general/2554).)\n\n" @@ -124,32 +128,32 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Can ordered list markers be right-aligned?\n\n " - "``` markdown\n 8. item 1\n 9. item 2\n" - " 10. item 2a\n ```\n\n" -- "6. Is this one list with a thematic break in its second item,\n" -- " or two lists separated by a thematic break?\n\n " +- "6. Is this one list with a thematic break in its second item,\n " +- "or two lists separated by a thematic break?\n\n " - "``` markdown\n * a\n * * * * *\n * b\n" - " ```\n\n" -- "7. When list markers change from numbers to bullets, do we have\n" -- " two lists or one? (The Markdown syntax description suggests two,\n " +- "7. When list markers change from numbers to bullets, do we have\n " +- "two lists or one? (The Markdown syntax description suggests two,\n " - "but the perl scripts and many other implementations produce one.)\n\n " - "``` markdown\n 1. fee\n 2. fie\n - foe\n" - " - fum\n ```\n\n" -- "8. What are the precedence rules for the markers of inline structure?\n" -- " For example, is the following a valid link, or does the code span\n " +- "8. What are the precedence rules for the markers of inline structure?\n " +- "For example, is the following a valid link, or does the code span\n " - "take precedence ?\n\n " - "``` markdown\n" - " [a backtick (`)](/url) and [another " - "backtick (`)](/url).\n ```\n\n" -- "9. What are the precedence rules for markers of emphasis and strong\n emphasis? 
" -- "For example, how should the following be parsed?\n\n " +- "9. What are the precedence rules for markers of emphasis and strong\n " +- "emphasis? For example, how should the following be parsed?\n\n " - "``` markdown\n *foo *bar* baz*\n" - " ```\n\n" -- "10. What are the precedence rules between block-level and inline-level\n" -- " structure? For example, how should the following be parsed?\n\n " +- "10. What are the precedence rules between block-level and inline-level\n " +- "structure? For example, how should the following be parsed?\n\n " - "``` markdown\n" - " - `a long code span can contain a hyphen like this\n " - " - and it can screw things up`\n ```\n\n" -- "11. Can list items include section headings? " -- "(`Markdown.pl`" +- "11. " +- "Can list items include section headings? (`Markdown.pl`" - " does not\n " - "allow this, but does allow blockquotes to include headings.)\n\n " - "``` markdown\n - # Heading\n ```\n\n" @@ -158,8 +162,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "13. Can link references be defined inside block quotes or list items?\n\n" - " ``` markdown\n > Blockquote [foo].\n >\n" - " > [foo]: /url\n ```\n\n" -- "14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n" -- " ``` markdown\n [foo]: /url1\n" +- "14. If there are multiple definitions for the same reference, which takes\n precedence?" +- "\n\n ``` markdown\n [foo]: /url1\n" - " [foo]: /url2\n\n [foo][]\n" - " ```\n\n" - "In the absence of a spec, early implementers consulted `" @@ -199,9 +203,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the expectations of the spec examples (percent-encoding\n" - "non-ASCII characters in URLs). " - "But a conforming implementation\ncan use a different renderer and may choose not to\n" -- "percent-encode non-ASCII characters in URLs.\n\n" -- "This document is generated from a text file, `spec.txt" -- "`, written\n" +- percent-encode non-ASCII characters in URLs. 
+- "\n\nThis document is generated from a text file, " +- "`spec.txt`" +- ", written\n" - "in Markdown with a small extension for the side-by-side tests.\n" - "The script `tools/makespec.py` can be used to convert " - "`spec.txt`" @@ -212,45 +217,44 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "# Preliminaries\n\n" - "## Characters and lines\n\n" - "Any sequence of [characters] is a valid CommonMark\ndocument.\n" -- "\nA [character](@) is a Unicode code point. " -- "Although some\ncode points (for example, combining accents) do not correspond to\n" +- "\nA [character](@)" +- " is a Unicode code point. Although some\n" +- "code points (for example, combining accents) do not correspond to\n" - "characters in an intuitive sense, all code points count as characters\n" - "for purposes of this spec.\n\n" -- "This spec does not specify an encoding; it thinks of lines as composed\n" -- "of [characters]" -- " rather than bytes. A conforming parser may be limited\n" +- "This spec does not specify an encoding; it thinks of lines as composed\nof " +- "[characters] rather than bytes. A conforming parser may be limited\n" - "to a certain encoding.\n\n" - "A [line](@) is a sequence of zero or more [characters" - "]\nother than line feed (`U+000A`" - ") or carriage return (`U+000D`" -- "),\nfollowed by a [line ending] or by the end of file.\n\n" -- "A [line ending](@) is a line feed (`U+" -- "000A`), a carriage return\n(`U+000D`" -- ") not followed by a line feed, or a carriage return and a\n" -- "following line feed.\n\n" -- "A line containing no characters, or a line containing only spaces\n" -- "(`U+0020`) or tabs (`U+" -- "0009`), is called a [blank line](@).\n\n" +- "),\nfollowed by a [line ending] or by the end of file." +- "\n\nA [line ending](@) is a line feed (" +- "`U+000A`), a carriage return\n(" +- "`U+000D`) not followed by a line feed, or a " +- "carriage return and a\nfollowing line feed." 
+- "\n\nA line containing no characters, or a line containing only spaces\n(" +- "`U+0020`) or tabs (`U+0009" +- "`), is called a [blank line](@).\n\n" - "The following definitions of character classes will be used in this spec:\n" -- "\n" -- "A [Unicode whitespace character](@) is a character in " -- "the Unicode `Zs` general\ncategory, or a tab (" -- "`U+0009`), line feed (`U+000A" -- "`), form feed (`U+000C`" -- "), or\ncarriage return (`U+000D`).\n\n" +- "\nA [Unicode whitespace character](@)" +- " is a character in the Unicode `Zs`" +- " general\ncategory, or a tab (`U+0009`" +- "), line feed (`U+000A`), form feed (" +- "`U+000C`), or\ncarriage return (" +- "`U+000D`).\n\n" - "[Unicode whitespace](@) is a sequence of one or " - "more\n[Unicode whitespace characters].\n\n" - "A [tab](@) is `U+0009`.\n" - "\nA [space](@) is `U+0020`.\n" -- "\n" -- "An [ASCII control character](@) is a character between `" -- "U+0000–1F` (both\nincluding) or " +- "\nAn [ASCII control character](@) is a character between " +- "`U+0000–1F` (both\nincluding) or " - "`U+007F`.\n\n" -- "An [ASCII punctuation character](@)\n" -- "is `!`, `\"`, `#`, `$`" -- ", `%`, `&`, `'`, `(`" -- ", `)`,\n`*`, `+`, `,`" -- ", `-`, `.`, `/`" +- "An [ASCII punctuation character](@)\nis " +- "`!`, `\"`, `#`, `$`, " +- "`%`, `&`, `'`, `(`, " +- "`)`,\n`*`, `+`, `,`, " +- "`-`, `.`, `/`" - " (U+0021–2F), \n`:`, " - "`;`, `<`, `=`, `>`, " - "`?`, `@`" @@ -314,7 +318,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n" -- "Normally the `>` that begins a block quote may be followed\n" +- "Normally the `>`" +- " that begins a block quote may be followed\n" - "optionally by a space, which is not considered part of the\n" - "content. 
In the following case `>`" - " is followed by a tab,\n" @@ -513,10 +518,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Valid HTML entity references and numeric character references\n" - "can be used in place of the corresponding Unicode character,\n" - "with the following exceptions:\n\n" -- "- Entity and character references are not recognized in code\n" -- " blocks and code spans.\n\n" -- "- Entity and character references cannot stand in place of\n" -- " special characters that define structural elements in\n CommonMark. For example, although " +- "- Entity and character references are not recognized in code\n " +- blocks and code spans. +- "\n\n- Entity and character references cannot stand in place of\n " +- "special characters that define structural elements in\n CommonMark. For example, although " - "`*` can be used\n in place of a literal " - "`*` character, `*` cannot replace\n `*`" - " in emphasis delimiters, bullet list markers, or thematic\n breaks.\n\n" @@ -538,10 +543,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

  & © Æ Ď\n¾ ℋ ⅆ\n" - "∲ ≧̸

\n" - "````````````````" -- "````````````````\n\n\n" -- "[Decimal numeric character\nreferences](@)\n" -- "consist of `&#` + a string of 1--7 " -- "arabic digits + `;`" +- "````````````````\n\n" +- "\n[Decimal numeric character\nreferences](@)\nconsist of " +- "`&#` + a string of 1--7 arabic digits " +- "+ `;`" - ". A\nnumeric character reference is parsed as the corresponding\n" - "Unicode character. " - "Invalid Unicode code points will be replaced by\n" @@ -555,11 +560,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "# Ӓ Ϡ &#" - "0;\n.\n

# Ӓ Ϡ �

\n" - "````````````````" -- "````````````````\n\n\n" -- "[Hexadecimal numeric character\n" -- "references](@) consist of `&#` +\neither `X`" -- " or `x` + a string of 1-6 hexadecimal digits " -- "+ `;`" +- "````````````````\n\n" +- "\n[Hexadecimal numeric character\nreferences](@) consist of " +- "`&#` +\neither `X` or `x`" +- " + a string of 1-6 hexadecimal digits + `;`" - ".\nThey too are parsed as the corresponding Unicode character (this\n" - time specified with a hexadecimal numeral instead of decimal) - ".\n\n" @@ -707,8 +711,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "# Blocks and inlines\n\n" -- "We can think of a document as a sequence of\n" -- "[blocks](@)" +- "We can think of a document as a sequence of\n[blocks](@)" - "---structural elements like paragraphs, block\n" - "quotations, lists, headings, rules, and code blocks. " - "Some blocks (like\n" @@ -717,8 +720,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "--text,\n" - "links, emphasized text, images, code spans, and so on.\n\n" - "## Precedence\n\n" -- "Indicators of block structure always take precedence over indicators\nof inline structure. " -- "So, for example, the following is a list with\n" +- "Indicators of block structure always take precedence over indicators\n" +- "of inline structure. 
So, for example, the following is a list with\n" - "two items, not a list with one item containing a code span:\n\n" - "````````````````" - "```````````````` " @@ -917,8 +920,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## ATX headings\n\n" -- "An [ATX heading](@)\n" -- "consists of a string of characters, parsed as inline content, between an\n" +- "An [ATX heading](@)" +- "\nconsists of a string of characters, parsed as inline content, between an\n" - "opening sequence of 1--6 unescaped `#` characters and an " - "optional\nclosing sequence of any number of unescaped `#`" - " characters.\nThe opening sequence of `#`" @@ -1055,7 +1058,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

foo

\n" - "````````````````" - "````````````````\n\n\n" -- "A sequence of `#` characters with anything but spaces or tabs following it\n" +- "A sequence of `#`" +- " characters with anything but spaces or tabs following it\n" - "is not a closing sequence, but counts as part of the contents of the\n" - "heading:\n\n" - "````````````````" @@ -1074,8 +1078,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

foo#

\n" - "````````````````" - "````````````````\n\n\n" -- "Backslash-escaped `#` characters do not count as part\n" -- "of the closing sequence:\n\n" +- "Backslash-escaped `#`" +- " characters do not count as part\nof the closing sequence:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -1117,7 +1121,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## Setext headings\n\n" -- "A [setext heading](@) consists of one or more\n" +- "A [setext heading](@)" +- " consists of one or more\n" - "lines of text, not interrupted by a blank line, of which the first line " - "does not\nhave more than 3 spaces of indentation, followed by\na [" - "setext heading underline]. The lines of text must be such\n" @@ -1130,10 +1135,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A [setext heading underline](@) is a sequence of\n" - "`=` characters or a sequence of `-` characters, with no more " - "than 3\n" -- "spaces of indentation and any number of trailing spaces or tabs.\n\n" -- "The heading is a level 1 heading if `=` characters are used in\n" -- "the [setext heading underline], and a level 2 heading if " -- "`-`" +- spaces of indentation and any number of trailing spaces or tabs. +- "\n\nThe heading is a level 1 heading if `=`" +- " characters are used in\nthe [setext heading underline]" +- ", and a level 2 heading if `-`" - "\ncharacters are used. The contents of the heading are the result\n" - "of parsing the preceding lines of text as CommonMark inline\ncontent.\n\n" - "In general, a setext heading need not be preceded or followed by a\n" @@ -1267,8 +1272,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

\n" - "````````````````" - "````````````````\n\n\n" -- "The setext heading underline cannot be a [lazy continuation\n" -- "line] in a list item or block quote:\n\n" +- "The setext heading underline cannot be a [lazy continuation\nline]" +- " in a list item or block quote:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -1364,7 +1369,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

> foo

\n" - "````````````````" - "````````````````\n\n\n" -- "**Compatibility note:** Most existing Markdown implementations\n" +- "**Compatibility note:**" +- " Most existing Markdown implementations\n" - "do not allow the text of setext headings to span multiple lines.\n" - "But there is no consensus about how to interpret\n\n" - "``` markdown\nFoo\nbar\n---\nbaz\n```" @@ -1543,15 +1549,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "## Fenced code blocks\n\n" -- "A [code fence](@) is a sequence\n" -- "of at least three consecutive backtick characters (`` ` ``" -- ") or\ntildes (`~`" +- "A [code fence](@)" +- " is a sequence\nof at least three consecutive backtick characters (" +- "`` ` ``) or\ntildes (`~`" - "). (Tildes and backticks cannot be mixed.)\nA " - "[fenced code block](@)" - "\n" - "begins with a code fence, preceded by up to three spaces of indentation" -- ".\n\n" -- "The line with the opening code fence may optionally contain some text\n" +- ".\n\nThe line with the opening code fence may optionally contain some text\n" - "following the code fence; this is trimmed of leading and trailing\n" - "spaces or tabs and called the [info string](@)" - ". If the [info string] comes\n" @@ -1559,8 +1564,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "characters. (The reason for this restriction is that otherwise\n" - "some inline code would be incorrectly interpreted as the\n" - "beginning of a fenced code block.)\n\n" -- "The content of the code block consists of all subsequent lines, until\n" -- "a closing [code fence] of the same type as the code block\n" +- "The content of the code block consists of all subsequent lines, until\na closing [" +- "code fence] of the same type as the code block\n" - "began with (backticks or tildes), and with at least as " - "many backticks\n" - "or tildes as the opening code fence. 
If the leading code fence is\n" @@ -1583,8 +1588,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "much less efficient, and there seems to be no real downside to the\n" - "behavior described here.)\n\n" - "A fenced code block may interrupt a paragraph, and does not require\n" -- "a blank line either before or after.\n\n" -- "The content of a code fence is treated as literal text, not parsed\n" +- a blank line either before or after. +- "\n\nThe content of a code fence is treated as literal text, not parsed\n" - "as inlines. The first word of the [info string]" - " is typically used to\nspecify the language of the code sample, and rendered in the " - "`class`\nattribute of the `code`" @@ -1868,17 +1873,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "An [HTML block](@) is a group of lines that " - "is treated\n" - as raw HTML (and will not be escaped in HTML output -- ").\n\n" -- "There are seven kinds of [HTML block], which can be defined " -- "by their\nstart and end conditions. The block begins with a line that meets a\n" +- ").\n\nThere are seven kinds of [HTML block]" +- ", which can be defined by their\n" +- "start and end conditions. The block begins with a line that meets a\n" - "[start condition](@) (after up to three optional spaces of " - "indentation).\nIt ends with the first subsequent line that meets a matching\n" - "[end condition](@), or the last line of the document, " - "or the last line of\nthe [container block](#container-blocks)" - " containing the current HTML\nblock, if no line is encountered that meets the " - "[end condition]. If\nthe first line meets both the [start condition]" -- " and the [end\ncondition], the block will contain just that line.\n\n" -- "1. " +- " and the [end\ncondition], the block will contain just that line." +- "\n\n1. " - "**Start condition:** line begins with the string ``.\n\n" - "4. " -- "**Start condition:** line begins with the string ``.\n\n" -- "5. 
**Start condition:** line begins with the string\n" +- "5. " +- "**Start condition:** line begins with the string\n" - "``" - ".\n\n" @@ -1933,15 +1939,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "**End condition:** line is followed by a [blank line]" - ".\n\n" - "7. " -- "**Start condition:** line begins with a complete [open tag]\n" -- "(with any [tag name] other than `pre`, `script`" -- ",\n`style`, or `textarea`" +- "**Start condition:**" +- " line begins with a complete [open tag]\n(with any [tag name]" +- " other than `pre`, `script`,\n`style`, or " +- "`textarea`" - ") or a complete [closing tag],\n" - "followed by zero or more spaces and tabs, followed by the end of the " - "line.\\\n**End condition:**" -- " line is followed by a [blank line].\n\n" -- "HTML blocks continue until they are closed by their appropriate\n" -- "[end condition], or the last line of the document or other " +- " line is followed by a [blank line]." +- "\n\nHTML blocks continue until they are closed by their appropriate\n[end condition]" +- ", or the last line of the document or other " - "[container\nblock](#container-blocks)" - ". This means any HTML " - "**within an HTML\nblock**" @@ -2150,8 +2157,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo*\n\n" - "````````````````" - "````````````````\n\n\n" -- "In this case, we get a raw HTML block that just includes\n" -- "the ``" +- "In this case, we get a raw HTML block that just includes\nthe " +- "``" - " tag (because it ends with the following blank\n" - "line). So the contents get interpreted as CommonMark:\n\n" - "````````````````" @@ -2162,8 +2169,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n" -- "Finally, in this case, the `` tags are interpreted\n" -- "as [raw HTML] *inside*" +- "Finally, in this case, the ``" +- " tags are interpreted\nas [raw HTML] *inside*" - " the CommonMark paragraph. 
(Because\n" - "the tag is not on a line by itself, we get inline " - "HTML\nrather than an [HTML block].)\n\n" @@ -2175,10 +2182,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "/del>

\n" - "````````````````" - "````````````````\n\n\n" -- "HTML tags designed to contain literal content\n" -- "(`pre`, `script`, `style`, `" -- "textarea`), comments, processing instructions,\n" -- "and declarations are treated somewhat differently.\n" +- "HTML tags designed to contain literal content\n(`pre`, " +- "`script`, `style`, `textarea`), comments" +- ", processing instructions,\nand declarations are treated somewhat differently.\n" - "Instead of ending at the first blank line, these blocks\n" - "end at the first line containing a corresponding end tag.\n" - "As a result, these blocks can contain blank lines:\n\n" @@ -2283,8 +2289,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

baz

\n" - "````````````````" - "````````````````\n\n\n" -- "Note that anything on the last line after the\n" -- "end tag will be included in the [HTML block]:\n\n" +- "Note that anything on the last line after the\nend tag will be included in the " +- "[HTML block]:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -2386,19 +2392,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "This rule differs from John Gruber's original Markdown syntax\n" -- "specification, which says:\n\n" -- "> The only restrictions are that block-level HTML elements —\n" -- "> e.g. `
`, `" -- "`, `
`, `

`, etc. " -- "— must be separated from\n> " +- "specification, which says:" +- "\n\n> The only restrictions are that block-level HTML elements —\n> " +- "e.g. `

`, `
`" +- ", `
`, `

`, etc. — " +- "must be separated from\n> " - "surrounding content by blank lines, and the start and end tags of the\n> " - "block should not be indented with spaces or tabs.\n\n" - "In some ways Gruber's rule is more restrictive than the one " - "given\nhere:\n\n" -- "- It requires that an HTML block be preceded by a blank line.\n" -- "- It does not allow the start tag to be indented.\n" -- "- It requires a matching end tag, which it also does not allow to\n" -- " be indented.\n\n" +- "- It requires that an HTML block be preceded by a blank line." +- "\n- It does not allow the start tag to be indented.\n" +- "- It requires a matching end tag, which it also does not allow to\n " +- "be indented.\n\n" - "Most Markdown implementations (including some of Gruber's own) " - "do not\nrespect all of these restrictions.\n\n" - "There is one respect, however, in which Gruber's rule is " @@ -2434,8 +2440,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`markdown=1`" - ". The rule given above seems a simpler and\n" - "more elegant way of achieving the same expressive power, which is also\n" -- "much simpler to parse.\n\n" -- "The main potential drawback is that one can no longer paste HTML\n" +- much simpler to parse. +- "\n\nThe main potential drawback is that one can no longer paste HTML\n" - "blocks into Markdown documents with 100% reliability. However,\n" - "*in most cases*" - " this will work fine, because the blank lines in\n" @@ -2465,17 +2471,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

\n" - "````````````````" - "````````````````\n\n\n" -- "Fortunately, blank lines are usually not necessary and can be\ndeleted. " -- "The exception is inside `
`"
+- "Fortunately, blank lines are usually not necessary and can be\n"
+- "deleted.  The exception is inside `
`"
 - " tags, but as described\n[above][HTML blocks]"
 - ", raw HTML blocks starting with `
`\n*can*"
 - " contain blank lines.\n\n"
 - "## Link reference definitions\n\n"
-- "A [link reference definition](@)\n"
-- "consists of a [link label], optionally preceded by up to three spaces "
-- "of\nindentation, followed\nby a colon (`:`"
-- "), optional spaces or tabs (including up to one\n[line ending]"
-- "), a [link destination],\n"
+- "A [link reference definition](@)"
+- "\nconsists of a [link label]"
+- ", optionally preceded by up to three spaces of\nindentation, followed\n"
+- "by a colon (`:`), optional spaces or tabs ("
+- "including up to one\n[line ending]), a [link destination],\n"
 - "optional spaces or tabs (including up to one\n[line ending]"
 - "), and an optional [link\ntitle]"
 - ", which if it is present must be separated\nfrom the [link destination]"
@@ -2868,10 +2874,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md
 - " are meta-containers for [list items].\n\n"
 - "We define the syntax for container blocks recursively.  The general\n"
 - "form of the definition is:\n\n"
-- "> If X is a sequence of blocks, then the result of\n"
-- "> transforming X in such-and-such a way is a container of type "
-- "Y\n> with these blocks as its content.\n\n"
-- "So, we explain what counts as a block quote or list item by explaining\n"
+- "> If X is a sequence of blocks, then the result of\n> "
+- transforming X in such-and-such a way is a container of type Y
+- "\n> with these blocks as its content."
+- "\n\nSo, we explain what counts as a block quote or list item by explaining\n"
 - how these can be *generated*
 - " from their contents. This should suffice\n"
 - "to define the syntax, although it does not give a recipe for *parsing"
@@ -2879,22 +2885,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md
 - "[A parsing strategy](#appendix-a-parsing"
 - "-strategy).)\n\n"
 - "## Block quotes\n\n"
-- "A [block quote marker](@),\n"
-- "optionally preceded by up to three spaces of indentation,\n"
+- "A [block quote marker](@)"
+- ",\noptionally preceded by up to three spaces of indentation,\n"
 - "consists of (a) the character `>`"
 - " together with a following space of\n"
 - "indentation, or (b) a single character `>` not followed "
 - "by a space of\nindentation.\n\n"
 - "The following rules define [block quotes]:\n\n"
-- 1.  **Basic case.
-- "**  If a string of lines *Ls* constitute a sequence\n    of blocks "
-- "*Bs*"
+- "1.  "
+- "**Basic case.**  If a string of lines *Ls*"
+- " constitute a sequence\n    of blocks *Bs*"
 - ", then the result of prepending a [block quote\n    marker]"
 - " to the beginning of each line in *Ls*\n    is a "
 - "[block quote](#block-quotes) containing *Bs*.\n\n"
-- 2.  **Laziness.
-- "**  If a string of lines *Ls* constitute a "
-- "[block\n    quote](#block-quotes) with contents *Bs*"
+- "2.  "
+- "**Laziness.**  If a string of lines "
+- "*Ls* constitute a [block\n    quote](#block-quotes)"
+- " with contents *Bs*"
 - ", then the result of deleting\n    the initial [block quote marker]"
 - " from one or\n    "
 - "more lines in which the next character other than a space or tab after the\n    "
@@ -2904,9 +2911,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md
 - " is text\n    "
 - "that will be parsed as part of the content of a paragraph, but does\n    "
 - "not occur at the beginning of the paragraph.\n\n"
-- 3.  **Consecutiveness.
-- "**  A document cannot contain two [block\n    quotes]"
-- " in a row unless there is a [blank line] between them.\n\n"
+- "3.  "
+- "**Consecutiveness.**"
+- "  A document cannot contain two [block\n    quotes] in a row unless there is a "
+- "[blank line] between them.\n\n"
 - "Nothing else counts as a [block quote](#block-quotes).\n"
 - "\nHere is a simple example:\n"
 - "\n"
@@ -2949,8 +2957,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
 - "
\n" - "````````````````" - "````````````````\n\n\n" -- "The Laziness clause allows us to omit the `>` before\n" -- "[paragraph continuation text]:\n\n" +- "The Laziness clause allows us to omit the `>`" +- " before\n[paragraph continuation text]:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -3028,9 +3036,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````\n\n\n" - "To see why, note that in\n" - "\n```markdown\n> foo\n> - bar\n```" -- "\n\n" -- "the `- bar` is indented too far to start a list, " -- "and can't\n" +- "\n\nthe `- bar`" +- " is indented too far to start a list, and can't\n" - "be an indented code block because indented code blocks cannot\n" - "interrupt paragraphs, so it is [paragraph continuation text].\n\n" - "A block quote can be empty:\n" @@ -3067,9 +3074,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n" -- "(Most current Markdown implementations, including John Gruber's\n" -- "original `Markdown.pl`, will parse this example as a " -- "single block quote\n" +- "(Most current Markdown implementations, including John Gruber's\noriginal " +- "`Markdown.pl`, will parse this example as a single " +- "block quote\n" - "with two paragraphs. But it seems better to allow the author to decide\n" - "whether two block quotes or one are wanted.)\n\n" - "Consecutiveness means that if we put these block quotes together,\n" @@ -3138,8 +3145,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

baz

\n" - "````````````````" - "````````````````\n\n\n" -- "It is a consequence of the Laziness rule that any number\n" -- "of initial `>`" +- "It is a consequence of the Laziness rule that any number\nof initial " +- "`>`" - "s may be omitted on a continuation line of a\nnested block quote:\n\n" - "````````````````" - "```````````````` " @@ -3161,8 +3168,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n" - "````````````````" - "````````````````\n\n\n" -- "When including an indented code block in a block quote,\n" -- "remember that the [block quote marker] includes\nboth the `>`" +- "When including an indented code block in a block quote,\nremember that the " +- "[block quote marker] includes\nboth the `>`" - " and a following space of indentation. So *five spaces*" - " are needed\nafter the `>`:\n\n" - "````````````````" @@ -3175,22 +3182,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "## List items\n\n" -- "A [list marker](@) is a\n" -- "[bullet list marker] or an [ordered list marker].\n\n" -- "A [bullet list marker](@)\n" -- "is a `-`, `+`, or `*` character.\n\n" -- "An [ordered list marker](@)\n" -- "is a sequence of 1--9 arabic digits (`0-" -- "9`), followed by either a\n`.` character or a " -- "`)`" +- "A [list marker](@)" +- " is a\n[bullet list marker] or an [ordered list marker].\n\n" +- "A [bullet list marker](@)\nis a `-`, " +- "`+`, or `*` character.\n\n" +- "An [ordered list marker](@)" +- "\nis a sequence of 1--9 arabic digits (" +- "`0-9`), followed by either a\n`.`" +- " character or a `)`" - " character. (The reason for the length\n" - "limit is that with 10 digits we start seeing integer overflows\n" - "in some browsers.)\n\nThe following rules define [list items]:\n\n" -- 1. **Basic case. 
-- "** If a sequence of lines *Ls* constitute a sequence of\n blocks " -- "*Bs* starting with a character other than a space or tab, " -- "and *M* is\n a list marker of width *W*" -- " followed by 1 ≤ *N*" +- "1. " +- "**Basic case.** If a sequence of lines *Ls*" +- " constitute a sequence of\n blocks *Bs*" +- " starting with a character other than a space or tab, and *M* " +- "is\n a list marker of width *W* followed by 1 ≤ *N*" - " ≤ 4 spaces of indentation,\n then the result of prepending " - "*M* and the following spaces to the first line\n of *Ls*" - ", and indenting subsequent lines of *Ls* by *W + " @@ -3199,14 +3206,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "(bullet or ordered) is determined by the type of its list marker.\n " - "If the list item is ordered, then it is also assigned a start\n " - "number, based on the ordered list marker.\n\n Exceptions:\n\n " -- "1. When the first list item in a [list] interrupts\n" -- " a paragraph---that is, when it starts on a line that would\n " +- "1. When the first list item in a [list] interrupts\n a paragraph" +- "---that is, when it starts on a line that would\n " - "otherwise count as [paragraph continuation text]---then (a)\n " - "the lines *Ls* must not begin with a blank line, and (" - "b) if\n the list item is ordered, the start number must be 1.\n " -- "2. " -- "If any line is a [thematic break][thematic breaks] then\n " -- "that line is not a list item.\n\n" +- "2. If any line is a [thematic break][thematic breaks" +- "] then\n that line is not a list item.\n\n" - "For example, let *Ls* be the lines\n" - "\n" - "````````````````" @@ -3297,11 +3303,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n" -- "Here `two` occurs in the same column as the list marker `1." 
-- "`,\nbut is actually contained in the list item, because there is\n" -- "sufficient indentation after the last containing blockquote marker.\n\n" -- "The converse is also possible. " -- "In the following example, the word `two`" +- "Here `two` occurs in the same column as the list marker " +- "`1.`" +- ",\nbut is actually contained in the list item, because there is\n" +- sufficient indentation after the last containing blockquote marker. +- "\n\nThe converse is also possible. In the following example, the word " +- "`two`" - "\noccurs far to the right of the initial text of the list item, " - "`one`" - ", but\n" @@ -3403,9 +3410,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "-1. not ok\n.\n" - "

-1. not ok

\n" - "````````````````" -- "````````````````\n\n\n\n" -- 2. **Item starting with indented code. -- "** If a sequence of lines *Ls*\n constitute a sequence of blocks " +- "````````````````\n\n" +- "\n\n2. **Item starting with indented code.**" +- " If a sequence of lines *Ls*\n constitute a sequence of blocks " - "*Bs* starting with an indented code\n block, and " - "*M* is a list marker of width *W*" - " followed by\n one space of indentation, then the result of prepending " @@ -3511,9 +3518,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

bar

\n\n" - "\n" - "````````````````" -- "````````````````\n\n\n" -- 3. **Item starting with a blank line. -- "** If a sequence of lines *Ls*" +- "````````````````\n\n" +- "\n3. **Item starting with a blank line.**" +- " If a sequence of lines *Ls*" - "\n starting with a single [blank line] constitute a (possibly empty)\n " - "sequence of blocks *Bs*, and *M* is a list marker " - "of width *W*,\n then the result of prepending *M*" @@ -3609,9 +3616,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo\n*\n\nfoo\n1.\n.\n

foo\n" - "*

\n

foo\n1.

\n" - "````````````````" -- "````````````````\n\n\n" -- 4. **Indentation. -- "** If a sequence of lines *Ls*" +- "````````````````\n\n" +- "\n4. **Indentation.** If a sequence of lines " +- "*Ls*" - " constitutes a list item\n " - "according to rule #1, #2, or #3, then the result " - "of preceding each line\n of *Ls*" @@ -3673,10 +3680,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " with two lines.\n\n indented code\n\n > A block quote.\n" - "
\n" - "````````````````" -- "````````````````\n\n\n\n" -- 5. **Laziness. -- "** If a string of lines *Ls* constitute a " -- "[list\n item](#list-items) with contents *Bs*" +- "````````````````\n\n" +- "\n\n5. **Laziness.** If a string of lines " +- "*Ls* constitute a [list\n item](#list-items)" +- " with contents *Bs*" - ", then the result of deleting\n " - "some or all of the indentation from one or more lines in which the\n " - "next character other than a space or tab after the indentation is\n [" @@ -3731,14 +3738,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n" - "\n" - "````````````````" -- "````````````````\n\n\n\n" -- "6. **That's all." -- "** Nothing that is not counted as a list item by rules\n #1" -- "--5 counts as a [list item](#list-items)" -- ".\n\n" -- "The rules for sublists follow from the general rules\n" -- "[above][List items]" -- ". A sublist must be indented the same number\n" +- "````````````````\n\n" +- "\n\n6. **That's all.**" +- " Nothing that is not counted as a list item by rules\n #1--" +- "5 counts as a [list item](#list-items).\n\n" +- "The rules for sublists follow from the general rules\n[above][List items" +- "]. A sublist must be indented the same number\n" - "of spaces of indentation a paragraph would need to be in order to be " - "included\nin the list item.\n\n" - "So, in this case we need two spaces indent:\n" @@ -3823,28 +3828,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "### Motivation\n\n" - "John Gruber's Markdown spec says the following about list items" - ":\n\n" -- "1. " -- "\"List markers typically start at the left margin, but may be indented\n" -- " by up to three spaces. List markers must be followed by one or more\n " +- "1. \"" +- "List markers typically start at the left margin, but may be indented\n " +- "by up to three spaces. List markers must be followed by one or more\n " - "spaces or a tab.\"\n\n" -- "2. 
" -- "\"To make lists look nice, you can wrap items with hanging indents" -- "....\n But if you don't want to, you don'" +- "2. \"" +- "To make lists look nice, you can wrap items with hanging indents." +- "...\n But if you don't want to, you don'" - "t have to.\"\n\n" -- "3. \"List items may consist of multiple paragraphs. Each subsequent\n" -- " paragraph in a list item must be indented by either 4 spaces or one\n " +- "3. \"List items may consist of multiple paragraphs. Each subsequent\n " +- "paragraph in a list item must be indented by either 4 spaces or one\n " - "tab.\"\n\n" -- "4. " -- "\"It looks nice if you indent every line of the subsequent paragraphs,\n" -- " but here again, Markdown will allow you to be lazy.\"\n\n" -- "5. " -- "\"To put a blockquote within a list item, the " +- "4. \"" +- "It looks nice if you indent every line of the subsequent paragraphs,\n " +- "but here again, Markdown will allow you to be lazy.\"" +- "\n\n5. \"" +- "To put a blockquote within a list item, the " - "blockquote's `>`" - "\n delimiters need to be indented.\"\n\n" -- "6. " -- "\"To put a code block within a list item, the code block needs to " -- "be\n indented twice — 8 spaces or two tabs.\"\n\n" -- "These rules specify that a paragraph under a list item must be indented\n" +- "6. \"" +- "To put a code block within a list item, the code block needs to be\n " +- "indented twice — 8 spaces or two tabs.\"" +- "\n\nThese rules specify that a paragraph under a list item must be indented\n" - "four spaces (presumably, from the left margin, rather than the start of\n" - "the list marker, but this is not said), and that code under a " - "list item\n" @@ -3856,8 +3861,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " block elements under a list item, including other\n" - "lists, must be indented four spaces. 
This principle has been called the\n" - "*four-space rule*.\n\n" -- "The four-space rule is clear and principled, and if the reference\n" -- "implementation `Markdown.pl`" +- "The four-space rule is clear and principled, and if the reference\nimplementation " +- "`Markdown.pl`" - " had followed it, it probably would have\nbecome the standard. However, " - "`Markdown.pl`" - " allowed paragraphs and\n" @@ -3872,8 +3877,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "space\n" - "rule, while discount, redcarpet, marked, PHP Markdown, " - "and others\nfollowed `Markdown.pl`" -- "'s behavior more closely.)\n\n" -- "Unfortunately, given the divergences between implementations, there\n" +- "'s behavior more closely.)" +- "\n\nUnfortunately, given the divergences between implementations, there\n" - "is no way to give a spec for list items that will be guaranteed not\n" - "to break any existing documents. However, the spec given here should\n" - "correctly handle lists formatted with either the four-space rule or\n" @@ -3905,8 +3910,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
  • baz
  • \n\n" - "\n\n```\n\n" - "The choice of four spaces is arbitrary. It can be learned, but it is\n" -- "not likely to be guessed, and it trips up beginners regularly.\n\n" -- "Would it help to adopt a two-space rule? The problem is that such\n" +- "not likely to be guessed, and it trips up beginners regularly." +- "\n\nWould it help to adopt a two-space rule? The problem is that such\n" - "a rule, together with the rule allowing up to three spaces of indentation " - "for\nthe initial list marker, allows text that is indented " - "*less than*" @@ -3933,19 +3938,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`bar`\nis not indented as far as the first paragraph " - "`foo`:\n\n" - "``` markdown\n 10. foo\n\n bar \n```" -- "\n\n" -- "Arguably this text does read like a list item with `bar` " -- "as a subparagraph,\n" +- "\n\nArguably this text does read like a list item with `bar`" +- " as a subparagraph,\n" - "which may count in favor of the proposal. " - "However, on this proposal indented\n" - "code would have to be indented six spaces after the list marker. " - "And this\nwould break a lot of existing Markdown, which has the pattern:\n\n" - "``` markdown\n1. foo\n\n indented code\n```" -- "\n\nwhere the code is indented eight spaces. " +- "\n\n" +- "where the code is indented eight spaces. " - "The spec above, by contrast, will\n" - "parse this text as expected, since the code block's indentation " - "is measured\nfrom the beginning of `foo`.\n\n" -- "The one case that needs special treatment is a list item that *starts*\n" +- The one case that needs special treatment is a list item that *starts* +- "\n" - "with indented code. 
" - "How much indentation is required in that case, since\nwe don'" - "t have a \"first paragraph\"" @@ -3956,25 +3962,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "four-space rule in cases where the list marker plus its initial indentation\n" - "takes four spaces (a common case), but diverge in other cases.\n\n" - "## Lists\n\n" -- "A [list](@) is a sequence of one or more\n" -- "list items [of the same type]. The list items\n" -- "may be separated by any number of blank lines.\n\n" -- "Two list items are [of the same type](@)\n" -- "if they begin with a [list marker] of the same type.\n" +- "A [list](@)" +- " is a sequence of one or more\nlist items [of the same type]" +- ". The list items\nmay be separated by any number of blank lines." +- "\n\nTwo list items are [of the same type](@)" +- "\nif they begin with a [list marker] of the same type.\n" - "Two list markers are of the\n" - "same type if (a) they are bullet list markers using the same character\n(" - "`-`, `+`, or `*`) or (b" - ") they are ordered list numbers with the same\ndelimiter (either " - "`.` or `)`).\n\n" -- "A list is an [ordered list](@)\n" -- "if its constituent list items begin with\n[ordered list markers], and a\n" +- "A list is an [ordered list](@)" +- "\nif its constituent list items begin with\n[ordered list markers], and a\n" - "[bullet list](@)" - " if its constituent list\nitems begin with [bullet list markers].\n\n" -- "The [start number](@)\n" -- "of an [ordered list] is determined by the list number of\n" +- "The [start number](@)" +- "\nof an [ordered list] is determined by the list number of\n" - "its initial list item. The numbers of subsequent list items are\n" - "disregarded.\n\n" -- "A list is [loose](@) if any of its constituent\n" +- "A list is [loose](@)" +- " if any of its constituent\n" - "list items are separated by blank lines, or if any of its constituent\n" - "list items directly contain two block-level elements with a blank line\n" - "between them. 
Otherwise a list is [tight](@)" @@ -4027,8 +4034,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "to start lists without blank lines:\n\n" - "``` markdown\nI need to buy\n- new shoes\n- a coat\n" - "- a plane ticket\n```\n\nSecond, we are attracted to a\n\n" -- "> [principle of uniformity](@):\n" -- "> if a chunk of text has a certain\n> " +- "> " +- "[principle of uniformity](@)" +- ":\n> if a chunk of text has a certain\n> " - "meaning, it will continue to have the same meaning when put into a\n> " - "container block (such as a list item or blockquote).\n\n" - "(Indeed, the spec for [list items] and [block quotes] " @@ -4048,8 +4056,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[reStructuredText](https://" - "docutils.sourceforge.net/rst.html)\n" - "takes a different approach, requiring blank lines before lists\n" -- "even inside other list items.)\n\n" -- "In order to solve the problem of unwanted lists in paragraphs with\n" +- even inside other list items.) +- "\n\nIn order to solve the problem of unwanted lists in paragraphs with\n" - "hard-wrapped numerals, we allow only lists starting with `1` " - "to\ninterrupt paragraphs. Thus,\n\n" - "````````````````" @@ -4350,16 +4358,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`hi` is parsed as code, leaving the backtick at the end " - "as a literal\nbacktick.\n\n\n\n" - "## Code spans\n\n" -- "A [backtick string](@)\n" -- "is a string of one or more backtick characters (`` ` ``" +- "A [backtick string](@)" +- "\nis a string of one or more backtick characters (`` ` ``" - ") that is neither\npreceded nor followed by a backtick.\n\n" - "A [code span](@) begins with a backtick string and ends " - "with\na backtick string of equal length. 
The contents of the code span are\n" - "the characters between these two backtick strings, normalized in the\nfollowing ways:\n\n" - "- First, [line endings] are converted to [spaces].\n" -- "- If the resulting string both begins *and* ends with a [space]\n" -- " character, but does not consist entirely of [space]\n characters, a single [" -- "space] character is removed from the\n " +- "- If the resulting string both begins *and*" +- " ends with a [space]\n character, but does not consist entirely of [space" +- "]\n characters, a single [space] character is removed from the\n " - "front and back. This allows you to include code that begins\n " - "or ends with backtick characters, which must be separated by\n " - "whitespace from the opening or closing backtick strings.\n\n" @@ -4454,8 +4462,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "/p>\n" - "````````````````" - "````````````````\n\n" -- "Note that browsers will typically collapse consecutive spaces\n" -- "when rendering ``" +- "Note that browsers will typically collapse consecutive spaces\nwhen rendering ``" - " elements, so it is recommended that\nthe following CSS be used:\n\n " - "code{white-space: pre-wrap;}\n" - "\n\nNote that backslash escapes do not work in code spans. 
All backslashes\n" @@ -4468,9 +4475,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "/p>\n" - "````````````````" - "````````````````\n\n\n" -- "Backslash escapes are never needed, because one can always choose a\n" -- "string of *n* backtick characters as delimiters, where the " -- "code does\nnot contain any strings of exactly *n* backtick characters.\n\n" +- "Backslash escapes are never needed, because one can always choose a\nstring of " +- "*n*" +- " backtick characters as delimiters, where the code does\n" +- "not contain any strings of exactly *n* backtick characters.\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4580,17 +4588,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## Emphasis and strong emphasis\n\n" -- "John Gruber's original [Markdown syntax\n" -- "description](https://daringfireball.net/projects/" -- "markdown/syntax#em) says:\n\n" +- "John Gruber's original " +- "[Markdown syntax\ndescription" +- "](https://daringfireball.net/projects/markdown" +- "/syntax#em) says:\n\n" - "> Markdown treats asterisks (`*`) and " - "underscores (`_`" - ") as indicators of\n> emphasis. Text wrapped with one `*` or " - "`_` will be wrapped with an HTML\n> " - "`` tag; double `*`'s or `_" - "`'s will be wrapped with an HTML ``" -- "\n> tag.\n\n" -- "This is enough for most users, but these rules leave much undecided,\n" +- "\n> tag." +- "\n\nThis is enough for most users, but these rules leave much undecided,\n" - "especially when it comes to nested emphasis. 
The original\n" - "`Markdown.pl` test suite makes it clear that triple `*" - "**` and\n`___`" @@ -4606,34 +4615,32 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "entries):\n\n" - "``` markdown\n*emph *with emph* in it*\n" - "**strong **with strong** in it**\n```\n\n" -- "Many implementations have also restricted intraword emphasis to\n" -- "the `*`" +- "Many implementations have also restricted intraword emphasis to\nthe `*`" - " forms, to avoid unwanted emphasis in words containing\n" - "internal underscores. (It is best practice to put these in code\n" - "spans, but users often do not.)\n\n" - "``` markdown\ninternal emphasis: foo*bar*baz\n" - "no emphasis: foo_bar_baz\n```\n\n" - "The rules given below capture all of these patterns, while allowing\n" -- "for efficient parsing strategies that do not backtrack.\n\n" -- "First, some definitions. " -- "A [delimiter run](@)" +- for efficient parsing strategies that do not backtrack. +- "\n\nFirst, some definitions. A [delimiter run](@)" - " is either\na sequence of one or more `*`" - " characters that is not preceded or\nfollowed by a non-backslash-escaped " - "`*` character, or a sequence\nof one or more `_`" - " characters that is not preceded or followed by\na non-backslash-escaped " - "`_` character.\n\n" -- "A [left-flanking delimiter run](@) is\n" -- "a [delimiter run] that is (1) not followed by [" -- "Unicode whitespace],\n" +- "A [left-flanking delimiter run](@)" +- " is\na [delimiter run] that is (1) not followed by " +- "[Unicode whitespace],\n" - "and either (2a) not followed by a [Unicode " - "punctuation character], or\n(2b) followed by a [" - "Unicode punctuation character] and\npreceded by [" - "Unicode whitespace] or a [Unicode punctuation " - "character].\nFor purposes of this definition, the beginning and the end of\n" - "the line count as Unicode whitespace.\n\n" -- "A [right-flanking delimiter run](@) is\n" -- "a [delimiter run] that is (1) not preceded by [" -- "Unicode 
whitespace],\n" +- "A [right-flanking delimiter run](@)" +- " is\na [delimiter run] that is (1) not preceded by " +- "[Unicode whitespace],\n" - "and either (2a) not preceded by a [Unicode " - "punctuation character], or\n(2b) preceded by a [" - "Unicode punctuation character] and\nfollowed by [" @@ -4664,58 +4671,66 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " and its rules for distinguishing left- and right-flanking runs\n" - "are a bit more complex than the ones given here.)\n\n" - "The following rules define emphasis and strong emphasis:\n\n" -- "1. A single `*` character [can open emphasis](@)\n" -- " iff (if and only if) it is part of a [left-" -- "flanking delimiter run].\n\n" -- "2. A single `_` character [can open emphasis] iff\n" -- " it is part of a [left-flanking delimiter run]\n " +- "1. " +- "A single `*` character [can open emphasis](@)" +- "\n iff (if and only if) it is part of a [" +- "left-flanking delimiter run].\n\n" +- "2. " +- "A single `_`" +- " character [can open emphasis] iff\n it is part of a [" +- "left-flanking delimiter run]\n " - "and either (a) not part of a [right-flanking " - "delimiter run]\n or (b) part of a [" - "right-flanking delimiter run]\n preceded by a [" - "Unicode punctuation character].\n\n" -- "3. A single `*` character [can close emphasis](@)\n" -- " iff it is part of a [right-flanking delimiter run" +- "3. " +- "A single `*` character [can close emphasis](@)" +- "\n iff it is part of a [right-flanking delimiter run" - "].\n\n" -- "4. A single `_` character [can close emphasis] iff\n" -- " it is part of a [right-flanking delimiter run]\n " +- "4. " +- "A single `_`" +- " character [can close emphasis] iff\n it is part of a [" +- "right-flanking delimiter run]\n " - "and either (a) not part of a [left-flanking " - "delimiter run]\n or (b) part of a [" - "left-flanking delimiter run]\n followed by a [" - "Unicode punctuation character].\n\n" - "5. 
" -- "A double `**` [can open strong emphasis](@)\n" -- " iff it is part of a [left-flanking delimiter run" +- "A double `**` [can open strong emphasis](@)" +- "\n iff it is part of a [left-flanking delimiter run" - "].\n\n" -- "6. A double `__` [can open strong emphasis] iff\n" -- " it is part of a [left-flanking delimiter run]\n " +- "6. " +- "A double `__`" +- " [can open strong emphasis] iff\n it is part of a [" +- "left-flanking delimiter run]\n " - "and either (a) not part of a [right-flanking " - "delimiter run]\n or (b) part of a [" - "right-flanking delimiter run]\n preceded by a [" - "Unicode punctuation character].\n\n" - "7. " -- "A double `**` [can close strong emphasis](@)\n" -- " iff it is part of a [right-flanking delimiter run" +- "A double `**` [can close strong emphasis](@)" +- "\n iff it is part of a [right-flanking delimiter run" - "].\n\n" -- "8. A double `__` [can close strong emphasis] iff\n" -- " it is part of a [right-flanking delimiter run]\n " +- "8. " +- "A double `__`" +- " [can close strong emphasis] iff\n it is part of a [" +- "right-flanking delimiter run]\n " - "and either (a) not part of a [left-flanking " - "delimiter run]\n or (b) part of a [" - "left-flanking delimiter run]\n followed by a [" -- "Unicode punctuation character].\n\n" -- "9. " -- "Emphasis begins with a delimiter that [can open emphasis] and " -- "ends\n with a delimiter that [can close emphasis]" +- "Unicode punctuation character]." +- "\n\n9. Emphasis begins with a delimiter that [can open emphasis" +- "] and ends\n with a delimiter that [can close emphasis]" - ", and that uses the same\n character (`_` or `*`" - ") as the opening delimiter. The\n " - "opening and closing delimiters must belong to separate\n [delimiter runs" - "]. 
If one of the delimiters can both\n " - "open and close emphasis, then the sum of the lengths of the\n " - "delimiter runs containing the opening and closing delimiters\n " -- "must not be a multiple of 3 unless both lengths are\n multiples of 3.\n\n" -- "10. Strong emphasis begins with a delimiter that\n" -- " [can open strong emphasis] and ends with a delimiter that\n [" -- "can close strong emphasis], and that uses the same character\n (`_`" -- " or `*`" +- "must not be a multiple of 3 unless both lengths are\n multiples of 3." +- "\n\n10. Strong emphasis begins with a delimiter that\n [can open strong emphasis" +- "] and ends with a delimiter that\n [can close strong emphasis]" +- ", and that uses the same character\n (`_` or `*`" - ") as the opening delimiter. The\n " - "opening and closing delimiters must belong to separate\n [delimiter runs" - "]. If one of the delimiters can both open\n " @@ -4723,40 +4738,43 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the delimiter runs containing the opening and closing\n " - "delimiters must not be a multiple of 3 unless both lengths\n " - "are multiples of 3.\n\n" -- "11. A literal `*` character cannot occur at the beginning or end of\n" -- " `*`-delimited emphasis or `**`-" -- "delimited strong emphasis, unless it\n is backslash-escaped.\n\n" -- "12. A literal `_` character cannot occur at the beginning or end of\n" -- " `_`-delimited emphasis or `__`-" +- "11. " +- "A literal `*` character cannot occur at the beginning or end of\n " +- "`*`-delimited emphasis or `**`-" +- "delimited strong emphasis, unless it\n is backslash-escaped." +- "\n\n12. " +- "A literal `_` character cannot occur at the beginning or end of\n " +- "`_`-delimited emphasis or `__`-" - "delimited strong emphasis, unless it\n is backslash-escaped.\n\n" - "Where rules 1--12 above are compatible with multiple parsings,\n" -- "the following principles resolve ambiguity:\n\n" -- "13. The number of nestings should be minimized. 
" -- "Thus, for example,\n an interpretation " -- "`...` is always preferred to\n " +- "the following principles resolve ambiguity:" +- "\n\n13. " +- "The number of nestings should be minimized. Thus, for example,\n " +- "an interpretation `...` is always " +- "preferred to\n " - "`...`.\n\n" - "14. " - "An interpretation `..." - "` is always\n preferred to " - "`...`.\n\n" -- "15. When two potential emphasis or strong emphasis spans overlap,\n" -- " so that the second begins before the first ends and ends after\n " +- "strong>`." +- "\n\n15. When two potential emphasis or strong emphasis spans overlap,\n " +- "so that the second begins before the first ends and ends after\n " - "the first ends, the first takes precedence. Thus, for example,\n " - "`*foo _bar* baz_` is parsed as `" - "foo _bar baz_` rather\n " - "than `*foo bar* baz" -- "`.\n\n" -- "16. When there are two potential emphasis or strong emphasis spans\n" -- " with the same closing delimiter, the shorter one (the one that\n " +- "`." +- "\n\n16. When there are two potential emphasis or strong emphasis spans\n " +- "with the same closing delimiter, the shorter one (the one that\n " - "opens later) takes precedence. Thus, for example,\n " - "`**foo **bar baz**` is parsed " - "as `**foo bar baz" - "`\n rather than " - "`foo **bar baz`" -- ".\n\n" -- "17. " +- "." +- "\n\n17. " - "Inline code spans, links, images, and HTML tags group more " - "tightly\n than emphasis. 
So, when there is a choice between an interpretation\n " - "that contains one of these elements and one that does not, the\n " @@ -4775,9 +4793,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">\n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the opening `*` is followed by\n" -- "whitespace, and hence not part of a [left-flanking " -- "delimiter run]:\n\n" +- "This is not emphasis, because the opening `*`" +- " is followed by\nwhitespace, and hence not part of a [" +- "left-flanking delimiter run]:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4785,7 +4803,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    a * foo bar*

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the opening `*` is preceded\n" +- "This is not emphasis, because the opening `*`" +- " is preceded\n" - "by an alphanumeric and followed by punctuation, and hence\n" - "not part of a [left-flanking delimiter run]:\n\n" - "````````````````" @@ -4843,8 +4862,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">\n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the opening `_` is followed by\n" -- "whitespace:\n\n" +- "This is not emphasis, because the opening `_`" +- " is followed by\nwhitespace:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4852,8 +4871,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    _ foo bar_

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the opening `_` is preceded\n" -- "by an alphanumeric and followed by punctuation:\n\n" +- "This is not emphasis, because the opening `_`" +- " is preceded\nby an alphanumeric and followed by punctuation:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4888,7 +4907,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "тся_

    \n" - "````````````````" - "````````````````\n\n\n" -- "Here `_` does not generate emphasis, because the first delimiter run\n" +- "Here `_`" +- " does not generate emphasis, because the first delimiter run\n" - "is right-flanking and the second left-flanking:\n\n" - "````````````````" - "```````````````` " @@ -4918,8 +4938,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "_foo*\n.\n

    _foo*

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the closing `*` is preceded by\n" -- "whitespace:\n\n" +- "This is not emphasis, because the closing `*`" +- " is preceded by\nwhitespace:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4936,8 +4956,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the second `*` is\n" -- "preceded by punctuation and followed by an alphanumeric\n" +- "This is not emphasis, because the second `*`" +- " is\npreceded by punctuation and followed by an alphanumeric\n" - "(hence it is not part of a [right-flanking delimiter " - "run]:\n\n" - "````````````````" @@ -4968,8 +4988,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "Rule 4:\n" -- "\nThis is not emphasis, because the closing `_` is preceded by\n" -- "whitespace:\n\n" +- "\nThis is not emphasis, because the closing `_`" +- " is preceded by\nwhitespace:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4977,8 +4997,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    _foo bar _

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the second `_` is\n" -- "preceded by punctuation and followed by an alphanumeric:\n\n" +- "This is not emphasis, because the second `_`" +- " is\npreceded by punctuation and followed by an alphanumeric:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5053,7 +5073,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    ** foo bar**

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not strong emphasis, because the opening `**` is preceded\n" +- "This is not strong emphasis, because the opening `**`" +- " is preceded\n" - "by an alphanumeric and followed by punctuation, and hence\n" - "not part of a [left-flanking delimiter run]:\n\n" - "````````````````" @@ -5101,8 +5122,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo bar__

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not strong emphasis, because the opening `__` is preceded\n" -- "by an alphanumeric and followed by punctuation:\n\n" +- "This is not strong emphasis, because the opening `__`" +- " is preceded\nby an alphanumeric and followed by punctuation:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5168,8 +5189,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````\n\n\n" - "(Nor can it be interpreted as an emphasized `*foo bar *`" - ", because of\nRule 11.)\n\n" -- "This is not strong emphasis, because the second `**` is\n" -- "preceded by punctuation and followed by an alphanumeric:\n\n" +- "This is not strong emphasis, because the second `**`" +- " is\npreceded by punctuation and followed by an alphanumeric:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5229,8 +5250,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    __foo bar __

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not strong emphasis, because the second `__` is\n" -- "preceded by punctuation and followed by an alphanumeric:\n\n" +- "This is not strong emphasis, because the second `__`" +- " is\npreceded by punctuation and followed by an alphanumeric:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5359,8 +5380,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n``` markdown\n" - "

    foobar" - "baz\n```\n\n\n" -- "is precluded by the condition that a delimiter that\n" +- ">\n```\n\n\nis precluded by the condition that a delimiter that\n" - "can both open and close (like the `*` after `foo`" - ")\ncannot form emphasis if the sum of the lengths of\n" - "the delimiter runs containing the opening and\n" @@ -5402,8 +5422,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    \n" - "````````````````" - "````````````````\n\n\n" -- "When the lengths of the interior closing and opening\n" -- delimiter runs are *both* +- "When the lengths of the interior closing and opening\ndelimiter runs are " +- "*both*" - " multiples of 3, though,\nthey can match to create emphasis:\n\n" - "````````````````" - "```````````````` " @@ -5987,14 +6007,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[inline links] the\n" - "destination and title are given immediately after the link text. In\n[reference links]" - " the destination and title are defined elsewhere in\nthe document.\n\n" -- "A [link text](@) consists of a sequence of zero or more\n" -- "inline elements enclosed by square brackets (`[` and `]`)" -- ". The\nfollowing rules apply:\n\n" -- "- Links may not contain other links, at any level of nesting. If\n" -- " multiple otherwise valid link definitions appear nested inside each\n " -- "other, the inner-most definition is used.\n\n" -- "- Brackets are allowed in the [link text] only if (a" -- ") they\n " +- "A [link text](@)" +- " consists of a sequence of zero or more\ninline elements enclosed by square brackets (" +- "`[` and `]`). The\nfollowing rules apply:\n\n" +- "- Links may not contain other links, at any level of nesting. If\n " +- "multiple otherwise valid link definitions appear nested inside each\n " +- "other, the inner-most definition is used." +- "\n\n- Brackets are allowed in the [link text]" +- " only if (a) they\n " - "are backslash-escaped or (b) they appear as a matched pair of " - "brackets,\n with an open bracket `[`" - ", a sequence of zero or more inlines, and\n a close bracket " @@ -6004,13 +6024,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "than the brackets in link text. 
Thus, for example,\n " - "`` [foo`]` `` could not be a link text" - ", since the second `]`\n is part of a code span.\n\n" -- "- The brackets in link text bind more tightly than markers for\n" -- " [emphasis and strong emphasis]. Thus, for example, " +- "- The brackets in link text bind more tightly than markers for\n [" +- "emphasis and strong emphasis]. Thus, for example, " - "`*[foo*](url)` is a link.\n\n" - "A [link destination](@) consists of either\n\n" -- "- a sequence of zero or more characters between an opening `<` and a\n" -- " closing `>` that contains no line endings or unescaped\n `<`" -- " or `>` characters, or\n\n" +- "- " +- "a sequence of zero or more characters between an opening `<` and a\n closing " +- "`>` that contains no line endings or unescaped\n `<` or " +- "`>` characters, or\n\n" - "- a nonempty sequence of characters that does not start with `<`" - ",\n does not include [ASCII control characters][" - "ASCII control character]\n or [space]" @@ -6018,19 +6039,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "backslash-escaped or (b) they are part of a balanced pair of\n " - "unescaped parentheses.\n " - "(Implementations may impose limits on parentheses nesting to\n " -- "avoid performance issues, but at least three levels of nesting\n should be supported.)\n\n" -- "A [link title](@) consists of either\n\n" -- "- a sequence of zero or more characters between straight double-quote\n" -- " characters (`\"`), including a `\"` character only if it " -- "is\n backslash-escaped, or\n\n" -- "- a sequence of zero or more characters between straight single-quote\n" -- " characters (`'`), including a `'` character only if it " -- "is\n backslash-escaped, or\n\n" -- "- a sequence of zero or more characters between matching parentheses\n" -- " (`(...)`), including a `(` or " -- "`)` character only if it is\n backslash-escaped.\n\n" -- "Although [link titles] may span multiple lines, they may not contain\n" -- "a [blank line].\n\n" +- 
"avoid performance issues, but at least three levels of nesting\n should be supported.)" +- "\n\nA [link title](@) consists of either\n\n" +- "- " +- "a sequence of zero or more characters between straight double-quote\n characters (" +- "`\"`), including a `\"`" +- " character only if it is\n backslash-escaped, or" +- "\n\n- " +- "a sequence of zero or more characters between straight single-quote\n characters (" +- "`'`), including a `'`" +- " character only if it is\n backslash-escaped, or" +- "\n\n- " +- "a sequence of zero or more characters between matching parentheses\n (" +- "`(...)`), including a `(` or `" +- ")` character only if it is\n backslash-escaped.\n\n" +- "Although [link titles] may span multiple lines, they may not contain\na [" +- "blank line].\n\n" - "An [inline link](@) consists of a [link text] " - "followed immediately\nby a left parenthesis `(`" - ", an optional [link destination], an optional\n[link title]" @@ -6039,10 +6063,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "These four components may be separated by spaces, tabs, and up to one " - "line\nending.\nIf both [link destination] and [link title]" - " are present, they *must*" -- " be\nseparated by spaces, tabs, and up to one line ending.\n\n" -- "The link's text consists of the inlines contained\n" -- "in the [link text] (excluding the enclosing square brackets)" -- ".\nThe link'" +- " be\nseparated by spaces, tabs, and up to one line ending." +- "\n\nThe link's text consists of the inlines contained\nin the [link text" +- "] (excluding the enclosing square brackets).\nThe link'" - "s URI consists of the link destination, excluding enclosing\n" - "`<...>` if present, with backslash-escapes in " - "effect as described\nabove. The link'" @@ -6486,8 +6509,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo *bar

    \n" - "````````````````" - "````````````````\n\n\n" -- "Note that brackets that *aren't* part of links do not take\n" -- "precedence:\n\n" +- "Note that brackets that *aren't*" +- " part of links do not take\nprecedence:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -6531,8 +6554,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[full](#full-reference-link), [collapsed](" - "#collapsed-reference-link),\nand " - "[shortcut](#shortcut-reference-link).\n\n" -- "A [full reference link](@)\n" -- "consists of a [link text] immediately followed by a [link label]\nthat " +- "A [full reference link](@)" +- "\nconsists of a [link text] immediately followed by a [link label]\nthat " - "[matches] a [link reference definition] elsewhere in the document.\n\n" - "A [link label](@) begins with a left bracket (`[" - "`) and ends\nwith the first right bracket (`]`" @@ -6542,8 +6565,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Unescaped square bracket characters are not allowed inside the\n" - "opening and closing square brackets of [link labels]. A link\n" - "label can have at most 999 characters inside the square\nbrackets.\n\n" -- "One label [matches](@)\n" -- "another just in case their normalized forms are equal. To normalize a\n" +- "One label [matches](@)" +- "\nanother just in case their normalized forms are equal. To normalize a\n" - "label, strip off the opening and closing brackets,\nperform the " - "*Unicode case fold*" - ", strip leading and trailing\n" @@ -6564,8 +6587,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\"title\">foo

    \n" - "````````````````" - "````````````````\n\n\n" -- "The rules for the [link text] are the same as with\n" -- "[inline links]. Thus:\n\n" +- "The rules for the [link text] are the same as with\n[" +- "inline links]. Thus:\n\n" - "The link text may contain balanced brackets, but not unbalanced ones,\n" - "unless they are escaped:\n\n" - "````````````````" @@ -6753,8 +6776,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "a single reference link, not two shortcut reference links, as\nintended:\n\n" - "``` markdown\n[foo]\n[bar]\n\n" - "[foo]: /url1\n" -- "[bar]: /url2\n```\n\n" -- "(Note that [shortcut reference links] were introduced by Gruber\n" +- "[bar]: /url2\n```\n\n(Note that [" +- "shortcut reference links] were introduced by Gruber\n" - "himself in a beta version of `Markdown.pl`, but never " - "included\nin the official syntax description. Without shortcut reference\n" - "links, it is harmless to allow space between the link text and\n" @@ -6853,8 +6876,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "]: /uri

    \n" - "````````````````" - "````````````````\n\n\n" -- "A [collapsed reference link](@)\n" -- "consists of a [link label] that [matches] a\n[link reference definition" +- "A [collapsed reference link](@)" +- "\nconsists of a [link label] that [matches] a\n[link reference definition" - "] elsewhere in the\ndocument, followed by the string `[]`" - ".\nThe contents of the link label are parsed as inlines,\n" - "which are used as the link's text. The link'" @@ -6902,8 +6925,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\"title\">foo\n[]

    \n" - "````````````````" - "````````````````\n\n\n" -- "A [shortcut reference link](@)\n" -- "consists of a [link label] that [matches] a\n[link reference definition" +- "A [shortcut reference link](@)" +- "\nconsists of a [link label] that [matches] a\n[link reference definition" - "] elsewhere in the\ndocument and is not followed by `[]`" - " or a link label.\n" - "The contents of the link label are parsed as inlines,\n" @@ -7282,8 +7305,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "=\"Foo\" title=\"title\" />

    \n" - "````````````````" - "````````````````\n\n\n" -- "If you just want a literal `!" -- "` followed by bracketed text, you can\nbackslash-escape the opening " +- "If you just want a literal `!`" +- " followed by bracketed text, you can\nbackslash-escape the opening " - "`[`:\n\n" - "````````````````" - "```````````````` " @@ -7293,8 +7316,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    ![foo]

    \n" - "````````````````" - "````````````````\n\n\n" -- "If you want a link after a literal `!" -- "`, backslash-escape the\n`!`:\n\n" +- "If you want a link after a literal `!`" +- ", backslash-escape the\n`!`:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -7314,9 +7337,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ". It is parsed as\n" - "a link to the URI, with the URI as the link's " - "label.\n\n" -- "An [absolute URI](@),\n" -- "for these purposes, consists of a [scheme] followed by a colon (" -- "`:`" +- "An [absolute URI](@)" +- ",\nfor these purposes, consists of a [scheme]" +- " followed by a colon (`:`" - ")\nfollowed by zero or more characters other than [ASCII control\ncharacters]" - "[ASCII control character], [space], `<`, " - "and `>`" @@ -7436,18 +7459,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "p>\n" - "````````````````" - "````````````````\n\n\n" -- "An [email autolink](@)\n" -- "consists of `<`, followed by an [email address],\nfollowed by " -- "`>`" +- "An [email autolink](@)\nconsists of `<`" +- ", followed by an [email address],\nfollowed by `>`" - ". The link's label is the email address,\nand the URL is " - "`mailto:` followed by the email address.\n\n" -- "An [email address](@),\n" -- "for these purposes, is anything that matches\nthe " -- "[non-normative regex from the HTML5\n" -- "spec](https://" -- html.spec.whatwg.org/multipage/ -- "forms.html#e-mail-state-(type=email))" -- ":\n\n " +- "An [email address](@)" +- ",\nfor these purposes, is anything that matches\nthe " +- "[non-normative regex from the HTML5\nspec" +- "](https://html.spec.whatwg.org" +- "/multipage/forms.html#e-mail-state-(type" +- "=email)):\n\n " - "/^[a-zA-Z0-9.!" - "#$%&'*+/=?" 
- "^_`{|}~-]+@[a-zA" @@ -7543,38 +7564,39 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Tag and attribute names are not limited to current HTML tags,\n" - "so custom tags (and even, say, DocBook tags) may be " - "used.\n\nHere is the grammar for tags:\n" -- "\nA [tag name](@) consists of an ASCII letter\n" +- "\nA [tag name](@)" +- " consists of an ASCII letter\n" - "followed by zero or more ASCII letters, digits, or\n" - "hyphens (`-`).\n\n" - "An [attribute](@) consists of spaces, tabs, and up " - "to one line ending,\nan [attribute name], and an optional\n[" - "attribute value specification].\n\n" -- "An [attribute name](@)\n" -- "consists of an ASCII letter, `_`, or `:`" -- ", followed by zero or more ASCII\nletters, digits, `_`" -- ", `.`, `:`, or `-`" +- "An [attribute name](@)\nconsists of an ASCII letter, " +- "`_`, or `:`, followed by zero or more " +- "ASCII\nletters, digits, `_`, `.`, " +- "`:`, or `-`" - ". (Note: This is the XML\n" - "specification restricted to ASCII. 
" - "HTML5 is laxer.)\n\n" -- "An [attribute value specification](@)\n" -- "consists of optional spaces, tabs, and up to one line ending,\na " +- "An [attribute value specification](@)" +- "\nconsists of optional spaces, tabs, and up to one line ending,\na " - "`=` character, optional spaces, tabs, and up to one line " - "ending,\nand an [attribute value].\n\n" -- "An [attribute value](@)\n" -- "consists of an [unquoted attribute value],\na [" +- "An [attribute value](@)" +- "\nconsists of an [unquoted attribute value],\na [" - "single-quoted attribute value], or a [double-quoted attribute value]" - ".\n\n" -- "An [unquoted attribute value](@)\n" -- "is a nonempty string of characters not\n" +- "An [unquoted attribute value](@)" +- "\nis a nonempty string of characters not\n" - "including spaces, tabs, line endings, `\"`, `'`" - ", `=`, `<`, `>`, or `` " - "` ``.\n\n" -- "A [single-quoted attribute value](@)\n" -- "consists of `'`, zero or more\ncharacters not including `'`" -- ", and a final `'`.\n\n" -- "A [double-quoted attribute value](@)\n" -- "consists of `\"`, zero or more\ncharacters not including `\"`" -- ", and a final `\"`.\n\n" +- "A [single-quoted attribute value](@)\nconsists of `'`" +- ", zero or more\ncharacters not including `'`, and a final " +- "`'`.\n\n" +- "A [double-quoted attribute value](@)\nconsists of `\"`" +- ", zero or more\ncharacters not including `\"`, and a final " +- "`\"`.\n\n" - "An [open tag](@) consists of a `<` character, " - "a [tag name],\nzero or more [attributes]" - ", optional spaces, tabs, and up to one line ending,\nan optional " @@ -7589,15 +7611,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`-->`, and `-->` (see the\n" - "[HTML spec](https://" - html.spec.whatwg.org/multipage/ -- "parsing.html#markup-declaration-open-state)).\n\n" -- "A [processing instruction](@)\nconsists of the string ``" +- "parsing.html#markup-declaration-open-state))." 
+- "\n\nA [processing instruction](@)\nconsists of the string ``" - ", and the string\n`?>`.\n\n" -- "A [declaration](@) consists of the string ``, and the character `>`.\n\n" -- "A [CDATA section](@) consists of\n" -- "the string ``" - ", and the string `]]>`.\n\n" - "An [HTML tag](@) consists of an [open tag" @@ -7829,8 +7851,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "baz

    \n" - "````````````````" - "````````````````\n\n\n" -- "For a more visible alternative, a backslash before the\n" -- "[line ending] may be used instead of two or more spaces:\n\n" +- "For a more visible alternative, a backslash before the\n[line ending]" +- " may be used instead of two or more spaces:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -8009,8 +8031,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "In the second phase, the raw text contents of paragraphs and headings\n" - "are parsed into sequences of Markdown inline elements (strings,\n" - "code spans, links, emphasis, and so on), using the map of " -- "link\nreferences constructed in phase 1.\n\n" -- "At each point in processing, the document is represented as a tree of\n" +- "link\nreferences constructed in phase 1." +- "\n\nAt each point in processing, the document is represented as a tree of\n" - "**blocks**. The root of the tree is a `document`" - " block. The `document`\nmay have any number of other blocks as " - "**children**" @@ -8034,10 +8056,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Each line that is processed has an effect on this tree. The line is\n" - "analyzed and, depending on its contents, the document may be altered\n" - "in one or more of the following ways:\n\n" -- "1. One or more open blocks may be closed.\n2. " -- "One or more new blocks may be created as children of the\n last open block.\n" -- "3. Text may be added to the last (deepest) open block remaining\n" -- " on the tree.\n\n" +- 1. One or more open blocks may be closed. +- "\n2. One or more new blocks may be created as children of the\n " +- "last open block.\n" +- "3. 
Text may be added to the last (deepest) open block remaining\n " +- "on the tree.\n\n" - "Once a line has been incorporated into the tree in this way,\n" - "it can be discarded, so input can be read in a stream.\n\n" - "For each line, we follow this procedure:\n\n" @@ -8050,14 +8073,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "In this phase we may match all or just some of the open\n" - "blocks. " - "But we cannot close unmatched blocks yet, because we may have a\n[" -- "lazy continuation line].\n\n" -- "2. Next, after consuming the continuation markers for existing\n" +- "lazy continuation line]." +- "\n\n2. Next, after consuming the continuation markers for existing\n" - "blocks, we look for new block starts (e.g. `>` " - "for a block quote).\n" - "If we encounter a new block start, we close any blocks unmatched\n" - "in step 1 before creating the new block as a child of the last\n" -- "matched container block.\n\n" -- "3. Finally, we look at the remainder of the line (after block\n" +- matched container block. +- "\n\n3. Finally, we look at the remainder of the line (after block\n" - "markers like `>`, list markers, and indentation have been consumed" - ").\nThis is text that can be incorporated into the last open\n" - "block (a paragraph, code block, heading, or raw HTML)" @@ -8077,17 +8100,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n``` tree\n-> document\n```" - "\n\nThe first line of our text,\n" - "\n``` markdown\n> Lorem ipsum dolor\n```" -- "\n\ncauses a `block_quote` block to be created as a child of our\n" -- "open `document` block, and a `paragraph`" -- " block as a child of\nthe `block_quote`" +- "\n\ncauses a `block_quote`" +- " block to be created as a child of our\nopen `document`" +- " block, and a `paragraph` block as a child of\nthe " +- "`block_quote`" - ". 
Then the text is added to the last open\nblock, the `paragraph`" - ":\n\n" - "``` tree\n-> document\n -> block_quote\n -> paragraph\n" - " \"Lorem ipsum dolor\"\n```\n\nThe next line,\n" - "\n``` markdown\nsit amet.\n```" -- "\n\n" -- "is a \"lazy continuation\" of the open `paragraph`, so it gets " -- "added\nto the paragraph's text:\n\n" +- "\n\nis a \"lazy continuation\" of the open `paragraph`" +- ", so it gets added\nto the paragraph's text:\n\n" - "``` tree\n-> document\n -> block_quote\n -> paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n" - "```\n\nThe third line,\n" @@ -8106,9 +8129,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " \"Qui *quodsi iracundia*\"\n" - "```\n\nThe fourth line,\n" - "\n``` markdown\n> - aliquando id\n```" -- "\n\n" -- "causes the `list_item` (and its child the `paragraph`) " -- "to be closed,\nand a new `list_item`" +- "\n\ncauses the `list_item` (and its child the `paragraph`" +- ") to be closed,\nand a new `list_item`" - " opened up as child of the `list`. A `paragraph`" - "\nis added as a child of the new `list_item`" - ", to contain the text.\nWe thus obtain the final tree:\n\n" @@ -8131,10 +8153,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " list (type=bullet tight=true bullet_char=-)\n" - " list_item\n paragraph\n str \"Qui \"\n emph\n" - " str \"quodsi iracundia\"\n list_item\n paragraph\n" -- " str \"aliquando id\"\n```\n\n" -- "Notice how the [line ending] in the first paragraph has\n" -- "been parsed as a `softbreak`, and the asterisks " -- "in the first list item\nhave become an `emph`.\n\n" +- " str \"aliquando id\"\n```\n\nNotice how the " +- "[line ending] in the first paragraph has\nbeen parsed as a " +- "`softbreak`" +- ", and the asterisks in the first list item\nhave become an " +- "`emph`.\n\n" - "### An algorithm for parsing nested emphasis and links\n\n" - "By far the trickiest part of inline parsing is handling emphasis,\n" - "strong emphasis, links, and images. 
This is done using the following\nalgorithm.\n\n" @@ -8145,24 +8168,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "add a pointer to this text node to the [delimiter stack]" - "(@).\n\n" - "The [delimiter stack] is a doubly linked list. Each\n" -- "element contains a pointer to a text node, plus information about\n\n" -- "- the type of delimiter (`[`, `![" +- "element contains a pointer to a text node, plus information about" +- "\n\n- the type of delimiter (`[`, `![" - "`, `*`, `_`)\n" - "- the number of delimiters,\n" - "- whether the delimiter is \"active\" (all are active to start" - "), and\n" -- "- whether the delimiter is a potential opener, a potential closer,\n" -- " or both (which depends on what sort of characters precede\n " +- "- whether the delimiter is a potential opener, a potential closer,\n " +- "or both (which depends on what sort of characters precede\n " - "and follow the delimiters).\n\n" - "When we hit a `]` character, we call the *look for link " -- "or image*\nprocedure (see below).\n\n" -- "When we hit the end of the input, we call the *process emphasis*\n" -- "procedure (see below), with `stack_bottom` = NULL" -- ".\n\n" +- "or image*\nprocedure (see below)." +- "\n\nWhen we hit the end of the input, we call the *process emphasis*" +- "\nprocedure (see below), with `stack_bottom`" +- " = NULL.\n\n" - "#### *look for link or image*\n\n" - "Starting at the top of the delimiter stack, we look backwards\n" - "through the stack for an opening `[` or `![`" -- " delimiter.\n\n" +- " delimiter." 
+- "\n\n" - "- If we don't find one, we return a literal text node `" - "]`.\n\n" - "- If we do find one, but it's not *active*, " @@ -8172,12 +8196,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- If we find one and it's active, then we parse ahead " - "to see if\n " - "we have an inline link/image, reference link/image, collapsed reference\n " -- "link/image, or shortcut reference link/image.\n\n " -- "+ If we don't, then we remove the opening delimiter from " -- "the\n delimiter stack and return a literal text node `]`.\n\n " -- "+ If we do, then\n\n" -- " * We return a link or image node whose children are the inlines\n" -- " after the text node pointed to by the opening delimiter.\n\n " +- "link/image, or shortcut reference link/image." +- "\n\n + If we don'" +- "t, then we remove the opening delimiter from the\n " +- "delimiter stack and return a literal text node `]`.\n\n " +- "+ If we do, then\n" +- "\n * We return a link or image node whose children are the inlines\n " +- "after the text node pointed to by the opening delimiter.\n\n " - "* We run *process emphasis* on these inlines, with the `[" - "` opener\n as `stack_bottom`.\n\n " - "* We remove the opening delimiter.\n\n" @@ -8185,17 +8210,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "all\n `[` delimiters before the opening delimiter to " - "*inactive*. (This\n will prevent us from getting links within links.)\n\n" - "#### *process emphasis*\n\n" -- "Parameter `stack_bottom` sets a lower bound to how far we\n" -- "descend in the [delimiter stack]" -- ". If it is NULL, we can\n" +- "Parameter `stack_bottom`" +- " sets a lower bound to how far we\ndescend in the [delimiter stack" +- "]. If it is NULL, we can\n" - "go all the way to the bottom. 
Otherwise, we stop before\nvisiting " - "`stack_bottom`.\n\n" - "Let `current_position` point to the element on the [delimiter " - "stack]\njust above `stack_bottom` (or the first element if " - "`stack_bottom`\nis NULL).\n\n" -- "We keep track of the `openers_bottom` for each delimiter\n" -- "type (`*`, `_`), indexed to the length " -- "of the closing delimiter run\n" +- "We keep track of the `openers_bottom`" +- " for each delimiter\ntype (`*`, `_`" +- "), indexed to the length of the closing delimiter run\n" - "(modulo 3) and to whether the closing delimiter can also " - "be an\nopener. Initialize this to `stack_bottom`.\n\n" - "Then we repeat the following until we run out of potential\nclosers:\n\n" @@ -8203,31 +8228,36 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "needed)\n until we find the first potential closer with delimiter `*`" - " or `_`" - ".\n (This will be the potential closer closest\n to the beginning of the input " -- "-- the first one in parse order.)\n\n" -- "- Now, look back in the stack (staying above `stack_bottom` " -- "and\n the `openers_bottom`" +- "-- the first one in parse order.)" +- "\n\n- " +- "Now, look back in the stack (staying above `stack_bottom`" +- " and\n the `openers_bottom`" - " for this delimiter type) for the\n first matching potential opener (\"matching" - "\" means same delimiter).\n\n- If one is found:\n\n " -- "+ Figure out whether we have emphasis or strong emphasis:\n" -- " if both closer and opener spans have length >= 2, we have\n " +- "+ Figure out whether we have emphasis or strong emphasis:\n " +- "if both closer and opener spans have length >= 2, we have\n " - "strong, otherwise regular.\n\n " -- "+ Insert an emph or strong emph node accordingly, after\n" -- " the text node corresponding to the opener.\n\n " -- "+ Remove any delimiters between the opener and closer from\n" -- " the delimiter stack.\n\n " -- "+ Remove 1 (for regular emph) or 2 (for strong " +- "+ Insert an emph or strong emph node 
accordingly, after\n " +- "the text node corresponding to the opener.\n\n " +- "+ Remove any delimiters between the opener and closer from\n " +- the delimiter stack. +- "\n\n + " +- "Remove 1 (for regular emph) or 2 (for strong " - "emph) delimiters\n " - "from the opening and closing text nodes. If they become empty\n " - "as a result, remove them and remove the corresponding element\n " - "of the delimiter stack. If the closing node is removed, reset\n " - "`current_position` to the next element in the stack.\n\n" -- "- If none is found:\n\n" -- " + Set `openers_bottom` to the element before `current_position" -- "`.\n " +- "- If none is found:\n" +- "\n + " +- "Set `openers_bottom` to the element before `current_position`" +- ".\n " - "(We know that there are no openers for this kind of closer up to " -- "and\n including this point, so this puts a lower bound on future searches.)\n\n " -- "+ If the closer at `current_position` is not a potential opener,\n" -- " remove it from the delimiter stack (since we know it can't\n " +- "and\n including this point, so this puts a lower bound on future searches.)" +- "\n\n + " +- "If the closer at `current_position`" +- " is not a potential opener,\n " +- "remove it from the delimiter stack (since we know it can't\n " - "be a closer either).\n\n " - "+ Advance `current_position` to the next element in the stack.\n\n" - "After we're done, we remove all delimiters above `" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap index 8f15614..c724574 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap @@ -75,11 +75,11 @@ input_file: tests/inputs/markdown/github_flavored.md - " + Facilisis in pretium nisl aliquet\n" - " - Nulla volutpat aliquam velit\n" - "+ 
Very easy!\n```\n\n" -- "1. First ordered list item\n" -- "2. Another item\n⋅⋅* Unordered sub-list.\n" -- "1. " -- "Actual numbers don't matter, just that it's a number\n" -- "⋅⋅1. Ordered sub-list\n4. And another item.\n\n" +- 1. First ordered list item +- "\n2. Another item\n⋅⋅* Unordered sub-list.\n" +- "1. Actual numbers don't matter, just that it'" +- "s a number\n⋅⋅1. Ordered sub-list\n" +- "4. And another item.\n\n" - ⋅⋅⋅You can have properly indented paragraphs within list items - ". " - "Notice the blank line above, and the leading spaces (at least one, " @@ -206,7 +206,8 @@ input_file: tests/inputs/markdown/github_flavored.md - octodex.github.com/images/ - "dojocat.jpg \"The Dojocat\"\n```\n\n" - "Here's our logo (hover to see the title text):\n" -- "\nInline-style:\n![" +- "\nInline-style:\n" +- "![" - "alt text](https://github.com/" - adam-p/markdown-here/raw/master/src - "/common/images/icon48.png \"Logo Title Text 1" @@ -393,19 +394,20 @@ input_file: tests/inputs/markdown/github_flavored.md - ">> ...by using additional greater-than signs right next to each " - "other...\n> > > ...or with spaces between arrows.\n" - "```\n\n" -- "> Blockquotes are very handy in email to emulate reply text.\n" -- "> This line is part of the same quote.\n\nQuote break.\n\n" +- "> Blockquotes are very handy in email to emulate reply text." +- "\n> This line is part of the same quote.\n\nQuote break.\n\n" - "> This is a very long line that will still be quoted properly when it wraps" - ". Oh boy let'" - "s keep writing to make sure this is long enough to actually wrap for everyone. 
" - "Oh, you can *put* **Markdown** into a " - "blockquote.\n\n" -- "> Blockquotes can also be nested...\n" -- ">> ...by using additional greater-than signs right next to each " -- "other...\n> > > ...or with spaces between arrows.\n\n" +- "> Blockquotes can also be nested...\n>" +- "> ...by using additional greater-than signs right next to each other" +- "...\n> > > ...or with spaces between arrows.\n\n" - "------\n\n" - "# Inline HTML\n\n" -- "```\n
    \n" +- "```\n" +- "
    \n" - "
    Definition list
    \n" - "
    Is something people use sometimes.
    \n\n" - "
    Markdown in HTML
    \n" @@ -420,8 +422,9 @@ input_file: tests/inputs/markdown/github_flavored.md - Use HTML tags.\n
    \n\n------\n\n" - "# Horizontal Rules\n\n" -- "```\nThree or more...\n\n---\n\nHyphens\n\n" -- "***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\n" +- "```\n" +- "Three or more...\n\n---\n\nHyphens\n\n***\n\n" +- "Asterisks\n\n___\n\nUnderscores\n```\n\n" - "Three or more...\n\n---\n\nHyphens\n\n***\n" - "\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n" - "# YouTube Videos\n\n" @@ -448,7 +451,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "jpg\" alt=\"IMAGE ALT TEXT " - "HERE\" width=\"240\" height=\"180\" border=" - "\"10\">\n\n" -- "\n```\n[![" +- "\n```\n" +- "[![" - "IMAGE ALT TEXT HERE](http:/" - /img.youtube.com/vi/ - YOUTUBE_VIDEO_ID_HERE/0. @@ -456,11 +460,10 @@ input_file: tests/inputs/markdown/github_flavored.md - www.youtube.com/watch? - "v=YOUTUBE_VIDEO_ID_HERE)\n" - "```\n\n" -- "[![" -- "IMAGE ALT TEXT HERE](https:/" -- /upload.wikimedia.org/wikipedia/ -- commons/thumb/e/ef/YouTube_logo_2015. -- svg/1200px-YouTube_logo_2015. -- "svg.png)](https://" -- www.youtube.com/watch? +- "[![IMAGE ALT TEXT HERE" +- "](https://upload.wikimedia.org/" +- wikipedia/commons/thumb/e/ef/ +- YouTube_logo_2015.svg/1200px- +- "YouTube_logo_2015.svg.png)](https" +- "://www.youtube.com/watch?" - "v=ciawICBvQoE)\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md-2.snap index f33309a..643c68a 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md-2.snap @@ -10,11 +10,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The point can be illustrated by comparing a sample of\n[AsciiDoc](https://asciidoc.org/) with\nan equivalent sample of Markdown. Here is a sample of\nAsciiDoc from the AsciiDoc manual:\n\n```\n1. 
List item one.\n+\nList item one continued with a second paragraph followed by an\nIndented block.\n+\n.................\n$ ls *.sh\n$ mv *.sh ~/tmp\n.................\n+\nList item continued with a third paragraph.\n\n2. List item two continued with an open block.\n+\n--\nThis paragraph is part of the preceding list item.\n\na. This list is nested and does not require explicit item\ncontinuation.\n+\nThis paragraph is part of the preceding list item.\n\nb. List item b.\n\nThis paragraph belongs to item two of the outer list.\n--\n```\n\nAnd here is the equivalent in Markdown:" - "```\n1. List item one.\n\n List item one continued with a second paragraph followed by an\n Indented block.\n\n $ ls *.sh\n $ mv *.sh ~/tmp\n\n List item continued with a third paragraph.\n\n2. List item two continued with an open block.\n\n This paragraph is part of the preceding list item.\n\n 1. This list is nested and does not require explicit item continuation.\n\n This paragraph is part of the preceding list item.\n\n 2. List item b.\n\n This paragraph belongs to item two of the outer list.\n```\n\nThe AsciiDoc version is, arguably, easier to write. You don't need\nto worry about indentation. But the Markdown version is much easier\nto read. The nesting of list items is apparent to the eye in the\nsource, not just in the processed document." - "## Why is a spec needed?\n\nJohn Gruber's [canonical description of Markdown's\nsyntax](https://daringfireball.net/projects/markdown/syntax)\ndoes not specify the syntax unambiguously. Here are some examples of\nquestions it does not answer:" -- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. 
This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See [this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)\n\n2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank" -- " lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```\n\n4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)" -- "5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```\n\n7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```" -- "9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```\n\n11. Can list items include section headings? (`Markdown.pl` does not\n allow this, but does allow blockquotes to include headings.)\n\n ``` markdown\n - # Heading\n ```\n\n12. Can list items be empty?\n\n ``` markdown\n * a\n *\n * b\n ```\n\n13. 
Can link references be defined inside block quotes or list items?\n\n ``` markdown\n > Blockquote [foo].\n >\n > [foo]: /url\n ```" -- "14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n ``` markdown\n [foo]: /url1\n [foo]: /url2\n\n [foo][]\n ```\n\nIn the absence of a spec, early implementers consulted `Markdown.pl`\nto resolve these ambiguities. But `Markdown.pl` was quite buggy, and\ngave manifestly bad results in many cases, so it was not a\nsatisfactory replacement for a spec.\n\nBecause there is no unambiguous spec, implementations have diverged\nconsiderably. As a result, users are often surprised to find that\na document that renders one way on one system (say, a GitHub wiki)\nrenders differently on another (say, converting to docbook using\npandoc). To make matters worse, because nothing in Markdown counts\nas a \"syntax error,\" the divergence often isn't discovered right away." +- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See [this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)" +- "2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. 
However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank\n lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```" +- "4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)\n\n5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```" +- "7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```\n\n9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```" +- "11. Can list items include section headings? (`Markdown.pl` does not\n allow this, but does allow blockquotes to include headings.)\n\n ``` markdown\n - # Heading\n ```\n\n12. Can list items be empty?\n\n ``` markdown\n * a\n *\n * b\n ```\n\n13. 
Can link references be defined inside block quotes or list items?\n\n ``` markdown\n > Blockquote [foo].\n >\n > [foo]: /url\n ```\n\n14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n ``` markdown\n [foo]: /url1\n [foo]: /url2\n\n [foo][]\n ```" +- "In the absence of a spec, early implementers consulted `Markdown.pl`\nto resolve these ambiguities. But `Markdown.pl` was quite buggy, and\ngave manifestly bad results in many cases, so it was not a\nsatisfactory replacement for a spec.\n\nBecause there is no unambiguous spec, implementations have diverged\nconsiderably. As a result, users are often surprised to find that\na document that renders one way on one system (say, a GitHub wiki)\nrenders differently on another (say, converting to docbook using\npandoc). To make matters worse, because nothing in Markdown counts\nas a \"syntax error,\" the divergence often isn't discovered right away." - "## About this document\n\nThis document attempts to specify Markdown syntax unambiguously.\nIt contains many examples with side-by-side Markdown and\nHTML. These are intended to double as conformance tests. An\naccompanying script `spec_tests.py` can be used to run the tests\nagainst any Markdown program:\n\n python test/spec_tests.py --spec spec.txt --program PROGRAM\n\nSince this document describes how Markdown is to be parsed into\nan abstract syntax tree, it would have made sense to use an abstract\nrepresentation of the syntax tree instead of HTML. But HTML is capable\nof representing the structural distinctions we need to make, and the\nchoice of HTML for the tests makes it possible to run the tests against\nan implementation without writing an abstract syntax tree renderer." - "Note that not every feature of the HTML samples is mandated by\nthe spec. For example, the spec says what counts as a link\ndestination, but it doesn't mandate that non-ASCII characters in\nthe URL be percent-encoded. 
To use the automatic tests,\nimplementers will need to provide a renderer that conforms to\nthe expectations of the spec examples (percent-encoding\nnon-ASCII characters in URLs). But a conforming implementation\ncan use a different renderer and may choose not to\npercent-encode non-ASCII characters in URLs.\n\nThis document is generated from a text file, `spec.txt`, written\nin Markdown with a small extension for the side-by-side tests.\nThe script `tools/makespec.py` can be used to convert `spec.txt` into\nHTML or CommonMark (which can then be converted into other formats).\n\nIn the examples, the `→` character is used to represent tabs." - "# Preliminaries" @@ -117,10 +118,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n``` aa ```\nfoo\n.\n

    aa\nfoo

    \n````````````````````````````````\n\n\n[Info strings] for tilde code blocks can contain backticks and tildes:\n\n```````````````````````````````` example\n~~~ aa ``` ~~~\nfoo\n~~~\n.\n
    foo\n
    \n````````````````````````````````\n\n\nClosing code fences cannot have [info strings]:" - "```````````````````````````````` example\n```\n``` aaa\n```\n.\n
    ``` aaa\n
    \n````````````````````````````````" - "## HTML blocks\n\nAn [HTML block](@) is a group of lines that is treated\nas raw HTML (and will not be escaped in HTML output).\n\nThere are seven kinds of [HTML block], which can be defined by their\nstart and end conditions. The block begins with a line that meets a\n[start condition](@) (after up to three optional spaces of indentation).\nIt ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks) containing the current HTML\nblock, if no line is encountered that meets the [end condition]. If\nthe first line meets both the [start condition] and the [end\ncondition], the block will contain just that line." -- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n5. **Start condition:** line begins with the string" -- "``." -- "6. **Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:** line is followed by a [blank line].\n\n7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line]." +- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``." +- "5. **Start condition:** line begins with the string\n``." +- "6." +- "**Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:** line is followed by a [blank line].\n\n7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line]." - "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\nblock** that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state.\n\nFor instance, `
    ` within an HTML block started by `` will not affect\nthe parser state; as the HTML block was started in by start condition 6, it\nwill end at any blank line. This can be surprising:"
     - "```````````````````````````````` example\n
    \n
    \n**Hello**,\n\n_world_.\n
    \n
    \n.\n
    \n
    \n**Hello**,\n

    world.\n

    \n
    \n````````````````````````````````\n\nIn this case, the HTML block is terminated by the blank line — the `**Hello**`\ntext remains verbatim — and regular parsing resumes, with a paragraph,\nemphasised `world` and inline and block HTML following." - "All types of [HTML blocks] except type 7 may interrupt\na paragraph. Blocks of type 7 may not interrupt a paragraph.\n(This restriction is intended to prevent unwanted interpretation\nof long tags inside a wrapped paragraph as starting HTML blocks.)\n\nSome simple examples follow. Here are some basic HTML blocks\nof type 6:\n\n```````````````````````````````` example\n\n \n \n \n
    \n hi\n
    \n\nokay.\n.\n\n \n \n \n
    \n hi\n
    \n

    okay.

    \n````````````````````````````````" @@ -193,8 +195,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n>>> foo\n> bar\n>>baz\n.\n
    \n
    \n
    \n

    foo\nbar\nbaz

    \n
    \n
    \n
    \n````````````````````````````````\n\n\nWhen including an indented code block in a block quote,\nremember that the [block quote marker] includes\nboth the `>` and a following space of indentation. So *five spaces* are needed\nafter the `>`:" - "```````````````````````````````` example\n> code\n\n> not code\n.\n
    \n
    code\n
    \n
    \n
    \n

    not code

    \n
    \n````````````````````````````````" - "## List items\n\nA [list marker](@) is a\n[bullet list marker] or an [ordered list marker].\n\nA [bullet list marker](@)\nis a `-`, `+`, or `*` character.\n\nAn [ordered list marker](@)\nis a sequence of 1--9 arabic digits (`0-9`), followed by either a\n`.` character or a `)` character. (The reason for the length\nlimit is that with 10 digits we start seeing integer overflows\nin some browsers.)\n\nThe following rules define [list items]:" -- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n\n 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2." -- "If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\nFor example, let *Ls* be the lines\n\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    \n````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:" +- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:" +- " 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item." +- "For example, let *Ls* be the lines\n\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    \n````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:" - "```````````````````````````````` example\n1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n````````````````````````````````\n\n\nThe most important thing to notice is that the position of\nthe text after the list marker determines how much indentation\nis needed in subsequent blocks in the list item. If the list\nmarker takes up two spaces of indentation, and there are three spaces between\nthe list marker and the next character other than a space or tab, then blocks\nmust be indented five spaces in order to fall under the list\nitem." - "Here are some examples showing how far content must be indented to be\nput under the list item:\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • one
    • \n
    \n

    two

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n````````````````````````````````" - "```````````````````````````````` example\n - one\n\n two\n.\n
      \n
    • one
    • \n
    \n
     two\n
    \n````````````````````````````````\n\n\n```````````````````````````````` example\n - one\n\n two\n.\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n````````````````````````````````" @@ -283,12 +286,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs." - " - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n - Both left and right-flanking:\n\n ```\n abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```" - "(The idea of distinguishing left-flanking and right-flanking\ndelimiter runs based on the character before and the character\nafter comes from Roopesh Chander's\n[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\" and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:" -- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. 
A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)" -- " iff it is part of a [left-flanking delimiter run].\n\n6. A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." +- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." +- "5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n6. 
A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." - "9. Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are\n multiples of 3.\n\n10. Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing\n delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3." - "11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n12. 
A literal `_` character cannot occur at the beginning or end of\n `_`-delimited emphasis or `__`-delimited strong emphasis, unless it\n is backslash-escaped.\n\nWhere rules 1--12 above are compatible with multiple parsings,\nthe following principles resolve ambiguity:" -- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example," -- " `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`." +- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`." +- "16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. 
Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`." - "These rules can be illustrated through a series of examples.\n\nRule 1:\n\n```````````````````````````````` example\n*foo bar*\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na * foo bar*\n.\n

    a * foo bar*

    \n````````````````````````````````" - "This is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na*\"foo\"*\n.\n

    a*"foo"*

    \n````````````````````````````````\n\n\nUnicode nonbreaking spaces count as whitespace, too:\n\n```````````````````````````````` example\n* a *\n.\n

    * a *

    \n````````````````````````````````\n\n\nUnicode symbols count as punctuation, too:" - "```````````````````````````````` example\n*$*alpha.\n\n*£*bravo.\n\n*€*charlie.\n.\n

    *$*alpha.

    \n

    *£*bravo.

    \n

    *€*charlie.

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is permitted:\n\n```````````````````````````````` example\nfoo*bar*\n.\n

    foobar

    \n````````````````````````````````" @@ -490,5 +493,6 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "#### *look for link or image*\n\nStarting at the top of the delimiter stack, we look backwards\nthrough the stack for an opening `[` or `![` delimiter." - "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`.\n\n- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image.\n\n + If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`.\n\n + If we do, then\n\n * We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)" - "#### *process emphasis*\n\nParameter `stack_bottom` sets a lower bound to how far we\ndescend in the [delimiter stack]. If it is NULL, we can\ngo all the way to the bottom. Otherwise, we stop before\nvisiting `stack_bottom`.\n\nLet `current_position` point to the element on the [delimiter stack]\njust above `stack_bottom` (or the first element if `stack_bottom`\nis NULL).\n\nWe keep track of the `openers_bottom` for each delimiter\ntype (`*`, `_`), indexed to the length of the closing delimiter run\n(modulo 3) and to whether the closing delimiter can also be an\nopener. 
Initialize this to `stack_bottom`.\n\nThen we repeat the following until we run out of potential\nclosers:" -- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. If the closing node is removed, reset" -- " `current_position` to the next element in the stack.\n\n- If none is found:\n\n + Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack." 
+- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter)." +- "- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:" +- "+ Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack." 
diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md.snap index c6e1c0c..5f7f84f 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md.snap @@ -73,12 +73,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - to read. The nesting of list items is apparent to the eye in the - "source, not just in the processed document." - "## Why is a spec needed?" -- "John Gruber's [canonical description of Markdown's" -- "syntax](https://daringfireball.net/projects/" -- markdown/syntax) +- "John Gruber's" +- "[canonical description of Markdown's\nsyntax" +- "](https://daringfireball.net/projects/markdown" +- /syntax) - does not specify the syntax unambiguously. Here are some examples of - "questions it does not answer:" -- 1. How much indentation is needed for a sublist? +- "1." +- How much indentation is needed for a sublist? - The spec says that - "continuation paragraphs need to be indented four spaces, but is" - not fully explicit about sublists. It is natural to think that @@ -87,9 +89,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "corner case,\" and divergences" - between implementations on this issue often lead to surprises for - users in real documents. (See -- "[this comment by John" -- "Gruber](https://web.archive.org/web" -- "/20170611172104/http://" +- "[this comment by John\n Gruber" +- "](https://web.archive.org/web/" +- "20170611172104/http://" - article.gmane.org/ - gmane.text.markdown.general/1997).) - 2. Is a blank line needed before a block quote or heading? @@ -98,17 +100,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - also to ambiguities in parsing (note that some implementations - "put the heading inside the blockquote, while others do not)." 
- (John Gruber has also spoken -- "[in favor of requiring the blank" -- "lines](https://web.archive.org/web/" +- "[in favor of requiring the blank\n lines" +- "](https://web.archive.org/web/" - "20170611172104/http://" - article.gmane.org/ - gmane.text.markdown.general/2146).) -- 3. Is a blank line needed before an indented code block? -- "(`Markdown.pl` requires it, but this is not mentioned" -- "in the\n documentation, and some implementations do not require it.)" +- "3." +- "Is a blank line needed before an indented code block?\n (" +- "`Markdown.pl` requires it, but this is not mentioned in" +- "the\n documentation, and some implementations do not require it.)" - "``` markdown\n paragraph\n code?\n ```" -- 4. What is the exact rule for determining when list items get -- "wrapped in `

    `" +- "4." +- "What is the exact rule for determining when list items get\n wrapped in" +- "`

    `" - " tags? Can a list be partially \"loose\" and partially\n \"tight\"" - "? What should we do with a list like this?" - "``` markdown\n 1. one\n\n 2. two\n 3. three" @@ -139,8 +143,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` markdown" - "[a backtick (`)](/url) and [another" - "backtick (`)](/url).\n ```" -- "9. What are the precedence rules for markers of emphasis and strong\n emphasis?" -- "For example, how should the following be parsed?" +- 9. What are the precedence rules for markers of emphasis and strong +- "emphasis? For example, how should the following be parsed?" - "``` markdown\n *foo *bar* baz*" - "```" - 10. What are the precedence rules between block-level and inline-level @@ -148,8 +152,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` markdown" - "- `a long code span can contain a hyphen like this" - " - and it can screw things up`\n ```" -- 11. Can list items include section headings? -- "(`Markdown.pl`" +- "11." +- "Can list items include section headings? (`Markdown.pl`" - does not - "allow this, but does allow blockquotes to include headings.)" - "``` markdown\n - # Heading\n ```" @@ -200,8 +204,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - non-ASCII characters in URLs). - "But a conforming implementation\ncan use a different renderer and may choose not to" - percent-encode non-ASCII characters in URLs. -- "This document is generated from a text file, `spec.txt" -- "`, written" +- "This document is generated from a text file," +- "`spec.txt`" +- ", written" - in Markdown with a small extension for the side-by-side tests. - "The script `tools/makespec.py` can be used to convert" - "`spec.txt`" @@ -212,43 +217,44 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "# Preliminaries" - "## Characters and lines" - "Any sequence of [characters] is a valid CommonMark\ndocument." -- "A [character](@) is a Unicode code point." 
-- "Although some\ncode points (for example, combining accents) do not correspond to" +- "A [character](@)" +- is a Unicode code point. Although some +- "code points (for example, combining accents) do not correspond to" - "characters in an intuitive sense, all code points count as characters" - for purposes of this spec. -- This spec does not specify an encoding; it thinks of lines as composed -- "of [characters]" -- rather than bytes. A conforming parser may be limited +- "This spec does not specify an encoding; it thinks of lines as composed\nof" +- "[characters] rather than bytes. A conforming parser may be limited" - to a certain encoding. - "A [line](@) is a sequence of zero or more [characters" - "]\nother than line feed (`U+000A`" - ") or carriage return (`U+000D`" - "),\nfollowed by a [line ending] or by the end of file." -- "A [line ending](@) is a line feed (`U+" -- "000A`), a carriage return\n(`U+000D`" -- ") not followed by a line feed, or a carriage return and a" -- following line feed. -- "A line containing no characters, or a line containing only spaces" -- "(`U+0020`) or tabs (`U+" -- "0009`), is called a [blank line](@)." +- "A [line ending](@) is a line feed (" +- "`U+000A`), a carriage return\n(" +- "`U+000D`) not followed by a line feed, or a" +- "carriage return and a\nfollowing line feed." +- "A line containing no characters, or a line containing only spaces\n(" +- "`U+0020`) or tabs (`U+0009" +- "`), is called a [blank line](@)." - "The following definitions of character classes will be used in this spec:" -- "A [Unicode whitespace character](@) is a character in" -- "the Unicode `Zs` general\ncategory, or a tab (" -- "`U+0009`), line feed (`U+000A" -- "`), form feed (`U+000C`" -- "), or\ncarriage return (`U+000D`)." +- "A [Unicode whitespace character](@)" +- "is a character in the Unicode `Zs`" +- " general\ncategory, or a tab (`U+0009`" +- "), line feed (`U+000A`), form feed (" +- "`U+000C`), or\ncarriage return (" +- "`U+000D`)." 
- "[Unicode whitespace](@) is a sequence of one or" - "more\n[Unicode whitespace characters]." - "A [tab](@) is `U+0009`." - "A [space](@) is `U+0020`." -- "An [ASCII control character](@) is a character between `" -- "U+0000–1F` (both\nincluding) or" +- "An [ASCII control character](@) is a character between" +- "`U+0000–1F` (both\nincluding) or" - "`U+007F`." -- "An [ASCII punctuation character](@)" -- "is `!`, `\"`, `#`, `$`" -- ", `%`, `&`, `'`, `(`" -- ", `)`,\n`*`, `+`, `,`" -- ", `-`, `.`, `/`" +- "An [ASCII punctuation character](@)\nis" +- "`!`, `\"`, `#`, `$`," +- "`%`, `&`, `'`, `(`," +- "`)`,\n`*`, `+`, `,`," +- "`-`, `.`, `/`" - " (U+0021–2F), \n`:`," - "`;`, `<`, `=`, `>`," - "`?`, `@`" @@ -312,7 +318,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "Normally the `>` that begins a block quote may be followed" +- "Normally the `>`" +- that begins a block quote may be followed - "optionally by a space, which is not considered part of the" - "content. In the following case `>`" - "is followed by a tab," @@ -511,7 +518,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- Entity and character references are not recognized in code" - blocks and code spans. - "- Entity and character references cannot stand in place of" -- " special characters that define structural elements in\n CommonMark. For example, although" +- "special characters that define structural elements in\n CommonMark. For example, although" - "`*` can be used\n in place of a literal" - "`*` character, `*` cannot replace\n `*`" - " in emphasis delimiters, bullet list markers, or thematic\n breaks." @@ -534,9 +541,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ∲ ≧̸

    - "````````````````" - "````````````````" -- "[Decimal numeric character\nreferences](@)" -- "consist of `&#` + a string of 1--7" -- "arabic digits + `;`" +- "[Decimal numeric character\nreferences](@)\nconsist of" +- "`&#` + a string of 1--7 arabic digits" +- "+ `;`" - ". A\nnumeric character reference is parsed as the corresponding" - Unicode character. - Invalid Unicode code points will be replaced by @@ -551,10 +558,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "0;\n.\n

    # Ӓ Ϡ �

    " - "````````````````" - "````````````````" -- "[Hexadecimal numeric character" -- "references](@) consist of `&#` +\neither `X`" -- "or `x` + a string of 1-6 hexadecimal digits" -- "+ `;`" +- "[Hexadecimal numeric character\nreferences](@) consist of" +- "`&#` +\neither `X` or `x`" +- "+ a string of 1-6 hexadecimal digits + `;`" - ".\nThey too are parsed as the corresponding Unicode character (this" - time specified with a hexadecimal numeral instead of decimal) - "." @@ -701,8 +707,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "# Blocks and inlines" -- We can think of a document as a sequence of -- "[blocks](@)" +- "We can think of a document as a sequence of\n[blocks](@)" - "---structural elements like paragraphs, block" - "quotations, lists, headings, rules, and code blocks." - Some blocks (like @@ -711,8 +716,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "--text," - "links, emphasized text, images, code spans, and so on." - "## Precedence" -- "Indicators of block structure always take precedence over indicators\nof inline structure." -- "So, for example, the following is a list with" +- Indicators of block structure always take precedence over indicators +- "of inline structure. So, for example, the following is a list with" - "two items, not a list with one item containing a code span:" - "````````````````" - "````````````````" @@ -1029,7 +1034,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    " - "````````````````" - "````````````````" -- "A sequence of `#` characters with anything but spaces or tabs following it" +- "A sequence of `#`" +- characters with anything but spaces or tabs following it - "is not a closing sequence, but counts as part of the contents of the" - "heading:" - "````````````````" @@ -1047,8 +1053,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo#

    " - "````````````````" - "````````````````" -- "Backslash-escaped `#` characters do not count as part" -- "of the closing sequence:" +- "Backslash-escaped `#`" +- " characters do not count as part\nof the closing sequence:" - "````````````````" - "````````````````" - example @@ -1089,7 +1095,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Setext headings" -- "A [setext heading](@) consists of one or more" +- "A [setext heading](@)" +- consists of one or more - "lines of text, not interrupted by a blank line, of which the first line" - "does not\nhave more than 3 spaces of indentation, followed by\na [" - "setext heading underline]. The lines of text must be such" @@ -1103,9 +1110,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`=` characters or a sequence of `-` characters, with no more" - than 3 - spaces of indentation and any number of trailing spaces or tabs. -- "The heading is a level 1 heading if `=` characters are used in" -- "the [setext heading underline], and a level 2 heading if" -- "`-`" +- "The heading is a level 1 heading if `=`" +- " characters are used in\nthe [setext heading underline]" +- ", and a level 2 heading if `-`" - characters are used. The contents of the heading are the result - "of parsing the preceding lines of text as CommonMark inline\ncontent." - "In general, a setext heading need not be preceded or followed by a" @@ -1232,8 +1239,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    " - "````````````````" - "````````````````" -- "The setext heading underline cannot be a [lazy continuation" -- "line] in a list item or block quote:" +- "The setext heading underline cannot be a [lazy continuation\nline]" +- "in a list item or block quote:" - "````````````````" - "````````````````" - example @@ -1328,7 +1335,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    > foo

    " - "````````````````" - "````````````````" -- "**Compatibility note:** Most existing Markdown implementations" +- "**Compatibility note:**" +- Most existing Markdown implementations - do not allow the text of setext headings to span multiple lines. - But there is no consensus about how to interpret - "``` markdown\nFoo\nbar\n---\nbaz\n```" @@ -1501,14 +1509,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Fenced code blocks" -- "A [code fence](@) is a sequence" -- "of at least three consecutive backtick characters (`` ` ``" -- ") or\ntildes (`~`" +- "A [code fence](@)" +- " is a sequence\nof at least three consecutive backtick characters (" +- "`` ` ``) or\ntildes (`~`" - "). (Tildes and backticks cannot be mixed.)\nA" - "[fenced code block](@)" - "begins with a code fence, preceded by up to three spaces of indentation" -- "." -- The line with the opening code fence may optionally contain some text +- ".\n\nThe line with the opening code fence may optionally contain some text" - following the code fence; this is trimmed of leading and trailing - "spaces or tabs and called the [info string](@)" - ". If the [info string] comes" @@ -1516,8 +1523,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - characters. (The reason for this restriction is that otherwise - some inline code would be incorrectly interpreted as the - beginning of a fenced code block.) -- "The content of the code block consists of all subsequent lines, until" -- "a closing [code fence] of the same type as the code block" +- "The content of the code block consists of all subsequent lines, until\na closing [" +- "code fence] of the same type as the code block" - "began with (backticks or tildes), and with at least as" - many backticks - or tildes as the opening code fence. 
If the leading code fence is @@ -1813,9 +1820,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "An [HTML block](@) is a group of lines that" - is treated - as raw HTML (and will not be escaped in HTML output -- ). -- "There are seven kinds of [HTML block], which can be defined" -- "by their\nstart and end conditions. The block begins with a line that meets a" +- ").\n\nThere are seven kinds of [HTML block]" +- ", which can be defined by their" +- start and end conditions. The block begins with a line that meets a - "[start condition](@) (after up to three optional spaces of" - "indentation).\nIt ends with the first subsequent line that meets a matching" - "[end condition](@), or the last line of the document," @@ -1845,7 +1852,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "**Start condition:** line begins with the string ``." -- "5. **Start condition:** line begins with the string" +- "5." +- "**Start condition:** line begins with the string" - "``" - "." @@ -1878,15 +1886,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "**End condition:** line is followed by a [blank line]" - "." - "7." -- "**Start condition:** line begins with a complete [open tag]" -- "(with any [tag name] other than `pre`, `script`" -- ",\n`style`, or `textarea`" +- "**Start condition:**" +- " line begins with a complete [open tag]\n(with any [tag name]" +- " other than `pre`, `script`,\n`style`, or" +- "`textarea`" - ") or a complete [closing tag]," - "followed by zero or more spaces and tabs, followed by the end of the" - "line.\\\n**End condition:**" - "line is followed by a [blank line]." -- HTML blocks continue until they are closed by their appropriate -- "[end condition], or the last line of the document or other" +- "HTML blocks continue until they are closed by their appropriate\n[end condition]" +- ", or the last line of the document or other" - "[container\nblock](#container-blocks)" - ". 
This means any HTML" - "**within an HTML\nblock**" @@ -2090,8 +2099,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo*\n" - "````````````````" - "````````````````" -- "In this case, we get a raw HTML block that just includes" -- "the ``" +- "In this case, we get a raw HTML block that just includes\nthe" +- "``" - tag (because it ends with the following blank - "line). So the contents get interpreted as CommonMark:" - "````````````````" @@ -2102,8 +2111,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "Finally, in this case, the `` tags are interpreted" -- "as [raw HTML] *inside*" +- "Finally, in this case, the ``" +- " tags are interpreted\nas [raw HTML] *inside*" - the CommonMark paragraph. (Because - "the tag is not on a line by itself, we get inline" - "HTML\nrather than an [HTML block].)" @@ -2115,10 +2124,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - /del>

    - "````````````````" - "````````````````" -- HTML tags designed to contain literal content -- "(`pre`, `script`, `style`, `" -- "textarea`), comments, processing instructions," -- and declarations are treated somewhat differently. +- "HTML tags designed to contain literal content\n(`pre`," +- "`script`, `style`, `textarea`), comments" +- ", processing instructions,\nand declarations are treated somewhat differently." - "Instead of ending at the first blank line, these blocks" - end at the first line containing a corresponding end tag. - "As a result, these blocks can contain blank lines:" @@ -2218,8 +2226,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    baz

    " - "````````````````" - "````````````````" -- Note that anything on the last line after the -- "end tag will be included in the [HTML block]:" +- "Note that anything on the last line after the\nend tag will be included in the" +- "[HTML block]:" - "````````````````" - "````````````````" - example @@ -2317,10 +2325,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "This rule differs from John Gruber's original Markdown syntax" - "specification, which says:" -- "> The only restrictions are that block-level HTML elements —" -- "> e.g. `
    `, `" -- "`, `
    `, `

    `, etc." -- "— must be separated from\n>" +- "> The only restrictions are that block-level HTML elements —\n>" +- "e.g. `

    `, `
    `" +- ", `
    `, `

    `, etc. —" +- "must be separated from\n>" - "surrounding content by blank lines, and the start and end tags of the\n>" - block should not be indented with spaces or tabs. - "In some ways Gruber's rule is more restrictive than the one" @@ -2394,17 +2402,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    " - "````````````````" - "````````````````" -- "Fortunately, blank lines are usually not necessary and can be\ndeleted." -- "The exception is inside `
    `"
    +- "Fortunately, blank lines are usually not necessary and can be"
    +- "deleted.  The exception is inside `
    `"
     - " tags, but as described\n[above][HTML blocks]"
     - ", raw HTML blocks starting with `
    `\n*can*"
     - contain blank lines.
     - "## Link reference definitions"
     - "A [link reference definition](@)"
    -- "consists of a [link label], optionally preceded by up to three spaces"
    -- "of\nindentation, followed\nby a colon (`:`"
    -- "), optional spaces or tabs (including up to one\n[line ending]"
    -- "), a [link destination],"
    +- "consists of a [link label]"
    +- ", optionally preceded by up to three spaces of\nindentation, followed"
    +- "by a colon (`:`), optional spaces or tabs ("
    +- "including up to one\n[line ending]), a [link destination],"
     - "optional spaces or tabs (including up to one\n[line ending]"
     - "), and an optional [link\ntitle]"
     - ", which if it is present must be separated\nfrom the [link destination]"
    @@ -2779,9 +2787,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "are meta-containers for [list items]."
     - We define the syntax for container blocks recursively.  The general
     - "form of the definition is:"
    -- "> If X is a sequence of blocks, then the result of"
    -- "> transforming X in such-and-such a way is a container of type"
    -- "Y\n> with these blocks as its content."
    +- "> If X is a sequence of blocks, then the result of\n>"
    +- transforming X in such-and-such a way is a container of type Y
    +- "> with these blocks as its content."
     - "So, we explain what counts as a block quote or list item by explaining"
     - how these can be *generated*
     - from their contents. This should suffice
    @@ -2790,22 +2798,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "[A parsing strategy](#appendix-a-parsing"
     - "-strategy).)"
     - "## Block quotes"
    -- "A [block quote marker](@),"
    -- "optionally preceded by up to three spaces of indentation,"
    +- "A [block quote marker](@)"
    +- ",\noptionally preceded by up to three spaces of indentation,"
     - "consists of (a) the character `>`"
     - together with a following space of
     - "indentation, or (b) a single character `>` not followed"
     - "by a space of\nindentation."
     - "The following rules define [block quotes]:"
    -- 1.  **Basic case.
    -- "**  If a string of lines *Ls* constitute a sequence\n    of blocks"
    -- "*Bs*"
    +- "1."
    +- "**Basic case.**  If a string of lines *Ls*"
    +- " constitute a sequence\n    of blocks *Bs*"
     - ", then the result of prepending a [block quote\n    marker]"
     - " to the beginning of each line in *Ls*\n    is a"
     - "[block quote](#block-quotes) containing *Bs*."
    -- 2.  **Laziness.
    -- "**  If a string of lines *Ls* constitute a"
    -- "[block\n    quote](#block-quotes) with contents *Bs*"
    +- "2."
    +- "**Laziness.**  If a string of lines"
    +- "*Ls* constitute a [block\n    quote](#block-quotes)"
    +- with contents *Bs*
     - ", then the result of deleting\n    the initial [block quote marker]"
     - from one or
     - more lines in which the next character other than a space or tab after the
    @@ -2815,9 +2824,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - is text
     - "that will be parsed as part of the content of a paragraph, but does"
     - not occur at the beginning of the paragraph.
    -- 3.  **Consecutiveness.
    -- "**  A document cannot contain two [block\n    quotes]"
    -- "in a row unless there is a [blank line] between them."
    +- "3."
    +- "**Consecutiveness.**"
    +- "  A document cannot contain two [block\n    quotes] in a row unless there is a"
    +- "[blank line] between them."
     - "Nothing else counts as a [block quote](#block-quotes)."
     - "Here is a simple example:"
     - "````````````````"
    @@ -2857,8 +2867,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "
    " - "````````````````" - "````````````````" -- "The Laziness clause allows us to omit the `>` before" -- "[paragraph continuation text]:" +- "The Laziness clause allows us to omit the `>`" +- " before\n[paragraph continuation text]:" - "````````````````" - "````````````````" - example @@ -2932,8 +2942,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "To see why, note that in" - "```markdown\n> foo\n> - bar\n```" -- "the `- bar` is indented too far to start a list," -- "and can't" +- "the `- bar`" +- "is indented too far to start a list, and can't" - be an indented code block because indented code blocks cannot - "interrupt paragraphs, so it is [paragraph continuation text]." - "A block quote can be empty:" @@ -2967,9 +2977,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "(Most current Markdown implementations, including John Gruber's" -- "original `Markdown.pl`, will parse this example as a" -- single block quote +- "(Most current Markdown implementations, including John Gruber's\noriginal" +- "`Markdown.pl`, will parse this example as a single" +- block quote - with two paragraphs. But it seems better to allow the author to decide - whether two block quotes or one are wanted.) - "Consecutiveness means that if we put these block quotes together," @@ -3035,8 +3045,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    baz

    " - "````````````````" - "````````````````" -- It is a consequence of the Laziness rule that any number -- "of initial `>`" +- "It is a consequence of the Laziness rule that any number\nof initial" +- "`>`" - "s may be omitted on a continuation line of a\nnested block quote:" - "````````````````" - "````````````````" @@ -3058,8 +3068,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````" -- "When including an indented code block in a block quote," -- "remember that the [block quote marker] includes\nboth the `>`" +- "When including an indented code block in a block quote,\nremember that the" +- "[block quote marker] includes\nboth the `>`" - and a following space of indentation. So *five spaces* - " are needed\nafter the `>`:" - "````````````````" @@ -3072,22 +3082,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## List items" -- "A [list marker](@) is a" -- "[bullet list marker] or an [ordered list marker]." -- "A [bullet list marker](@)" -- "is a `-`, `+`, or `*` character." +- "A [list marker](@)" +- " is a\n[bullet list marker] or an [ordered list marker]." +- "A [bullet list marker](@)\nis a `-`," +- "`+`, or `*` character." - "An [ordered list marker](@)" -- "is a sequence of 1--9 arabic digits (`0-" -- "9`), followed by either a\n`.` character or a" -- "`)`" +- is a sequence of 1--9 arabic digits ( +- "`0-9`), followed by either a\n`.`" +- "character or a `)`" - character. (The reason for the length - limit is that with 10 digits we start seeing integer overflows - "in some browsers.)\n\nThe following rules define [list items]:" -- 1. **Basic case. -- "** If a sequence of lines *Ls* constitute a sequence of\n blocks" -- "*Bs* starting with a character other than a space or tab," -- "and *M* is\n a list marker of width *W*" -- followed by 1 ≤ *N* +- "1." 
+- "**Basic case.** If a sequence of lines *Ls*" +- " constitute a sequence of\n blocks *Bs*" +- "starting with a character other than a space or tab, and *M*" +- "is\n a list marker of width *W* followed by 1 ≤ *N*" - " ≤ 4 spaces of indentation,\n then the result of prepending" - "*M* and the following spaces to the first line\n of *Ls*" - ", and indenting subsequent lines of *Ls* by *W +" @@ -3096,14 +3106,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - (bullet or ordered) is determined by the type of its list marker. - "If the list item is ordered, then it is also assigned a start" - "number, based on the ordered list marker.\n\n Exceptions:" -- "1. When the first list item in a [list] interrupts" -- "a paragraph---that is, when it starts on a line that would" +- "1. When the first list item in a [list] interrupts\n a paragraph" +- "---that is, when it starts on a line that would" - "otherwise count as [paragraph continuation text]---then (a)" - "the lines *Ls* must not begin with a blank line, and (" - "b) if\n the list item is ordered, the start number must be 1." -- "2." -- "If any line is a [thematic break][thematic breaks] then" -- that line is not a list item. +- "2. If any line is a [thematic break][thematic breaks" +- "] then\n that line is not a list item." - "For example, let *Ls* be the lines" - "````````````````" - "````````````````" @@ -3193,11 +3202,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "Here `two` occurs in the same column as the list marker `1." -- "`,\nbut is actually contained in the list item, because there is" +- "Here `two` occurs in the same column as the list marker" +- "`1.`" +- ",\nbut is actually contained in the list item, because there is" - sufficient indentation after the last containing blockquote marker. -- The converse is also possible. -- "In the following example, the word `two`" +- "The converse is also possible. 
In the following example, the word" +- "`two`" - "occurs far to the right of the initial text of the list item," - "`one`" - ", but" @@ -3295,8 +3305,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    -1. not ok

    " - "````````````````" - "````````````````" -- 2. **Item starting with indented code. -- "** If a sequence of lines *Ls*\n constitute a sequence of blocks" +- 2. **Item starting with indented code.** +- " If a sequence of lines *Ls*\n constitute a sequence of blocks" - "*Bs* starting with an indented code\n block, and" - "*M* is a list marker of width *W*" - " followed by\n one space of indentation, then the result of prepending" @@ -3402,8 +3412,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- 3. **Item starting with a blank line. -- "** If a sequence of lines *Ls*" +- 3. **Item starting with a blank line.** +- If a sequence of lines *Ls* - "starting with a single [blank line] constitute a (possibly empty)" - "sequence of blocks *Bs*, and *M* is a list marker" - "of width *W*,\n then the result of prepending *M*" @@ -3495,8 +3505,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*

    \n

    foo\n1.

    " - "````````````````" - "````````````````" -- 4. **Indentation. -- "** If a sequence of lines *Ls*" +- 4. **Indentation.** If a sequence of lines +- "*Ls*" - constitutes a list item - "according to rule #1, #2, or #3, then the result" - "of preceding each line\n of *Ls*" @@ -3555,9 +3565,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    " - "````````````````" - "````````````````" -- 5. **Laziness. -- "** If a string of lines *Ls* constitute a" -- "[list\n item](#list-items) with contents *Bs*" +- 5. **Laziness.** If a string of lines +- "*Ls* constitute a [list\n item](#list-items)" +- with contents *Bs* - ", then the result of deleting" - some or all of the indentation from one or more lines in which the - "next character other than a space or tab after the indentation is\n [" @@ -3610,13 +3620,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "6. **That's all." -- "** Nothing that is not counted as a list item by rules\n #1" -- "--5 counts as a [list item](#list-items)" -- "." -- The rules for sublists follow from the general rules -- "[above][List items]" -- ". A sublist must be indented the same number" +- "6. **That's all.**" +- " Nothing that is not counted as a list item by rules\n #1--" +- "5 counts as a [list item](#list-items)." +- "The rules for sublists follow from the general rules\n[above][List items" +- "]. A sublist must be indented the same number" - of spaces of indentation a paragraph would need to be in order to be - "included\nin the list item." - "So, in this case we need two spaces indent:" @@ -3695,27 +3703,27 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "### Motivation" - "John Gruber's Markdown spec says the following about list items" - ":" -- "1." -- "\"List markers typically start at the left margin, but may be indented" +- "1. \"" +- "List markers typically start at the left margin, but may be indented" - by up to three spaces. List markers must be followed by one or more - "spaces or a tab.\"" -- "2." -- "\"To make lists look nice, you can wrap items with hanging indents" -- "....\n But if you don't want to, you don'" +- "2. \"" +- "To make lists look nice, you can wrap items with hanging indents." +- "...\n But if you don't want to, you don'" - "t have to.\"" - "3. 
\"List items may consist of multiple paragraphs. Each subsequent" - paragraph in a list item must be indented by either 4 spaces or one - "tab.\"" -- "4." -- "\"It looks nice if you indent every line of the subsequent paragraphs," +- "4. \"" +- "It looks nice if you indent every line of the subsequent paragraphs," - "but here again, Markdown will allow you to be lazy.\"" -- "5." -- "\"To put a blockquote within a list item, the" +- "5. \"" +- "To put a blockquote within a list item, the" - "blockquote's `>`" - "delimiters need to be indented.\"" -- "6." -- "\"To put a code block within a list item, the code block needs to" -- "be\n indented twice — 8 spaces or two tabs.\"" +- "6. \"" +- "To put a code block within a list item, the code block needs to be" +- "indented twice — 8 spaces or two tabs.\"" - These rules specify that a paragraph under a list item must be indented - "four spaces (presumably, from the left margin, rather than the start of" - "the list marker, but this is not said), and that code under a" @@ -3728,8 +3736,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "block elements under a list item, including other" - "lists, must be indented four spaces. This principle has been called the" - "*four-space rule*." -- "The four-space rule is clear and principled, and if the reference" -- "implementation `Markdown.pl`" +- "The four-space rule is clear and principled, and if the reference\nimplementation" +- "`Markdown.pl`" - " had followed it, it probably would have\nbecome the standard. However," - "`Markdown.pl`" - allowed paragraphs and @@ -3827,9 +3835,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - four-space rule in cases where the list marker plus its initial indentation - "takes four spaces (a common case), but diverge in other cases." - "## Lists" -- "A [list](@) is a sequence of one or more" -- "list items [of the same type]. The list items" -- may be separated by any number of blank lines. 
+- "A [list](@)" +- " is a sequence of one or more\nlist items [of the same type]" +- ". The list items\nmay be separated by any number of blank lines." - "Two list items are [of the same type](@)" - "if they begin with a [list marker] of the same type." - Two list markers are of the @@ -3845,7 +3853,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "of an [ordered list] is determined by the list number of" - its initial list item. The numbers of subsequent list items are - disregarded. -- "A list is [loose](@) if any of its constituent" +- "A list is [loose](@)" +- if any of its constituent - "list items are separated by blank lines, or if any of its constituent" - list items directly contain two block-level elements with a blank line - "between them. Otherwise a list is [tight](@)" @@ -3897,8 +3906,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "to start lists without blank lines:" - "``` markdown\nI need to buy\n- new shoes\n- a coat" - "- a plane ticket\n```\n\nSecond, we are attracted to a" -- "> [principle of uniformity](@):" -- "> if a chunk of text has a certain\n>" +- ">" +- "[principle of uniformity](@)" +- ":\n> if a chunk of text has a certain\n>" - "meaning, it will continue to have the same meaning when put into a\n>" - container block (such as a list item or blockquote). - "(Indeed, the spec for [list items] and [block quotes]" @@ -4220,9 +4230,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "with\na backtick string of equal length. The contents of the code span are" - "the characters between these two backtick strings, normalized in the\nfollowing ways:" - "- First, [line endings] are converted to [spaces]." 
-- "- If the resulting string both begins *and* ends with a [space]" -- " character, but does not consist entirely of [space]\n characters, a single [" -- "space] character is removed from the" +- "- If the resulting string both begins *and*" +- " ends with a [space]\n character, but does not consist entirely of [space" +- "]\n characters, a single [space] character is removed from the" - front and back. This allows you to include code that begins - "or ends with backtick characters, which must be separated by" - whitespace from the opening or closing backtick strings. @@ -4310,8 +4320,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - /p> - "````````````````" - "````````````````" -- Note that browsers will typically collapse consecutive spaces -- "when rendering ``" +- "Note that browsers will typically collapse consecutive spaces\nwhen rendering ``" - " elements, so it is recommended that\nthe following CSS be used:" - "code{white-space: pre-wrap;}" - Note that backslash escapes do not work in code spans. All backslashes @@ -4324,9 +4333,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - /p> - "````````````````" - "````````````````" -- "Backslash escapes are never needed, because one can always choose a" -- "string of *n* backtick characters as delimiters, where the" -- "code does\nnot contain any strings of exactly *n* backtick characters." +- "Backslash escapes are never needed, because one can always choose a\nstring of" +- "*n*" +- "backtick characters as delimiters, where the code does" +- not contain any strings of exactly *n* backtick characters. 
- "````````````````" - "````````````````" - example @@ -4432,9 +4442,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Emphasis and strong emphasis" -- "John Gruber's original [Markdown syntax" -- "description](https://daringfireball.net/projects/" -- "markdown/syntax#em) says:" +- "John Gruber's original" +- "[Markdown syntax\ndescription" +- "](https://daringfireball.net/projects/markdown" +- "/syntax#em) says:" - "> Markdown treats asterisks (`*`) and" - "underscores (`_`" - ") as indicators of\n> emphasis. Text wrapped with one `*` or" @@ -4458,8 +4469,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "entries):" - "``` markdown\n*emph *with emph* in it*" - "**strong **with strong** in it**\n```" -- Many implementations have also restricted intraword emphasis to -- "the `*`" +- "Many implementations have also restricted intraword emphasis to\nthe `*`" - "forms, to avoid unwanted emphasis in words containing" - internal underscores. (It is best practice to put these in code - "spans, but users often do not.)" @@ -4467,25 +4477,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "no emphasis: foo_bar_baz\n```" - "The rules given below capture all of these patterns, while allowing" - for efficient parsing strategies that do not backtrack. -- "First, some definitions." -- "A [delimiter run](@)" +- "First, some definitions. A [delimiter run](@)" - " is either\na sequence of one or more `*`" - " characters that is not preceded or\nfollowed by a non-backslash-escaped" - "`*` character, or a sequence\nof one or more `_`" - " characters that is not preceded or followed by\na non-backslash-escaped" - "`_` character." 
-- "A [left-flanking delimiter run](@) is" -- "a [delimiter run] that is (1) not followed by [" -- "Unicode whitespace]," +- "A [left-flanking delimiter run](@)" +- " is\na [delimiter run] that is (1) not followed by" +- "[Unicode whitespace]," - "and either (2a) not followed by a [Unicode" - "punctuation character], or\n(2b) followed by a [" - "Unicode punctuation character] and\npreceded by [" - "Unicode whitespace] or a [Unicode punctuation" - "character].\nFor purposes of this definition, the beginning and the end of" - the line count as Unicode whitespace. -- "A [right-flanking delimiter run](@) is" -- "a [delimiter run] that is (1) not preceded by [" -- "Unicode whitespace]," +- "A [right-flanking delimiter run](@)" +- " is\na [delimiter run] that is (1) not preceded by" +- "[Unicode whitespace]," - "and either (2a) not preceded by a [Unicode" - "punctuation character], or\n(2b) preceded by a [" - "Unicode punctuation character] and\nfollowed by [" @@ -4516,20 +4525,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - and its rules for distinguishing left- and right-flanking runs - are a bit more complex than the ones given here.) - "The following rules define emphasis and strong emphasis:" -- "1. A single `*` character [can open emphasis](@)" -- "iff (if and only if) it is part of a [left-" -- "flanking delimiter run]." -- "2. A single `_` character [can open emphasis] iff" -- "it is part of a [left-flanking delimiter run]" +- "1." +- "A single `*` character [can open emphasis](@)" +- "iff (if and only if) it is part of a [" +- "left-flanking delimiter run]." +- "2." +- "A single `_`" +- " character [can open emphasis] iff\n it is part of a [" +- "left-flanking delimiter run]" - "and either (a) not part of a [right-flanking" - "delimiter run]\n or (b) part of a [" - "right-flanking delimiter run]\n preceded by a [" - "Unicode punctuation character]." -- "3. A single `*` character [can close emphasis](@)" +- "3." 
+- "A single `*` character [can close emphasis](@)" - "iff it is part of a [right-flanking delimiter run" - "]." -- "4. A single `_` character [can close emphasis] iff" -- "it is part of a [right-flanking delimiter run]" +- "4." +- "A single `_`" +- " character [can close emphasis] iff\n it is part of a [" +- "right-flanking delimiter run]" - "and either (a) not part of a [left-flanking" - "delimiter run]\n or (b) part of a [" - "left-flanking delimiter run]\n followed by a [" @@ -4538,8 +4553,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A double `**` [can open strong emphasis](@)" - "iff it is part of a [left-flanking delimiter run" - "]." -- "6. A double `__` [can open strong emphasis] iff" -- "it is part of a [left-flanking delimiter run]" +- "6." +- "A double `__`" +- " [can open strong emphasis] iff\n it is part of a [" +- "left-flanking delimiter run]" - "and either (a) not part of a [right-flanking" - "delimiter run]\n or (b) part of a [" - "right-flanking delimiter run]\n preceded by a [" @@ -4548,15 +4565,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A double `**` [can close strong emphasis](@)" - "iff it is part of a [right-flanking delimiter run" - "]." -- "8. A double `__` [can close strong emphasis] iff" -- "it is part of a [right-flanking delimiter run]" +- "8." +- "A double `__`" +- " [can close strong emphasis] iff\n it is part of a [" +- "right-flanking delimiter run]" - "and either (a) not part of a [left-flanking" - "delimiter run]\n or (b) part of a [" - "left-flanking delimiter run]\n followed by a [" - "Unicode punctuation character]." -- "9." -- "Emphasis begins with a delimiter that [can open emphasis] and" -- "ends\n with a delimiter that [can close emphasis]" +- "9. Emphasis begins with a delimiter that [can open emphasis" +- "] and ends\n with a delimiter that [can close emphasis]" - ", and that uses the same\n character (`_` or `*`" - ) as the opening delimiter. 
The - "opening and closing delimiters must belong to separate\n [delimiter runs" @@ -4564,10 +4582,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "open and close emphasis, then the sum of the lengths of the" - delimiter runs containing the opening and closing delimiters - "must not be a multiple of 3 unless both lengths are\n multiples of 3." -- 10. Strong emphasis begins with a delimiter that -- " [can open strong emphasis] and ends with a delimiter that\n [" -- "can close strong emphasis], and that uses the same character\n (`_`" -- "or `*`" +- "10. Strong emphasis begins with a delimiter that\n [can open strong emphasis" +- "] and ends with a delimiter that\n [can close strong emphasis]" +- ", and that uses the same character\n (`_` or `*`" - ) as the opening delimiter. The - "opening and closing delimiters must belong to separate\n [delimiter runs" - "]. If one of the delimiters can both open" @@ -4575,17 +4592,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the delimiter runs containing the opening and closing - delimiters must not be a multiple of 3 unless both lengths - are multiples of 3. -- "11. A literal `*` character cannot occur at the beginning or end of" +- "11." +- "A literal `*` character cannot occur at the beginning or end of" - "`*`-delimited emphasis or `**`-" - "delimited strong emphasis, unless it\n is backslash-escaped." -- "12. A literal `_` character cannot occur at the beginning or end of" +- "12." +- "A literal `_` character cannot occur at the beginning or end of" - "`_`-delimited emphasis or `__`-" - "delimited strong emphasis, unless it\n is backslash-escaped." - "Where rules 1--12 above are compatible with multiple parsings," - "the following principles resolve ambiguity:" -- 13. The number of nestings should be minimized. -- "Thus, for example,\n an interpretation" -- "`...` is always preferred to" +- "13." +- "The number of nestings should be minimized. 
Thus, for example," +- "an interpretation `...` is always" +- preferred to - "`...`." - "14." @@ -4626,9 +4646,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">" - "````````````````" - "````````````````" -- "This is not emphasis, because the opening `*` is followed by" -- "whitespace, and hence not part of a [left-flanking" -- "delimiter run]:" +- "This is not emphasis, because the opening `*`" +- " is followed by\nwhitespace, and hence not part of a [" +- "left-flanking delimiter run]:" - "````````````````" - "````````````````" - example @@ -4636,7 +4656,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    a * foo bar*

    " - "````````````````" - "````````````````" -- "This is not emphasis, because the opening `*` is preceded" +- "This is not emphasis, because the opening `*`" +- is preceded - "by an alphanumeric and followed by punctuation, and hence" - "not part of a [left-flanking delimiter run]:" - "````````````````" @@ -4690,8 +4711,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">" - "````````````````" - "````````````````" -- "This is not emphasis, because the opening `_` is followed by" -- "whitespace:" +- "This is not emphasis, because the opening `_`" +- " is followed by\nwhitespace:" - "````````````````" - "````````````````" - example @@ -4699,8 +4720,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    _ foo bar_

    " - "````````````````" - "````````````````" -- "This is not emphasis, because the opening `_` is preceded" -- "by an alphanumeric and followed by punctuation:" +- "This is not emphasis, because the opening `_`" +- " is preceded\nby an alphanumeric and followed by punctuation:" - "````````````````" - "````````````````" - example @@ -4734,7 +4755,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - тся_

    - "````````````````" - "````````````````" -- "Here `_` does not generate emphasis, because the first delimiter run" +- "Here `_`" +- "does not generate emphasis, because the first delimiter run" - "is right-flanking and the second left-flanking:" - "````````````````" - "````````````````" @@ -4764,8 +4786,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "_foo*\n.\n

    _foo*

    " - "````````````````" - "````````````````" -- "This is not emphasis, because the closing `*` is preceded by" -- "whitespace:" +- "This is not emphasis, because the closing `*`" +- " is preceded by\nwhitespace:" - "````````````````" - "````````````````" - example @@ -4781,8 +4803,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*

    " - "````````````````" - "````````````````" -- "This is not emphasis, because the second `*` is" -- preceded by punctuation and followed by an alphanumeric +- "This is not emphasis, because the second `*`" +- " is\npreceded by punctuation and followed by an alphanumeric" - "(hence it is not part of a [right-flanking delimiter" - "run]:" - "````````````````" @@ -4811,8 +4833,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "Rule 4:" -- "This is not emphasis, because the closing `_` is preceded by" -- "whitespace:" +- "This is not emphasis, because the closing `_`" +- " is preceded by\nwhitespace:" - "````````````````" - "````````````````" - example @@ -4820,8 +4842,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    _foo bar _

    " - "````````````````" - "````````````````" -- "This is not emphasis, because the second `_` is" -- "preceded by punctuation and followed by an alphanumeric:" +- "This is not emphasis, because the second `_`" +- " is\npreceded by punctuation and followed by an alphanumeric:" - "````````````````" - "````````````````" - example @@ -4893,7 +4915,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    ** foo bar**

    " - "````````````````" - "````````````````" -- "This is not strong emphasis, because the opening `**` is preceded" +- "This is not strong emphasis, because the opening `**`" +- is preceded - "by an alphanumeric and followed by punctuation, and hence" - "not part of a [left-flanking delimiter run]:" - "````````````````" @@ -4939,8 +4962,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - foo bar__

    - "````````````````" - "````````````````" -- "This is not strong emphasis, because the opening `__` is preceded" -- "by an alphanumeric and followed by punctuation:" +- "This is not strong emphasis, because the opening `__`" +- " is preceded\nby an alphanumeric and followed by punctuation:" - "````````````````" - "````````````````" - example @@ -5005,8 +5028,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "(Nor can it be interpreted as an emphasized `*foo bar *`" - ", because of\nRule 11.)" -- "This is not strong emphasis, because the second `**` is" -- "preceded by punctuation and followed by an alphanumeric:" +- "This is not strong emphasis, because the second `**`" +- " is\npreceded by punctuation and followed by an alphanumeric:" - "````````````````" - "````````````````" - example @@ -5064,8 +5087,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    __foo bar __

    " - "````````````````" - "````````````````" -- "This is not strong emphasis, because the second `__` is" -- "preceded by punctuation and followed by an alphanumeric:" +- "This is not strong emphasis, because the second `__`" +- " is\npreceded by punctuation and followed by an alphanumeric:" - "````````````````" - "````````````````" - example @@ -5191,8 +5214,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` markdown" - "

    foobar" - "baz\n```" -- is precluded by the condition that a delimiter that +- ">\n```\n\n\nis precluded by the condition that a delimiter that" - "can both open and close (like the `*` after `foo`" - ")\ncannot form emphasis if the sum of the lengths of" - the delimiter runs containing the opening and @@ -5234,8 +5256,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    " - "````````````````" - "````````````````" -- When the lengths of the interior closing and opening -- delimiter runs are *both* +- "When the lengths of the interior closing and opening\ndelimiter runs are" +- "*both*" - " multiples of 3, though,\nthey can match to create emphasis:" - "````````````````" - "````````````````" @@ -5807,14 +5829,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[inline links] the" - "destination and title are given immediately after the link text. In\n[reference links]" - " the destination and title are defined elsewhere in\nthe document." -- "A [link text](@) consists of a sequence of zero or more" -- "inline elements enclosed by square brackets (`[` and `]`)" -- ". The\nfollowing rules apply:" +- "A [link text](@)" +- " consists of a sequence of zero or more\ninline elements enclosed by square brackets (" +- "`[` and `]`). The\nfollowing rules apply:" - "- Links may not contain other links, at any level of nesting. If" - multiple otherwise valid link definitions appear nested inside each - "other, the inner-most definition is used." -- "- Brackets are allowed in the [link text] only if (a" -- ) they +- "- Brackets are allowed in the [link text]" +- only if (a) they - are backslash-escaped or (b) they appear as a matched pair of - "brackets,\n with an open bracket `[`" - ", a sequence of zero or more inlines, and\n a close bracket" @@ -5824,13 +5846,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "than the brackets in link text. Thus, for example," - "`` [foo`]` `` could not be a link text" - ", since the second `]`\n is part of a code span." -- "- The brackets in link text bind more tightly than markers for" -- "[emphasis and strong emphasis]. Thus, for example," +- "- The brackets in link text bind more tightly than markers for\n [" +- "emphasis and strong emphasis]. Thus, for example," - "`*[foo*](url)` is a link." 
- "A [link destination](@) consists of either" -- "- a sequence of zero or more characters between an opening `<` and a" -- " closing `>` that contains no line endings or unescaped\n `<`" -- "or `>` characters, or" +- "-" +- "a sequence of zero or more characters between an opening `<` and a\n closing" +- "`>` that contains no line endings or unescaped\n `<` or" +- "`>` characters, or" - "- a nonempty sequence of characters that does not start with `<`" - ",\n does not include [ASCII control characters][" - "ASCII control character]\n or [space]" @@ -5840,17 +5863,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - (Implementations may impose limits on parentheses nesting to - "avoid performance issues, but at least three levels of nesting\n should be supported.)" - "A [link title](@) consists of either" -- "- a sequence of zero or more characters between straight double-quote" -- "characters (`\"`), including a `\"` character only if it" -- "is\n backslash-escaped, or" -- "- a sequence of zero or more characters between straight single-quote" -- "characters (`'`), including a `'` character only if it" -- "is\n backslash-escaped, or" -- "- a sequence of zero or more characters between matching parentheses" -- "(`(...)`), including a `(` or" -- "`)` character only if it is\n backslash-escaped." -- "Although [link titles] may span multiple lines, they may not contain" -- "a [blank line]." +- "-" +- "a sequence of zero or more characters between straight double-quote\n characters (" +- "`\"`), including a `\"`" +- " character only if it is\n backslash-escaped, or" +- "-" +- "a sequence of zero or more characters between straight single-quote\n characters (" +- "`'`), including a `'`" +- " character only if it is\n backslash-escaped, or" +- "-" +- "a sequence of zero or more characters between matching parentheses\n (" +- "`(...)`), including a `(` or `" +- ")` character only if it is\n backslash-escaped." 
+- "Although [link titles] may span multiple lines, they may not contain\na [" +- "blank line]." - "An [inline link](@) consists of a [link text]" - "followed immediately\nby a left parenthesis `(`" - ", an optional [link destination], an optional\n[link title]" @@ -5860,9 +5886,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "line\nending.\nIf both [link destination] and [link title]" - "are present, they *must*" - " be\nseparated by spaces, tabs, and up to one line ending." -- "The link's text consists of the inlines contained" -- "in the [link text] (excluding the enclosing square brackets)" -- ".\nThe link'" +- "The link's text consists of the inlines contained\nin the [link text" +- "] (excluding the enclosing square brackets).\nThe link'" - "s URI consists of the link destination, excluding enclosing" - "`<...>` if present, with backslash-escapes in" - "effect as described\nabove. The link'" @@ -6290,8 +6315,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - foo *bar

    - "````````````````" - "````````````````" -- "Note that brackets that *aren't* part of links do not take" -- "precedence:" +- "Note that brackets that *aren't*" +- " part of links do not take\nprecedence:" - "````````````````" - "````````````````" - example @@ -6367,8 +6392,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\"title\">foo

    " - "````````````````" - "````````````````" -- "The rules for the [link text] are the same as with" -- "[inline links]. Thus:" +- "The rules for the [link text] are the same as with\n[" +- "inline links]. Thus:" - "The link text may contain balanced brackets, but not unbalanced ones," - "unless they are escaped:" - "````````````````" @@ -6551,8 +6576,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "a single reference link, not two shortcut reference links, as\nintended:" - "``` markdown\n[foo]\n[bar]" - "[foo]: /url1" -- "[bar]: /url2\n```" -- "(Note that [shortcut reference links] were introduced by Gruber" +- "[bar]: /url2\n```\n\n(Note that [" +- "shortcut reference links] were introduced by Gruber" - "himself in a beta version of `Markdown.pl`, but never" - "included\nin the official syntax description. Without shortcut reference" - "links, it is harmless to allow space between the link text and" @@ -7066,8 +7091,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "=\"Foo\" title=\"title\" />

    " - "````````````````" - "````````````````" -- "If you just want a literal `!" -- "` followed by bracketed text, you can\nbackslash-escape the opening" +- "If you just want a literal `!`" +- " followed by bracketed text, you can\nbackslash-escape the opening" - "`[`:" - "````````````````" - "````````````````" @@ -7077,8 +7102,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    ![foo]

    " - "````````````````" - "````````````````" -- "If you want a link after a literal `!" -- "`, backslash-escape the\n`!`:" +- "If you want a link after a literal `!`" +- ", backslash-escape the\n`!`:" - "````````````````" - "````````````````" - example @@ -7098,9 +7123,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ". It is parsed as" - "a link to the URI, with the URI as the link's" - label. -- "An [absolute URI](@)," -- "for these purposes, consists of a [scheme] followed by a colon (" -- "`:`" +- "An [absolute URI](@)" +- ",\nfor these purposes, consists of a [scheme]" +- "followed by a colon (`:`" - ")\nfollowed by zero or more characters other than [ASCII control\ncharacters]" - "[ASCII control character], [space], `<`," - "and `>`" @@ -7216,18 +7241,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - p> - "````````````````" - "````````````````" -- "An [email autolink](@)" -- "consists of `<`, followed by an [email address],\nfollowed by" -- "`>`" +- "An [email autolink](@)\nconsists of `<`" +- ", followed by an [email address],\nfollowed by `>`" - ". The link's label is the email address,\nand the URL is" - "`mailto:` followed by the email address." -- "An [email address](@)," -- "for these purposes, is anything that matches\nthe" -- "[non-normative regex from the HTML5" -- "spec](https://" -- html.spec.whatwg.org/multipage/ -- "forms.html#e-mail-state-(type=email))" -- ":" +- "An [email address](@)" +- ",\nfor these purposes, is anything that matches\nthe" +- "[non-normative regex from the HTML5\nspec" +- "](https://html.spec.whatwg.org" +- "/multipage/forms.html#e-mail-state-(type" +- "=email)):" - "/^[a-zA-Z0-9.!" - "#$%&'*+/=?" 
- "^_`{|}~-]+@[a-zA" @@ -7320,16 +7343,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Tag and attribute names are not limited to current HTML tags," - "so custom tags (and even, say, DocBook tags) may be" - "used.\n\nHere is the grammar for tags:" -- "A [tag name](@) consists of an ASCII letter" +- "A [tag name](@)" +- consists of an ASCII letter - "followed by zero or more ASCII letters, digits, or" - "hyphens (`-`)." - "An [attribute](@) consists of spaces, tabs, and up" - "to one line ending,\nan [attribute name], and an optional\n[" - "attribute value specification]." -- "An [attribute name](@)" -- "consists of an ASCII letter, `_`, or `:`" -- ", followed by zero or more ASCII\nletters, digits, `_`" -- ", `.`, `:`, or `-`" +- "An [attribute name](@)\nconsists of an ASCII letter," +- "`_`, or `:`, followed by zero or more" +- "ASCII\nletters, digits, `_`, `.`," +- "`:`, or `-`" - ". (Note: This is the XML" - specification restricted to ASCII. - HTML5 is laxer.) @@ -7346,12 +7370,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "including spaces, tabs, line endings, `\"`, `'`" - ", `=`, `<`, `>`, or ``" - "` ``." -- "A [single-quoted attribute value](@)" -- "consists of `'`, zero or more\ncharacters not including `'`" -- ", and a final `'`." -- "A [double-quoted attribute value](@)" -- "consists of `\"`, zero or more\ncharacters not including `\"`" -- ", and a final `\"`." +- "A [single-quoted attribute value](@)\nconsists of `'`" +- ", zero or more\ncharacters not including `'`, and a final" +- "`'`." +- "A [double-quoted attribute value](@)\nconsists of `\"`" +- ", zero or more\ncharacters not including `\"`, and a final" +- "`\"`." 
- "An [open tag](@) consists of a `<` character," - "a [tag name],\nzero or more [attributes]" - ", optional spaces, tabs, and up to one line ending,\nan optional" @@ -7367,14 +7391,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[HTML spec](https://" - html.spec.whatwg.org/multipage/ - "parsing.html#markup-declaration-open-state))." -- "A [processing instruction](@)\nconsists of the string ``" +- "A [processing instruction](@)\nconsists of the string ``" - ", and the string\n`?>`." -- "A [declaration](@) consists of the string ``, and the character `>`." -- "A [CDATA section](@) consists of" -- "the string ``" - ", and the string `]]>`." - "An [HTML tag](@) consists of an [open tag" @@ -7589,8 +7613,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - baz

    - "````````````````" - "````````````````" -- "For a more visible alternative, a backslash before the" -- "[line ending] may be used instead of two or more spaces:" +- "For a more visible alternative, a backslash before the\n[line ending]" +- "may be used instead of two or more spaces:" - "````````````````" - "````````````````" - example @@ -7789,8 +7813,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Each line that is processed has an effect on this tree. The line is - "analyzed and, depending on its contents, the document may be altered" - "in one or more of the following ways:" -- "1. One or more open blocks may be closed.\n2." -- "One or more new blocks may be created as children of the\n last open block." +- 1. One or more open blocks may be closed. +- 2. One or more new blocks may be created as children of the +- last open block. - 3. Text may be added to the last (deepest) open block remaining - on the tree. - "Once a line has been incorporated into the tree in this way," @@ -7832,16 +7857,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` tree\n-> document\n```" - "The first line of our text," - "``` markdown\n> Lorem ipsum dolor\n```" -- "causes a `block_quote` block to be created as a child of our" -- "open `document` block, and a `paragraph`" -- " block as a child of\nthe `block_quote`" +- "causes a `block_quote`" +- " block to be created as a child of our\nopen `document`" +- " block, and a `paragraph` block as a child of\nthe" +- "`block_quote`" - ". 
Then the text is added to the last open\nblock, the `paragraph`" - ":" - "``` tree\n-> document\n -> block_quote\n -> paragraph" - " \"Lorem ipsum dolor\"\n```\n\nThe next line," - "``` markdown\nsit amet.\n```" -- "is a \"lazy continuation\" of the open `paragraph`, so it gets" -- "added\nto the paragraph's text:" +- "is a \"lazy continuation\" of the open `paragraph`" +- ", so it gets added\nto the paragraph's text:" - "``` tree\n-> document\n -> block_quote\n -> paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - "```\n\nThe third line," @@ -7860,8 +7886,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\"Qui *quodsi iracundia*\"" - "```\n\nThe fourth line," - "``` markdown\n> - aliquando id\n```" -- "causes the `list_item` (and its child the `paragraph`)" -- "to be closed,\nand a new `list_item`" +- "causes the `list_item` (and its child the `paragraph`" +- ") to be closed,\nand a new `list_item`" - "opened up as child of the `list`. A `paragraph`" - "is added as a child of the new `list_item`" - ", to contain the text.\nWe thus obtain the final tree:" @@ -7884,10 +7910,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - list (type=bullet tight=true bullet_char=-) - " list_item\n paragraph\n str \"Qui \"\n emph" - " str \"quodsi iracundia\"\n list_item\n paragraph" -- " str \"aliquando id\"\n```" -- "Notice how the [line ending] in the first paragraph has" -- "been parsed as a `softbreak`, and the asterisks" -- "in the first list item\nhave become an `emph`." +- " str \"aliquando id\"\n```\n\nNotice how the" +- "[line ending] in the first paragraph has\nbeen parsed as a" +- "`softbreak`" +- ", and the asterisks in the first list item\nhave become an" +- "`emph`." - "### An algorithm for parsing nested emphasis and links" - "By far the trickiest part of inline parsing is handling emphasis," - "strong emphasis, links, and images. This is done using the following\nalgorithm." 
@@ -7910,8 +7937,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "When we hit a `]` character, we call the *look for link" - "or image*\nprocedure (see below)." - "When we hit the end of the input, we call the *process emphasis*" -- "procedure (see below), with `stack_bottom` = NULL" -- "." +- "procedure (see below), with `stack_bottom`" +- "= NULL." - "#### *look for link or image*" - "Starting at the top of the delimiter stack, we look backwards" - "through the stack for an opening `[` or `![`" @@ -7926,8 +7953,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - to see if - "we have an inline link/image, reference link/image, collapsed reference" - "link/image, or shortcut reference link/image." -- "+ If we don't, then we remove the opening delimiter from" -- "the\n delimiter stack and return a literal text node `]`." +- "+ If we don'" +- "t, then we remove the opening delimiter from the" +- "delimiter stack and return a literal text node `]`." - "+ If we do, then" - "* We return a link or image node whose children are the inlines" - after the text node pointed to by the opening delimiter. @@ -7938,17 +7966,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "all\n `[` delimiters before the opening delimiter to" - "*inactive*. (This\n will prevent us from getting links within links.)" - "#### *process emphasis*" -- "Parameter `stack_bottom` sets a lower bound to how far we" -- "descend in the [delimiter stack]" -- ". If it is NULL, we can" +- "Parameter `stack_bottom`" +- " sets a lower bound to how far we\ndescend in the [delimiter stack" +- "]. If it is NULL, we can" - "go all the way to the bottom. Otherwise, we stop before\nvisiting" - "`stack_bottom`." - "Let `current_position` point to the element on the [delimiter" - "stack]\njust above `stack_bottom` (or the first element if" - "`stack_bottom`\nis NULL)." 
-- "We keep track of the `openers_bottom` for each delimiter" -- "type (`*`, `_`), indexed to the length" -- of the closing delimiter run +- "We keep track of the `openers_bottom`" +- " for each delimiter\ntype (`*`, `_`" +- "), indexed to the length of the closing delimiter run" - (modulo 3) and to whether the closing delimiter can also - "be an\nopener. Initialize this to `stack_bottom`." - "Then we repeat the following until we run out of potential\nclosers:" @@ -7957,8 +7985,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "or `_`" - ".\n (This will be the potential closer closest\n to the beginning of the input" - "-- the first one in parse order.)" -- "- Now, look back in the stack (staying above `stack_bottom`" -- "and\n the `openers_bottom`" +- "-" +- "Now, look back in the stack (staying above `stack_bottom`" +- " and\n the `openers_bottom`" - " for this delimiter type) for the\n first matching potential opener (\"matching" - "\" means same delimiter).\n\n- If one is found:" - "+ Figure out whether we have emphasis or strong emphasis:" @@ -7968,18 +7997,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the text node corresponding to the opener. - + Remove any delimiters between the opener and closer from - the delimiter stack. -- + Remove 1 (for regular emph) or 2 (for strong +- + +- Remove 1 (for regular emph) or 2 (for strong - emph) delimiters - from the opening and closing text nodes. If they become empty - "as a result, remove them and remove the corresponding element" - "of the delimiter stack. If the closing node is removed, reset" - "`current_position` to the next element in the stack." - "- If none is found:" -- "+ Set `openers_bottom` to the element before `current_position" -- "`." +- + +- "Set `openers_bottom` to the element before `current_position`" +- "." 
- (We know that there are no openers for this kind of closer up to - "and\n including this point, so this puts a lower bound on future searches.)" -- "+ If the closer at `current_position` is not a potential opener," +- + +- "If the closer at `current_position`" +- "is not a potential opener," - "remove it from the delimiter stack (since we know it can't" - be a closer either). - "+ Advance `current_position` to the next element in the stack." diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap index ef6c06d..1a85b9e 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap @@ -77,9 +77,9 @@ input_file: tests/inputs/markdown/github_flavored.md - "+ Very easy!\n```" - 1. First ordered list item - "2. Another item\n⋅⋅* Unordered sub-list." -- "1." -- "Actual numbers don't matter, just that it's a number" -- "⋅⋅1. Ordered sub-list\n4. And another item." +- "1. Actual numbers don't matter, just that it'" +- "s a number\n⋅⋅1. Ordered sub-list" +- 4. And another item. - ⋅⋅⋅You can have properly indented paragraphs within list items - "." - "Notice the blank line above, and the leading spaces (at least one," @@ -204,7 +204,8 @@ input_file: tests/inputs/markdown/github_flavored.md - octodex.github.com/images/ - "dojocat.jpg \"The Dojocat\"\n```" - "Here's our logo (hover to see the title text):" -- "Inline-style:\n![" +- "Inline-style:" +- "![" - "alt text](https://github.com/" - adam-p/markdown-here/raw/master/src - "/common/images/icon48.png \"Logo Title Text 1" @@ -396,12 +397,13 @@ input_file: tests/inputs/markdown/github_flavored.md - s keep writing to make sure this is long enough to actually wrap for everyone. - "Oh, you can *put* **Markdown** into a" - blockquote. 
-- "> Blockquotes can also be nested..." -- ">> ...by using additional greater-than signs right next to each" -- "other...\n> > > ...or with spaces between arrows." +- "> Blockquotes can also be nested...\n>" +- "> ...by using additional greater-than signs right next to each other" +- "...\n> > > ...or with spaces between arrows." - "------" - "# Inline HTML" -- "```\n
    " +- "```" +- "
    " - "
    Definition list
    " - "
    Is something people use sometimes.
    " - "
    Markdown in HTML
    " @@ -416,8 +418,9 @@ input_file: tests/inputs/markdown/github_flavored.md - Use HTML tags.\n
    \n\n------" - "# Horizontal Rules" -- "```\nThree or more...\n\n---\n\nHyphens" -- "***\n\nAsterisks\n\n___\n\nUnderscores\n```" +- "```" +- "Three or more...\n\n---\n\nHyphens\n\n***" +- "Asterisks\n\n___\n\nUnderscores\n```" - "Three or more...\n\n---\n\nHyphens\n\n***" - "Asterisks\n\n___\n\nUnderscores\n\n------" - "# YouTube Videos" @@ -444,7 +447,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "jpg\" alt=\"IMAGE ALT TEXT" - "HERE\" width=\"240\" height=\"180\" border=" - "\"10\">\n" -- "```\n[![" +- "```" +- "[![" - "IMAGE ALT TEXT HERE](http:/" - /img.youtube.com/vi/ - YOUTUBE_VIDEO_ID_HERE/0. @@ -452,11 +456,10 @@ input_file: tests/inputs/markdown/github_flavored.md - www.youtube.com/watch? - v=YOUTUBE_VIDEO_ID_HERE) - "```" -- "[![" -- "IMAGE ALT TEXT HERE](https:/" -- /upload.wikimedia.org/wikipedia/ -- commons/thumb/e/ef/YouTube_logo_2015. -- svg/1200px-YouTube_logo_2015. -- "svg.png)](https://" -- www.youtube.com/watch? +- "[![IMAGE ALT TEXT HERE" +- "](https://upload.wikimedia.org/" +- wikipedia/commons/thumb/e/ef/ +- YouTube_logo_2015.svg/1200px- +- "YouTube_logo_2015.svg.png)](https" +- "://www.youtube.com/watch?" - v=ciawICBvQoE) diff --git a/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md-2.snap b/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md-2.snap index 0987525..b5aae28 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md-2.snap @@ -12,8 +12,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ") for converting Markdown to\nHTML. In the next decade, dozens of implementations were\ndeveloped in many languages. Some extended the original\nMarkdown syntax with conventions for footnotes, tables, and\n" - "other document elements. Some allowed Markdown documents to be\nrendered in formats other than HTML. 
Websites like Reddit,\nStackOverflow, and GitHub had millions of people using Markdown.\nAnd Markdown started to be used beyond the web, to author books,\n" - "articles, slide shows, letters, and lecture notes.\n\nWhat distinguishes Markdown from many other lightweight markup\nsyntaxes, which are often easier to write, is its readability.\nAs Gruber writes:\n\n" -- "> The overriding design goal for Markdown's formatting syntax is\n> to make it as readable as possible. The idea is that a\n> Markdown-formatted document should be publishable as-is, as\n> plain text, without looking like it's been marked up with tags\n" -- "> or formatting instructions.\n> ()\n\n" +- "> The overriding design goal for Markdown's formatting syntax is\n> to make it as readable as possible. The idea is that a\n> Markdown-formatted document should be publishable as-is, as\n> plain text, without looking like it's been marked up with tags\n> " +- "or formatting instructions.\n> ()\n\n" - "The point can be illustrated by comparing a sample of\n[AsciiDoc](https://asciidoc.org/) with\nan equivalent sample of Markdown. Here is a sample of\nAsciiDoc from the AsciiDoc manual:\n" - "\n```\n1. List item one.\n+\nList item one continued with a second paragraph followed by an\nIndented block.\n+\n.................\n$ ls *.sh\n$ mv *.sh ~/tmp\n.................\n+\nList item continued with a third paragraph.\n\n2. " - "List item two continued with an open block.\n+\n--\nThis paragraph is part of the preceding list item.\n\na. This list is nested and does not require explicit item\ncontinuation.\n+\nThis paragraph is part of the preceding list item.\n\nb. List item b.\n\n" @@ -23,20 +23,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " This paragraph belongs to item two of the outer list.\n```\n\n" - "The AsciiDoc version is, arguably, easier to write. You don't need\nto worry about indentation. But the Markdown version is much easier\nto read. 
The nesting of list items is apparent to the eye in the\nsource, not just in the processed document.\n\n" - "## Why is a spec needed?\n\nJohn Gruber's [canonical description of Markdown's\nsyntax](https://daringfireball.net/projects/markdown/syntax)\ndoes not specify the syntax unambiguously. Here are some examples of\nquestions it does not answer:\n\n" -- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n" -- " they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See " +- "1. " +- "How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but " +- "`Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See " - "[this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)\n\n" -- "2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n" -- " also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken " +- "2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. 
However,\n this can lead to unexpected results in hard-wrapped text, and\n " +- "also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken " - "[in favor of requiring the blank\n lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n" - "3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```\n\n" -- "4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n" -- " Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```" +- "4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```" +- "\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```" - "\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)\n\n" - "5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n" - "6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```\n\n" -- "7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n" -- " - fum\n ```\n\n" +- "7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)" +- "\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n" - "8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```\n\n" - "9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n" - "10. What are the precedence rules between block-level and inline-level\n structure? 
For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```\n\n" @@ -47,15 +48,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nBecause there is no unambiguous spec, implementations have diverged\nconsiderably. As a result, users are often surprised to find that\na document that renders one way on one system (say, a GitHub wiki)\n" - "renders differently on another (say, converting to docbook using\npandoc). To make matters worse, because nothing in Markdown counts\nas a \"syntax error,\" the divergence often isn't discovered right away.\n\n" - "## About this document\n\n" -- "This document attempts to specify Markdown syntax unambiguously.\nIt contains many examples with side-by-side Markdown and\nHTML. These are intended to double as conformance tests. An\naccompanying script `spec_tests.py` can be used to run the tests\n" -- "against any Markdown program:\n\n python test/spec_tests.py --spec spec.txt --program PROGRAM\n" +- "This document attempts to specify Markdown syntax unambiguously.\nIt contains many examples with side-by-side Markdown and\nHTML. These are intended to double as conformance tests. An\naccompanying script `spec_tests.py`" +- " can be used to run the tests\nagainst any Markdown program:\n\n python test/spec_tests.py --spec spec.txt --program PROGRAM\n" - "\nSince this document describes how Markdown is to be parsed into\nan abstract syntax tree, it would have made sense to use an abstract\nrepresentation of the syntax tree instead of HTML. But HTML is capable\n" - "of representing the structural distinctions we need to make, and the\nchoice of HTML for the tests makes it possible to run the tests against\nan implementation without writing an abstract syntax tree renderer.\n\n" - "Note that not every feature of the HTML samples is mandated by\nthe spec. 
For example, the spec says what counts as a link\ndestination, but it doesn't mandate that non-ASCII characters in\nthe URL be percent-encoded. To use the automatic tests,\n" - "implementers will need to provide a renderer that conforms to\nthe expectations of the spec examples (percent-encoding\nnon-ASCII characters in URLs). But a conforming implementation\ncan use a different renderer and may choose not to\n" - "percent-encode non-ASCII characters in URLs.\n\n" -- "This document is generated from a text file, `spec.txt`, written\nin Markdown with a small extension for the side-by-side tests.\nThe script `tools/makespec.py` can be used to convert `spec.txt` into\n" -- "HTML or CommonMark (which can then be converted into other formats).\n\nIn the examples, the `→` character is used to represent tabs.\n\n" +- "This document is generated from a text file, `spec.txt`, written\nin Markdown with a small extension for the side-by-side tests.\nThe script `tools/makespec.py` can be used to convert `spec.txt`" +- " into\nHTML or CommonMark (which can then be converted into other formats).\n\nIn the examples, the `→` character is used to represent tabs.\n\n" - "# Preliminaries\n\n" - "## Characters and lines\n\nAny sequence of [characters] is a valid CommonMark\ndocument.\n" - "\nA [character](@) is a Unicode code point. 
Although some\ncode points (for example, combining accents) do not correspond to\ncharacters in an intuitive sense, all code points count as characters\nfor purposes of this spec.\n" @@ -65,8 +66,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nA line containing no characters, or a line containing only spaces\n(`U+0020`) or tabs (`U+0009`), is called a [blank line](@).\n\nThe following definitions of character classes will be used in this spec:\n" - "\nA [Unicode whitespace character](@) is a character in the Unicode `Zs` general\ncategory, or a tab (`U+0009`), line feed (`U+000A`), form feed (`U+000C`), or\ncarriage return (`U+000D`).\n" - "\n[Unicode whitespace](@) is a sequence of one or more\n[Unicode whitespace characters].\n\nA [tab](@) is `U+0009`.\n\nA [space](@) is `U+0020`.\n\nAn [ASCII control character](@) is a character between `U+0000–1F` (both\nincluding) or `U+007F`.\n" -- "\nAn [ASCII punctuation character](@)\nis `!`, `\"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`,\n`*`, `+`, `,`, `-`, `.`, `/` (U+0021–2F), \n`:`, `;`, `<`, `=`, `>`, `?`, `@` (U+003A–0040),\n`[`, `\\`, `]`, `^`, `_`, `` ` `` (U+005B–0060), \n" -- "`{`, `|`, `}`, or `~` (U+007B–007E).\n\nA [Unicode punctuation character](@) is a character in the Unicode `P`\n(puncuation) or `S` (symbol) general categories.\n\n" +- "\nAn [ASCII punctuation character](@)\nis `!`, `\"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`,\n`*`, `+`, `,`, `-`, `.`, `/` (U+0021–2F), \n`:`, `;`, `<`, `=`, `>`, `?`, `@` (U+003A–0040),\n`[`, `\\`, `]`, `^`, `_`, `` ` `` (U+005B–0060), \n`{`, `|`, `}`, or `~`" +- " (U+007B–007E).\n\nA [Unicode punctuation character](@) is a character in the Unicode `P`\n(puncuation) or `S` (symbol) general categories.\n\n" - "## Tabs\n\nTabs in lines are not expanded to [spaces]. 
However,\nin contexts where spaces help to define block structure,\ntabs behave as if they were replaced by spaces with a tab stop\nof 4 characters.\n" - "\nThus, for example, a tab can be used instead of four spaces\nin an indented code block. (Note, however, that internal\ntabs are passed through as literal tabs, not expanded to\nspaces.)\n" - "\n```````````````````````````````` example\n→foo→baz→→bim\n.\n

    foo→baz→→bim\n
    \n````````````````````````````````" @@ -75,8 +76,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\nIn the following example, a continuation paragraph of a list\nitem is indented with a tab; this has exactly the same effect\nas indentation with four spaces would:\n" - "\n```````````````````````````````` example\n - foo\n\n→bar\n.\n
      \n
    • \n

      foo

      \n

      bar

      \n
    • \n
    \n````````````````````````````````" - "\n\n```````````````````````````````` example\n- foo\n\n→→bar\n.\n
      \n
    • \n

      foo

      \n
        bar\n
      \n
    • \n
    \n````````````````````````````````" -- "\n\nNormally the `>` that begins a block quote may be followed\noptionally by a space, which is not considered part of the\ncontent. In the following case `>` is followed by a tab,\nwhich is treated as if it were expanded into three spaces.\n" -- "Since one of these spaces is considered part of the\ndelimiter, `foo` is considered to be indented six spaces\ninside the block quote context, so we get an indented\ncode block starting with two spaces.\n\n" +- "\n\nNormally the `>` that begins a block quote may be followed\noptionally by a space, which is not considered part of the\ncontent. In the following case `>`" +- " is followed by a tab,\nwhich is treated as if it were expanded into three spaces.\nSince one of these spaces is considered part of the\ndelimiter, `foo`" +- " is considered to be indented six spaces\ninside the block quote context, so we get an indented\ncode block starting with two spaces.\n\n" - "```````````````````````````````` example\n>→→foo\n.\n
    \n
      foo\n
    \n
    \n````````````````````````````````" - "\n\n```````````````````````````````` example\n-→→foo\n.\n
      \n
    • \n
        foo\n
      \n
    • \n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n foo\n→bar\n.\n
    foo\nbar\n
    \n````````````````````````````````" @@ -99,15 +101,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n[foo]\n\n[foo]: /bar\\* \"ti\\*tle\"\n.\n

    foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n``` foo\\+bar\nfoo\n```\n.\n
    foo\n
    \n````````````````````````````````\n\n\n" - "## Entity and numeric character references\n\nValid HTML entity references and numeric character references\ncan be used in place of the corresponding Unicode character,\nwith the following exceptions:\n\n" -- "- Entity and character references are not recognized in code\n blocks and code spans.\n\n- Entity and character references cannot stand in place of\n special characters that define structural elements in\n CommonMark. " -- "For example, although `*` can be used\n in place of a literal `*` character, `*` cannot replace\n `*` in emphasis delimiters, bullet list markers, or thematic\n breaks.\n\n" -- "Conforming CommonMark parsers need not store information about\nwhether a particular character was represented in the source\nusing a Unicode character or an entity reference.\n" +- "- Entity and character references are not recognized in code\n blocks and code spans.\n\n" +- "- " +- "Entity and character references cannot stand in place of\n special characters that define structural elements in\n CommonMark. For example, although `*` can be used\n in place of a literal `*` character, `*` cannot replace\n `*`" +- " in emphasis delimiters, bullet list markers, or thematic\n breaks.\n\nConforming CommonMark parsers need not store information about\nwhether a particular character was represented in the source\nusing a Unicode character or an entity reference.\n" - "\n[Entity references](@) consist of `&` + any of the valid\nHTML5 entity names + `;`. The\ndocument \nis used as an authoritative source for the valid entity\nreferences and their corresponding code points.\n" - "\n```````````````````````````````` example\n  & © Æ Ď\n¾ ℋ ⅆ\n∲ ≧̸\n.\n

      & © Æ Ď\n¾ ℋ ⅆ\n∲ ≧̸

    \n````````````````````````````````" -- "\n\n\n[Decimal numeric character\nreferences](@)\nconsist of `&#` + a string of 1--7 arabic digits + `;`. A\nnumeric character reference is parsed as the corresponding\nUnicode character. Invalid Unicode code points will be replaced by\n" -- "the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons,\nthe code point `U+0000` will also be replaced by `U+FFFD`.\n\n```````````````````````````````` example\n# Ӓ Ϡ �\n.\n

    # Ӓ Ϡ �

    \n````````````````````````````````" -- "\n\n\n[Hexadecimal numeric character\nreferences](@) consist of `&#` +\neither `X` or `x` + a string of 1-6 hexadecimal digits + `;`.\nThey too are parsed as the corresponding Unicode character (this\n" -- "time specified with a hexadecimal numeral instead of decimal).\n\n```````````````````````````````` example\n" ആ ಫ\n.\n

    " ആ ಫ

    \n````````````````````````````````\n\n\nHere are some nonentities:\n" +- "\n\n\n[Decimal numeric character\nreferences](@)\nconsist of `&#` + a string of 1--7 arabic digits + `;`. A\nnumeric character reference is parsed as the corresponding\nUnicode character. Invalid Unicode code points will be replaced by\nthe REPLACEMENT CHARACTER (" +- "`U+FFFD`). For security reasons,\nthe code point `U+0000` will also be replaced by `U+FFFD`.\n\n```````````````````````````````` example\n# Ӓ Ϡ �\n.\n

    # Ӓ Ϡ �

    \n````````````````````````````````" +- "\n\n\n[Hexadecimal numeric character\nreferences](@) consist of `&#` +\neither `X` or `x` + a string of 1-6 hexadecimal digits + `;`" +- ".\nThey too are parsed as the corresponding Unicode character (this\ntime specified with a hexadecimal numeral instead of decimal).\n\n```````````````````````````````` example\n" ആ ಫ\n.\n

    " ആ ಫ

    \n````````````````````````````````" +- "\n\n\nHere are some nonentities:\n" - "\n```````````````````````````````` example\n  &x; &#; &#x;\n�\n&#abcdef0;\n&ThisIsNotDefined; &hi?;\n.\n

    &nbsp &x; &#; &#x;\n&#87654321;\n&#abcdef0;\n&ThisIsNotDefined; &hi?;

    \n````````````````````````````````" - "\n\n\nAlthough HTML5 does accept some entity references\nwithout a trailing semicolon (such as `©`), these are not\nrecognized here, because it makes the grammar too ambiguous:\n" - "\n```````````````````````````````` example\n©\n.\n

    &copy

    \n````````````````````````````````\n\n\nStrings that are not on the list of HTML5 named entities are not\nrecognized as entity references either:\n" @@ -123,13 +127,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n```````````````````````````````` example\n* foo\n\n* foo\n.\n

    * foo

    \n
      \n
    • foo
    • \n
    \n````````````````````````````````\n\n```````````````````````````````` example\nfoo bar\n.\n

    foo\n\nbar

    \n````````````````````````````````" - "\n\n```````````````````````````````` example\n foo\n.\n

    →foo

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n[a](url "tit")\n.\n

    [a](url "tit")

    \n````````````````````````````````\n\n\n\n" - "# Blocks and inlines\n\n" -- "We can think of a document as a sequence of\n[blocks](@)---structural elements like paragraphs, block\nquotations, lists, headings, rules, and code blocks. Some blocks (like\nblock quotes and list items) contain other blocks; others (like\n" -- "headings and paragraphs) contain [inline](@) content---text,\nlinks, emphasized text, images, code spans, and so on.\n\n" +- "We can think of a document as a sequence of\n[blocks](@)" +- "---structural elements like paragraphs, block\nquotations, lists, headings, rules, and code blocks. Some blocks (like\nblock quotes and list items) contain other blocks; others (like\nheadings and paragraphs) contain [inline](@)" +- " content---text,\nlinks, emphasized text, images, code spans, and so on.\n\n" - "## Precedence\n\nIndicators of block structure always take precedence over indicators\nof inline structure. So, for example, the following is a list with\ntwo items, not a list with one item containing a code span:\n" - "\n```````````````````````````````` example\n- `one\n- two`\n.\n
      \n
    • `one
    • \n
    • two`
    • \n
    \n````````````````````````````````" -- "\n\n\nThis means that parsing can proceed in two steps: first, the block\nstructure of the document can be discerned; second, text lines inside\nparagraphs, headings, and other block constructs can be parsed for inline\nstructure. " -- "The second step requires information about link reference\ndefinitions that will be available only at the end of the first\nstep. Note that the first step requires processing lines in sequence,\nbut the second can be parallelized, since the inline parsing of" -- "\none block element does not affect the inline parsing of any other.\n\n" +- "\n\n\nThis means that parsing can proceed in two steps: first, the block\nstructure of the document can be discerned; second, text lines inside\nparagraphs, headings, and other block constructs can be parsed for inline\n" +- "structure. The second step requires information about link reference\ndefinitions that will be available only at the end of the first\nstep. Note that the first step requires processing lines in sequence,\n" +- "but the second can be parallelized, since the inline parsing of\none block element does not affect the inline parsing of any other.\n\n" - "## Container blocks and leaf blocks\n\nWe can divide blocks into two types:\n[container blocks](#container-blocks),\nwhich can contain other blocks, and [leaf blocks](#leaf-blocks),\nwhich cannot.\n\n" - "# Leaf blocks\n\nThis section describes the different kinds of leaf block that make up a\nMarkdown document.\n\n" - "## Thematic breaks\n\nA line consisting of optionally up to three spaces of indentation, followed by a\nsequence of three or more matching `-`, `_`, or `*` characters, each followed\noptionally by any number of spaces or tabs, forms a\n[thematic break](@).\n" @@ -150,8 +155,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n* Foo\n* * *\n* Bar\n.\n
      \n
    • Foo
    • \n
    \n
    \n
      \n
    • Bar
    • \n
    \n````````````````````````````````\n\n\nIf you want a thematic break in a list item, use a different bullet:\n" - "\n```````````````````````````````` example\n- Foo\n- * * *\n.\n
      \n
    • Foo
    • \n
    • \n
      \n
    • \n
    \n````````````````````````````````\n\n\n" - "## ATX headings\n\n" -- "An [ATX heading](@)\nconsists of a string of characters, parsed as inline content, between an\nopening sequence of 1--6 unescaped `#` characters and an optional\nclosing sequence of any number of unescaped `#` characters.\n" -- "The opening sequence of `#` characters must be followed by spaces or tabs, or\nby the end of line. The optional closing sequence of `#`s must be preceded by\nspaces or tabs and may be followed by spaces or tabs only. The opening\n`#`" +- "An [ATX heading](@)\nconsists of a string of characters, parsed as inline content, between an\nopening sequence of 1--6 unescaped `#` characters and an optional\nclosing sequence of any number of unescaped `#` characters.\nThe opening sequence of `#`" +- " characters must be followed by spaces or tabs, or\nby the end of line. The optional closing sequence of `#`s must be preceded by\nspaces or tabs and may be followed by spaces or tabs only. The opening\n`#`" - " character may be preceded by up to three spaces of indentation. The raw\ncontents of the heading are stripped of leading and trailing space or tabs\nbefore being parsed as inline content. The heading level is equal to the number\nof `#`" - " characters in the opening sequence.\n\nSimple headings:\n" - "\n```````````````````````````````` example\n# foo\n## foo\n### foo\n#### foo\n##### foo\n###### foo\n.\n

    foo

    \n

    foo

    \n

    foo

    \n

    foo

    \n
    foo
    \n
    foo
    \n````````````````````````````````\n\n\nMore than six `#` characters is not a heading:\n" @@ -175,11 +180,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\nFoo bar\n# baz\nBar foo\n.\n

    Foo bar

    \n

    baz

    \n

    Bar foo

    \n````````````````````````````````\n\n\nATX headings can be empty:\n" - "\n```````````````````````````````` example\n## \n#\n### ###\n.\n

    \n

    \n

    \n````````````````````````````````\n\n\n" - "## Setext headings\n\n" -- "A [setext heading](@) consists of one or more\nlines of text, not interrupted by a blank line, of which the first line does not\nhave more than 3 spaces of indentation, followed by\na [setext heading underline]. The lines of text must be such\n" +- "A [setext heading](@)" +- " consists of one or more\nlines of text, not interrupted by a blank line, of which the first line does not\nhave more than 3 spaces of indentation, followed by\na [setext heading underline]. The lines of text must be such\n" - "that, were they not followed by the setext heading underline,\nthey would be interpreted as a paragraph: they cannot be\ninterpretable as a [code fence], [ATX heading][ATX headings],\n[block quote][block quotes], [thematic break][thematic breaks],\n[list item" - "][list items], or [HTML block][HTML blocks].\n\nA [setext heading underline](@) is a sequence of\n`=` characters or a sequence of `-` characters, with no more than 3\nspaces of indentation and any number of trailing spaces or tabs.\n" -- "\nThe heading is a level 1 heading if `=` characters are used in\nthe [setext heading underline], and a level 2 heading if `-`\ncharacters are used. The contents of the heading are the result\nof parsing the preceding lines of text as CommonMark inline\n" -- "content.\n\nIn general, a setext heading need not be preceded or followed by a\nblank line. However, it cannot interrupt a paragraph, so when a\nsetext heading comes after a paragraph, a blank line is needed between\nthem.\n\nSimple examples:\n" +- "\nThe heading is a level 1 heading if `=` characters are used in\nthe [setext heading underline], and a level 2 heading if `-`" +- "\ncharacters are used. The contents of the heading are the result\nof parsing the preceding lines of text as CommonMark inline\ncontent.\n\n" +- "In general, a setext heading need not be preceded or followed by a\nblank line. 
However, it cannot interrupt a paragraph, so when a\nsetext heading comes after a paragraph, a blank line is needed between\nthem.\n\nSimple examples:\n" - "\n```````````````````````````````` example\nFoo *bar*\n=========\n\nFoo *bar*\n---------\n.\n

    Foo bar

    \n

    Foo bar

    \n````````````````````````````````\n\n\nThe content of the header may span more than one line:\n" - "\n```````````````````````````````` example\nFoo *bar\nbaz*\n====\n.\n

    Foo bar\nbaz

    \n````````````````````````````````" - "\n\nThe contents are the result of parsing the headings's raw\ncontent as inlines. The heading's raw content is formed by\nconcatenating the lines and removing initial and final\nspaces or tabs.\n" @@ -213,8 +220,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\nFoo\nbar\n* * *\nbaz\n.\n

    Foo\nbar

    \n
    \n

    baz

    \n````````````````````````````````\n\n\nAuthors who want interpretation 3 can use backslash escapes:\n" - "\n```````````````````````````````` example\nFoo\nbar\n\\---\nbaz\n.\n

    Foo\nbar\n---\nbaz

    \n````````````````````````````````\n\n\n" - "## Indented code blocks\n\n" -- "An [indented code block](@) is composed of one or more\n[indented chunks] separated by blank lines.\nAn [indented chunk](@) is a sequence of non-blank lines,\neach preceded by four or more spaces of indentation. The contents of the code\n" -- "block are the literal contents of the lines, including trailing\n[line endings], minus four spaces of indentation.\nAn indented code block has no [info string].\n\n" +- "An [indented code block](@) is composed of one or more\n[indented chunks] separated by blank lines.\nAn [indented chunk](@)" +- " is a sequence of non-blank lines,\neach preceded by four or more spaces of indentation. The contents of the code\nblock are the literal contents of the lines, including trailing\n[line endings], minus four spaces of indentation.\n" +- "An indented code block has no [info string].\n\n" - "An indented code block cannot interrupt a paragraph, so there must be\na blank line between a paragraph and a following indented code block.\n(A blank line is not needed, however, between a code block and a following\nparagraph.)\n" - "\n```````````````````````````````` example\n a simple\n indented code block\n.\n
    a simple\n  indented code block\n
    \n````````````````````````````````" - "\n\n\nIf there is any ambiguity between an interpretation of indentation\nas a code block and as indicating that material belongs to a [list\nitem][list items], the list item interpretation takes precedence:\n" @@ -233,18 +241,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\nTrailing spaces or tabs are included in the code block's content:\n\n```````````````````````````````` example\n foo \n.\n
    foo  \n
    \n````````````````````````````````\n\n\n\n" - "## Fenced code blocks\n\n" - "A [code fence](@) is a sequence\nof at least three consecutive backtick characters (`` ` ``) or\ntildes (`~`). (Tildes and backticks cannot be mixed.)\nA [fenced code block](@)\nbegins with a code fence, preceded by up to three spaces of indentation.\n" -- "\nThe line with the opening code fence may optionally contain some text\nfollowing the code fence; this is trimmed of leading and trailing\nspaces or tabs and called the [info string](@). If the [info string] comes\n" -- "after a backtick fence, it may not contain any backtick\ncharacters. (The reason for this restriction is that otherwise\nsome inline code would be incorrectly interpreted as the\nbeginning of a fenced code block.)\n\n" -- "The content of the code block consists of all subsequent lines, until\na closing [code fence] of the same type as the code block\nbegan with (backticks or tildes), and with at least as many backticks\nor tildes as the opening code fence. " -- "If the leading code fence is\npreceded by N spaces of indentation, then up to N spaces of indentation are\nremoved from each line of the content (if present). (If a content line is not\n" +- "\nThe line with the opening code fence may optionally contain some text\nfollowing the code fence; this is trimmed of leading and trailing\nspaces or tabs and called the [info string](@)" +- ". If the [info string] comes\nafter a backtick fence, it may not contain any backtick\ncharacters. (The reason for this restriction is that otherwise\nsome inline code would be incorrectly interpreted as the\nbeginning of a fenced code block.)\n\n" +- "The content of the code block consists of all subsequent lines, until\na closing [code fence] of the same type as the code block\nbegan with (backticks or tildes), and with at least as many backticks\n" +- "or tildes as the opening code fence. 
If the leading code fence is\npreceded by N spaces of indentation, then up to N spaces of indentation are\nremoved from each line of the content (if present). (If a content line is not\n" - "indented, it is preserved unchanged. If it is indented N spaces or less, all\nof the indentation is removed.)\n\n" - "The closing code fence may be preceded by up to three spaces of indentation, and\nmay be followed only by spaces or tabs, which are ignored. If the end of the\ncontaining block (or document) is reached and no closing code fence\n" - "has been found, the code block contains all of the lines after the\nopening code fence until the end of the containing block (or\ndocument). (An alternative spec would require backtracking in the\n" - "event that a closing code fence is not found. But this makes parsing\nmuch less efficient, and there seems to be no real downside to the\nbehavior described here.)\n\n" - "A fenced code block may interrupt a paragraph, and does not require\na blank line either before or after.\n" -- "\nThe content of a code fence is treated as literal text, not parsed\nas inlines. The first word of the [info string] is typically used to\nspecify the language of the code sample, and rendered in the `class`\nattribute of the `code` tag. " -- "However, this spec does not mandate any\nparticular treatment of the [info string].\n\nHere is a simple example with backticks:\n\n```````````````````````````````` example\n```\n<\n >\n```\n.\n
    <\n >\n
    \n````````````````````````````````" -- "\n\n\nWith tildes:\n\n```````````````````````````````` example\n~~~\n<\n >\n~~~\n.\n
    <\n >\n
    \n````````````````````````````````\n\nFewer than three backticks is not enough:\n" +- "\nThe content of a code fence is treated as literal text, not parsed\nas inlines. The first word of the [info string] is typically used to\nspecify the language of the code sample, and rendered in the `class`\nattribute of the `code`" +- " tag. However, this spec does not mandate any\nparticular treatment of the [info string].\n\nHere is a simple example with backticks:\n" +- "\n```````````````````````````````` example\n```\n<\n >\n```\n.\n
    <\n >\n
    \n````````````````````````````````\n\n\nWith tildes:\n" +- "\n```````````````````````````````` example\n~~~\n<\n >\n~~~\n.\n
    <\n >\n
    \n````````````````````````````````\n\nFewer than three backticks is not enough:\n" - "\n```````````````````````````````` example\n``\nfoo\n``\n.\n

    foo

    \n````````````````````````````````\n\nThe closing code fence must use the same character as the opening\nfence:\n" - "\n```````````````````````````````` example\n```\naaa\n~~~\n```\n.\n
    aaa\n~~~\n
    \n````````````````````````````````\n\n\n```````````````````````````````` example\n~~~\naaa\n```\n~~~\n.\n
    aaa\n```\n
    \n````````````````````````````````" - "\n\n\nThe closing code fence must be at least as long as the opening fence:\n\n```````````````````````````````` example\n````\naaa\n```\n``````\n.\n
    aaa\n```\n
    \n````````````````````````````````" @@ -274,21 +283,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n~~~ aa ``` ~~~\nfoo\n~~~\n.\n
    foo\n
    \n````````````````````````````````\n\n\nClosing code fences cannot have [info strings]:\n" - "\n```````````````````````````````` example\n```\n``` aaa\n```\n.\n
    ``` aaa\n
    \n````````````````````````````````\n\n\n\n" - "## HTML blocks\n\nAn [HTML block](@) is a group of lines that is treated\nas raw HTML (and will not be escaped in HTML output).\n" -- "\nThere are seven kinds of [HTML block], which can be defined by their\nstart and end conditions. The block begins with a line that meets a\n[start condition](@) (after up to three optional spaces of indentation).\n" -- "It ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks)" +- "\nThere are seven kinds of [HTML block], which can be defined by their\nstart and end conditions. The block begins with a line that meets a\n[start condition](@)" +- " (after up to three optional spaces of indentation).\nIt ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks)" - " containing the current HTML\nblock, if no line is encountered that meets the [end condition]. If\nthe first line meets both the [start condition] and the [end\ncondition], the block will contain just that line.\n\n" -- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n" -- "`
    `, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n" +- "1. " +- "**Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``, " +- "``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n" - "3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n" - "5. **Start condition:** line begins with the string\n``.\n\n" -- "6. **Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:**" - " line is followed by a [blank line].\n\n" -- "7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n" +- "7. " +- "**Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n" - "**End condition:** line is followed by a [blank line].\n\n" -- "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\n" -- "block** that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state.\n\n" +- "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\nblock**" +- " that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state.\n\n" - "For instance, `
    ` within an HTML block started by `` will not affect\nthe parser state; as the HTML block was started in by start condition 6, it\nwill end at any blank line. This can be surprising:\n"
     - "\n```````````````````````````````` example\n
    \n
    \n**Hello**,\n\n_world_.\n
    \n
    \n.\n
    \n
    \n**Hello**,\n

    world.\n

    \n
    \n````````````````````````````````" - "\n\nIn this case, the HTML block is terminated by the blank line — the `**Hello**`\ntext remains verbatim — and regular parsing resumes, with a paragraph,\nemphasised `world` and inline and block HTML following.\n" @@ -311,14 +323,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n\n*bar*\n\n.\n\n*bar*\n\n````````````````````````````````\n\n\nIn type 7 blocks, the [tag name] can be anything:\n" - "\n```````````````````````````````` example\n\n*bar*\n\n.\n\n*bar*\n\n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n\n*bar*\n\n.\n\n*bar*\n\n````````````````````````````````\n\n\n```````````````````````````````` example\n\n*bar*\n.\n\n*bar*\n````````````````````````````````" -- "\n\n\nThese rules are designed to allow us to work with tags that\ncan function as either block-level or inline-level tags.\nThe `` tag is a nice example. We can surround content with\n`` tags in three different ways. In this case, we get a raw\n" -- "HTML block, because the `` tag is on a line by itself:\n\n```````````````````````````````` example\n\n*foo*\n\n.\n\n*foo*\n\n````````````````````````````````" +- "\n\n\nThese rules are designed to allow us to work with tags that\ncan function as either block-level or inline-level tags.\nThe `` tag is a nice example. We can surround content with\n``" +- " tags in three different ways. In this case, we get a raw\nHTML block, because the `` tag is on a line by itself:\n\n```````````````````````````````` example\n\n*foo*\n\n.\n\n*foo*\n\n````````````````````````````````" - "\n\n\nIn this case, we get a raw HTML block that just includes\nthe `` tag (because it ends with the following blank\nline). So the contents get interpreted as CommonMark:\n" - "\n```````````````````````````````` example\n\n\n*foo*\n\n\n.\n\n

    foo

    \n
    \n````````````````````````````````" - "\n\n\nFinally, in this case, the `` tags are interpreted\nas [raw HTML] *inside* the CommonMark paragraph. (Because\nthe tag is not on a line by itself, we get inline HTML\nrather than an [HTML block].)\n" - "\n```````````````````````````````` example\n*foo*\n.\n

    foo

    \n````````````````````````````````" -- "\n\n\nHTML tags designed to contain literal content\n(`pre`, `script`, `style`, `textarea`), comments, processing instructions,\nand declarations are treated somewhat differently.\nInstead of ending at the first blank line, these blocks\n" -- "end at the first line containing a corresponding end tag.\nAs a result, these blocks can contain blank lines:\n\nA pre tag (type 1):\n" +- "\n\n\nHTML tags designed to contain literal content\n(`pre`, `script`, `style`, `textarea`" +- "), comments, processing instructions,\nand declarations are treated somewhat differently.\nInstead of ending at the first blank line, these blocks\nend at the first line containing a corresponding end tag.\nAs a result, these blocks can contain blank lines:\n\n" +- "A pre tag (type 1):\n" - "\n```````````````````````````````` example\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
    \nokay\n.\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\n"
     - "main = print $ parseTags tags\n
    \n

    okay

    \n````````````````````````````````\n\n\nA script tag (type 1):\n" - "\n```````````````````````````````` example\n\nokay\n.\n`, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``." +- "1." +- "**Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``," +- "``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``." - "3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``." - "5. **Start condition:** line begins with the string\n``." -- "6. **Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:**" - "line is followed by a [blank line]." -- "7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\" +- "7." +- "**Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\" - "**End condition:** line is followed by a [blank line]." -- "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML" -- "block** that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state." +- "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\nblock**" +- " that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state." - "For instance, `
    ` within an HTML block started by `` will not affect\nthe parser state; as the HTML block was started in by start condition 6, it\nwill end at any blank line. This can be surprising:"
     - "```````````````````````````````` example\n
    \n
    \n**Hello**,\n\n_world_.\n
    \n
    \n.\n
    \n
    \n**Hello**,\n

    world.\n

    \n
    \n````````````````````````````````" - "In this case, the HTML block is terminated by the blank line — the `**Hello**`\ntext remains verbatim — and regular parsing resumes, with a paragraph,\nemphasised `world` and inline and block HTML following." @@ -307,14 +317,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n\n*bar*\n\n.\n\n*bar*\n\n````````````````````````````````\n\n\nIn type 7 blocks, the [tag name] can be anything:" - "```````````````````````````````` example\n\n*bar*\n\n.\n\n*bar*\n\n````````````````````````````````" - "```````````````````````````````` example\n\n*bar*\n\n.\n\n*bar*\n\n````````````````````````````````\n\n\n```````````````````````````````` example\n\n*bar*\n.\n\n*bar*\n````````````````````````````````" -- "These rules are designed to allow us to work with tags that\ncan function as either block-level or inline-level tags.\nThe `` tag is a nice example. We can surround content with\n`` tags in three different ways. In this case, we get a raw" -- "HTML block, because the `` tag is on a line by itself:\n\n```````````````````````````````` example\n\n*foo*\n\n.\n\n*foo*\n\n````````````````````````````````" +- "These rules are designed to allow us to work with tags that\ncan function as either block-level or inline-level tags.\nThe `` tag is a nice example. We can surround content with\n``" +- " tags in three different ways. In this case, we get a raw\nHTML block, because the `` tag is on a line by itself:\n\n```````````````````````````````` example\n\n*foo*\n\n.\n\n*foo*\n\n````````````````````````````````" - "In this case, we get a raw HTML block that just includes\nthe `` tag (because it ends with the following blank\nline). So the contents get interpreted as CommonMark:" - "```````````````````````````````` example\n\n\n*foo*\n\n\n.\n\n

    foo

    \n
    \n````````````````````````````````" - "Finally, in this case, the `` tags are interpreted\nas [raw HTML] *inside* the CommonMark paragraph. (Because\nthe tag is not on a line by itself, we get inline HTML\nrather than an [HTML block].)" - "```````````````````````````````` example\n*foo*\n.\n

    foo

    \n````````````````````````````````" -- "HTML tags designed to contain literal content\n(`pre`, `script`, `style`, `textarea`), comments, processing instructions,\nand declarations are treated somewhat differently.\nInstead of ending at the first blank line, these blocks" -- "end at the first line containing a corresponding end tag.\nAs a result, these blocks can contain blank lines:\n\nA pre tag (type 1):" +- "HTML tags designed to contain literal content\n(`pre`, `script`, `style`, `textarea`" +- "), comments, processing instructions,\nand declarations are treated somewhat differently.\nInstead of ending at the first blank line, these blocks\nend at the first line containing a corresponding end tag.\nAs a result, these blocks can contain blank lines:" +- "A pre tag (type 1):" - "```````````````````````````````` example\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
    \nokay\n.\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()"
     - "main = print $ parseTags tags\n
    \n

    okay

    \n````````````````````````````````\n\n\nA script tag (type 1):" - "```````````````````````````````` example\n\nokay\n.\n`, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n5. **Start condition:** line begins with the string\n``.\n\n6. **Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:** line is followed by a [blank line].\n\n" -- "7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line].\n\n" +- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n5. **Start condition:** line begins with the string\n``.\n\n" +- "6. " +- "**Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:**" +- " line is followed by a [blank line].\n\n7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line].\n\n" - "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\nblock** that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state.\n\nFor instance, `
    ` within an HTML block started by `` will not affect\nthe parser state; as the HTML block was started in by start condition 6, it\nwill end at any blank line. This can be surprising:\n\n```````````````````````````````` example\n
    \n
    \n**Hello**,\n\n_world_.\n
    \n
    \n.\n
    \n
    \n**Hello**,\n

    world.\n

    \n
    \n````````````````````````````````" - "\n\nIn this case, the HTML block is terminated by the blank line — the `**Hello**`\ntext remains verbatim — and regular parsing resumes, with a paragraph,\nemphasised `world` and inline and block HTML following.\n\nAll types of [HTML blocks] except type 7 may interrupt\na paragraph. Blocks of type 7 may not interrupt a paragraph.\n(This restriction is intended to prevent unwanted interpretation\nof long tags inside a wrapped paragraph as starting HTML blocks.)\n\nSome simple examples follow. Here are some basic HTML blocks\nof type 6:\n\n```````````````````````````````` example\n\n \n \n \n
    \n hi\n
    \n\nokay.\n.\n\n \n \n \n
    \n hi\n
    \n

    okay.

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n \n*foo*\n````````````````````````````````\n\n\nHere we have two HTML blocks with a Markdown paragraph between them:\n\n```````````````````````````````` example\n
    \n\n*Markdown*\n\n
    \n.\n
    \n

    Markdown

    \n
    \n````````````````````````````````\n\n\nThe tag on the first line can be partial, as long\nas it is split where there would be whitespace:\n" @@ -138,8 +140,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n> > > foo\nbar\n.\n
    \n
    \n
    \n

    foo\nbar

    \n
    \n
    \n
    \n````````````````````````````````\n\n\n```````````````````````````````` example\n>>> foo\n> bar\n>>baz\n.\n
    \n
    \n
    \n

    foo\nbar\nbaz

    \n
    \n
    \n
    \n````````````````````````````````\n\n\nWhen including an indented code block in a block quote,\nremember that the [block quote marker] includes\nboth the `>` and a following space of indentation. So *five spaces* are needed\nafter the `>`:\n" - "\n```````````````````````````````` example\n> code\n\n> not code\n.\n
    \n
    code\n
    \n
    \n
    \n

    not code

    \n
    \n````````````````````````````````\n\n\n\n" - "## List items\n\nA [list marker](@) is a\n[bullet list marker] or an [ordered list marker].\n\nA [bullet list marker](@)\nis a `-`, `+`, or `*` character.\n\nAn [ordered list marker](@)\nis a sequence of 1--9 arabic digits (`0-9`), followed by either a\n`.` character or a `)` character. (The reason for the length\nlimit is that with 10 digits we start seeing integer overflows\nin some browsers.)\n\nThe following rules define [list items]:\n\n" -- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n\n 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n" -- " the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\n" +- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. 
The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n" +- "\n 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\n" - "For example, let *Ls* be the lines\n\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    \n````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:\n\n```````````````````````````````` example\n1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n````````````````````````````````" - "\n\n\nThe most important thing to notice is that the position of\nthe text after the list marker determines how much indentation\nis needed in subsequent blocks in the list item. If the list\nmarker takes up two spaces of indentation, and there are three spaces between\nthe list marker and the next character other than a space or tab, then blocks\nmust be indented five spaces in order to fall under the list\nitem.\n\nHere are some examples showing how far content must be indented to be\nput under the list item:\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • one
    • \n
    \n

    two

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n - one\n\n two\n.\n
      \n
    • one
    • \n
    \n
     two\n
    \n````````````````````````````````\n\n\n```````````````````````````````` example\n - one\n\n two\n.\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n````````````````````````````````\n\n\nIt is tempting to think of this in terms of columns: the continuation\nblocks must be indented at least to the column of the first character other than\na space or tab after the list marker. However, that is not quite right.\nThe spaces of indentation after the list marker determine how much relative\nindentation is needed. Which column this indentation reaches will depend on\nhow the list item is embedded in other constructions, as shown by\nthis example:\n" @@ -203,25 +205,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nFirst, some definitions. A [delimiter run](@) is either\na sequence of one or more `*` characters that is not preceded or\nfollowed by a non-backslash-escaped `*` character, or a sequence\nof one or more `_` characters that is not preceded or followed by\na non-backslash-escaped `_` character.\n\nA [left-flanking delimiter run](@) is\na [delimiter run] that is (1) not followed by [Unicode whitespace],\nand either (2a) not followed by a [Unicode punctuation character], or\n(2b) followed by a [Unicode punctuation character] and\npreceded by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n" - "\nA [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n - Both left and right-flanking:\n\n ```\n 
abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```\n\n" - "(The idea of distinguishing left-flanking and right-flanking\ndelimiter runs based on the character before and the character\nafter comes from Roopesh Chander's\n[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\" and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:\n\n" -- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n6. A double `__` [can open strong emphasis] iff\n" -- " it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. 
A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n" +- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n" +- "6. A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n" - "9. Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. 
The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are\n multiples of 3.\n\n10. Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing\n delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3.\n\n" - "11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n12. A literal `_` character cannot occur at the beginning or end of\n `_`-delimited emphasis or `__`-delimited strong emphasis, unless it\n is backslash-escaped.\n\nWhere rules 1--12 above are compatible with multiple parsings,\nthe following principles resolve ambiguity:\n\n" -- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. 
Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. " -- "So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*
    foo*` rather than as\n `[foo](bar)`.\n\nThese rules can be illustrated through a series of examples.\n\nRule 1:\n\n```````````````````````````````` example\n*foo bar*\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na * foo bar*\n.\n

    a * foo bar*

    \n````````````````````````````````" -- "\n\n\nThis is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na*\"foo\"*\n.\n

    a*"foo"*

    \n````````````````````````````````\n\n\nUnicode nonbreaking spaces count as whitespace, too:\n\n```````````````````````````````` example\n* a *\n.\n

    * a *

    \n````````````````````````````````\n\n\nUnicode symbols count as punctuation, too:\n\n```````````````````````````````` example\n*$*alpha.\n\n*£*bravo.\n\n*€*charlie.\n.\n

    *$*alpha.

    \n

    *£*bravo.

    \n

    *€*charlie.

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is permitted:\n" -- "\n```````````````````````````````` example\nfoo*bar*\n.\n

    foobar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5*6*78\n.\n

    5678

    \n````````````````````````````````\n\n\nRule 2:\n\n```````````````````````````````` example\n_foo bar_\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is followed by\nwhitespace:\n\n```````````````````````````````` example\n_ foo bar_\n.\n

    _ foo bar_

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is preceded\nby an alphanumeric and followed by punctuation:\n" -- "\n```````````````````````````````` example\na_\"foo\"_\n.\n

    a_"foo"_

    \n````````````````````````````````\n\n\nEmphasis with `_` is not allowed inside words:\n\n```````````````````````````````` example\nfoo_bar_\n.\n

    foo_bar_

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5_6_78\n.\n

    5_6_78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням_стремятся_\n.\n

    пристаням_стремятся_

    \n````````````````````````````````\n\n\nHere `_` does not generate emphasis, because the first delimiter run\nis right-flanking and the second left-flanking:\n" -- "\n```````````````````````````````` example\naa_\"bb\"_cc\n.\n

    aa_"bb"_cc

    \n````````````````````````````````\n\n\nThis is emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:\n\n```````````````````````````````` example\nfoo-_(bar)_\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\nRule 3:\n\nThis is not emphasis, because the closing delimiter does\nnot match the opening delimiter:\n\n```````````````````````````````` example\n_foo*\n.\n

    _foo*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the closing `*` is preceded by\nwhitespace:\n" -- "\n```````````````````````````````` example\n*foo bar *\n.\n

    *foo bar *

    \n````````````````````````````````\n\n\nA line ending also counts as whitespace:\n\n```````````````````````````````` example\n*foo bar\n*\n.\n

    *foo bar\n*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the second `*` is\npreceded by punctuation and followed by an alphanumeric\n(hence it is not part of a [right-flanking delimiter run]:\n\n```````````````````````````````` example\n*(*foo)\n.\n

    *(*foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith this example:\n" -- "\n```````````````````````````````` example\n*(*foo*)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is allowed:\n\n```````````````````````````````` example\n*foo*bar\n.\n

    foobar

    \n````````````````````````````````\n\n\n\nRule 4:\n\nThis is not emphasis, because the closing `_` is preceded by\nwhitespace:\n\n```````````````````````````````` example\n_foo bar _\n.\n

    _foo bar _

    \n````````````````````````````````\n\n\nThis is not emphasis, because the second `_` is\npreceded by punctuation and followed by an alphanumeric:\n" -- "\n```````````````````````````````` example\n_(_foo)\n.\n

    _(_foo)

    \n````````````````````````````````\n\n\nThis is emphasis within emphasis:\n\n```````````````````````````````` example\n_(_foo_)_\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis is disallowed for `_`:\n\n```````````````````````````````` example\n_foo_bar\n.\n

    _foo_bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n_пристаням_стремятся\n.\n

    _пристаням_стремятся

    \n````````````````````````````````" -- "\n\n\n```````````````````````````````` example\n_foo_bar_baz_\n.\n

    foo_bar_baz

    \n````````````````````````````````\n\n\nThis is emphasis, even though the closing delimiter is\nboth left- and right-flanking, because it is followed by\npunctuation:\n\n```````````````````````````````` example\n_(bar)_.\n.\n

    (bar).

    \n````````````````````````````````\n\n\nRule 5:\n\n```````````````````````````````` example\n**foo bar**\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n\n```````````````````````````````` example\n** foo bar**\n.\n

    ** foo bar**

    \n````````````````````````````````" -- "\n\n\nThis is not strong emphasis, because the opening `**` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na**\"foo\"**\n.\n

    a**"foo"**

    \n````````````````````````````````\n\n\nIntraword strong emphasis with `**` is permitted:\n\n```````````````````````````````` example\nfoo**bar**\n.\n

    foobar

    \n````````````````````````````````\n\n\nRule 6:\n\n```````````````````````````````` example\n__foo bar__\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n" -- "\n```````````````````````````````` example\n__ foo bar__\n.\n

    __ foo bar__

    \n````````````````````````````````\n\n\nA line ending counts as whitespace:\n```````````````````````````````` example\n__\nfoo bar__\n.\n

    __\nfoo bar__

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening `__` is preceded\nby an alphanumeric and followed by punctuation:\n\n```````````````````````````````` example\na__\"foo\"__\n.\n

    a__"foo"__

    \n````````````````````````````````\n\n\nIntraword strong emphasis is forbidden with `__`:\n\n```````````````````````````````` example\nfoo__bar__\n.\n

    foo__bar__

    \n````````````````````````````````" -- "\n\n\n```````````````````````````````` example\n5__6__78\n.\n

    5__6__78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням__стремятся__\n.\n

    пристаням__стремятся__

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__foo, __bar__, baz__\n.\n

    foo, bar, baz

    \n````````````````````````````````\n\n\nThis is strong emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:\n" -- "\n```````````````````````````````` example\nfoo-__(bar)__\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\n\nRule 7:\n\nThis is not strong emphasis, because the closing delimiter is preceded\nby whitespace:\n\n```````````````````````````````` example\n**foo bar **\n.\n

    **foo bar **

    \n````````````````````````````````\n\n\n(Nor can it be interpreted as an emphasized `*foo bar *`, because of\nRule 11.)\n\nThis is not strong emphasis, because the second `**` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n**(**foo)\n.\n

    **(**foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith these examples:\n" -- "\n```````````````````````````````` example\n*(**foo**)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**Gomphocarpus (*Gomphocarpus physocarpus*, syn.\n*Asclepias physocarpa*)**\n.\n

    Gomphocarpus (Gomphocarpus physocarpus, syn.\nAsclepias physocarpa)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**foo \"*bar*\" foo**\n.\n

    foo "bar" foo

    \n````````````````````````````````\n\n\nIntraword emphasis:\n" +- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n" +- "17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`.\n\n" +- "These rules can be illustrated through a series of examples.\n\nRule 1:\n\n```````````````````````````````` example\n*foo bar*\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na * foo bar*\n.\n

    a * foo bar*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na*\"foo\"*\n.\n

    a*"foo"*

    \n````````````````````````````````\n\n\nUnicode nonbreaking spaces count as whitespace, too:\n" +- "\n```````````````````````````````` example\n* a *\n.\n

    * a *

    \n````````````````````````````````\n\n\nUnicode symbols count as punctuation, too:\n\n```````````````````````````````` example\n*$*alpha.\n\n*£*bravo.\n\n*€*charlie.\n.\n

    *$*alpha.

    \n

    *£*bravo.

    \n

    *€*charlie.

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is permitted:\n\n```````````````````````````````` example\nfoo*bar*\n.\n

    foobar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5*6*78\n.\n

    5678

    \n````````````````````````````````" +- "\n\n\nRule 2:\n\n```````````````````````````````` example\n_foo bar_\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is followed by\nwhitespace:\n\n```````````````````````````````` example\n_ foo bar_\n.\n

    _ foo bar_

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is preceded\nby an alphanumeric and followed by punctuation:\n\n```````````````````````````````` example\na_\"foo\"_\n.\n

    a_"foo"_

    \n````````````````````````````````\n\n\nEmphasis with `_` is not allowed inside words:\n\n```````````````````````````````` example\nfoo_bar_\n.\n

    foo_bar_

    \n````````````````````````````````" +- "\n\n\n```````````````````````````````` example\n5_6_78\n.\n

    5_6_78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням_стремятся_\n.\n

    пристаням_стремятся_

    \n````````````````````````````````\n\n\nHere `_` does not generate emphasis, because the first delimiter run\nis right-flanking and the second left-flanking:\n\n```````````````````````````````` example\naa_\"bb\"_cc\n.\n

    aa_"bb"_cc

    \n````````````````````````````````\n\n\nThis is emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:\n" +- "\n```````````````````````````````` example\nfoo-_(bar)_\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\nRule 3:\n\nThis is not emphasis, because the closing delimiter does\nnot match the opening delimiter:\n\n```````````````````````````````` example\n_foo*\n.\n

    _foo*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the closing `*` is preceded by\nwhitespace:\n\n```````````````````````````````` example\n*foo bar *\n.\n

    *foo bar *

    \n````````````````````````````````\n\n\nA line ending also counts as whitespace:\n\n```````````````````````````````` example\n*foo bar\n*\n.\n

    *foo bar\n*

    \n````````````````````````````````" +- "\n\n\nThis is not emphasis, because the second `*` is\npreceded by punctuation and followed by an alphanumeric\n(hence it is not part of a [right-flanking delimiter run]:\n\n```````````````````````````````` example\n*(*foo)\n.\n

    *(*foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith this example:\n\n```````````````````````````````` example\n*(*foo*)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is allowed:\n\n```````````````````````````````` example\n*foo*bar\n.\n

    foobar

    \n````````````````````````````````\n\n\n\nRule 4:\n\nThis is not emphasis, because the closing `_` is preceded by\nwhitespace:\n" +- "\n```````````````````````````````` example\n_foo bar _\n.\n

    _foo bar _

    \n````````````````````````````````\n\n\nThis is not emphasis, because the second `_` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n_(_foo)\n.\n

    _(_foo)

    \n````````````````````````````````\n\n\nThis is emphasis within emphasis:\n\n```````````````````````````````` example\n_(_foo_)_\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis is disallowed for `_`:\n\n```````````````````````````````` example\n_foo_bar\n.\n

    _foo_bar

    \n````````````````````````````````" +- "\n\n\n```````````````````````````````` example\n_пристаням_стремятся\n.\n

    _пристаням_стремятся

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n_foo_bar_baz_\n.\n

    foo_bar_baz

    \n````````````````````````````````\n\n\nThis is emphasis, even though the closing delimiter is\nboth left- and right-flanking, because it is followed by\npunctuation:\n\n```````````````````````````````` example\n_(bar)_.\n.\n

    (bar).

    \n````````````````````````````````\n\n\nRule 5:\n\n```````````````````````````````` example\n**foo bar**\n.\n

    foo bar

    \n````````````````````````````````" +- "\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n\n```````````````````````````````` example\n** foo bar**\n.\n

    ** foo bar**

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening `**` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na**\"foo\"**\n.\n

    a**"foo"**

    \n````````````````````````````````\n\n\nIntraword strong emphasis with `**` is permitted:\n\n```````````````````````````````` example\nfoo**bar**\n.\n

    foobar

    \n````````````````````````````````\n\n\nRule 6:\n" +- "\n```````````````````````````````` example\n__foo bar__\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n\n```````````````````````````````` example\n__ foo bar__\n.\n

    __ foo bar__

    \n````````````````````````````````\n\n\nA line ending counts as whitespace:\n```````````````````````````````` example\n__\nfoo bar__\n.\n

    __\nfoo bar__

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening `__` is preceded\nby an alphanumeric and followed by punctuation:\n\n```````````````````````````````` example\na__\"foo\"__\n.\n

    a__"foo"__

    \n````````````````````````````````" +- "\n\n\nIntraword strong emphasis is forbidden with `__`:\n\n```````````````````````````````` example\nfoo__bar__\n.\n

    foo__bar__

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5__6__78\n.\n

    5__6__78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням__стремятся__\n.\n

    пристаням__стремятся__

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__foo, __bar__, baz__\n.\n

    foo, bar, baz

    \n````````````````````````````````" +- "\n\n\nThis is strong emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:\n\n```````````````````````````````` example\nfoo-__(bar)__\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\n\nRule 7:\n\nThis is not strong emphasis, because the closing delimiter is preceded\nby whitespace:\n\n```````````````````````````````` example\n**foo bar **\n.\n

    **foo bar **

    \n````````````````````````````````\n\n\n(Nor can it be interpreted as an emphasized `*foo bar *`, because of\nRule 11.)\n\nThis is not strong emphasis, because the second `**` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n**(**foo)\n.\n

    **(**foo)

    \n````````````````````````````````" +- "\n\n\nThe point of this restriction is more easily appreciated\nwith these examples:\n\n```````````````````````````````` example\n*(**foo**)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**Gomphocarpus (*Gomphocarpus physocarpus*, syn.\n*Asclepias physocarpa*)**\n.\n

    Gomphocarpus (Gomphocarpus physocarpus, syn.\nAsclepias physocarpa)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**foo \"*bar*\" foo**\n.\n

    foo "bar" foo

    \n````````````````````````````````\n\n\nIntraword emphasis:\n" - "\n```````````````````````````````` example\n**foo**bar\n.\n

    foobar

    \n````````````````````````````````\n\n\nRule 8:\n\nThis is not strong emphasis, because the closing delimiter is\npreceded by whitespace:\n\n```````````````````````````````` example\n__foo bar __\n.\n

    __foo bar __

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the second `__` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n__(__foo)\n.\n

    __(__foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith this example:\n" - "\n```````````````````````````````` example\n_(__foo__)_\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword strong emphasis is forbidden with `__`:\n\n```````````````````````````````` example\n__foo__bar\n.\n

    __foo__bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__пристаням__стремятся\n.\n

    __пристаням__стремятся

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__foo__bar__baz__\n.\n

    foo__bar__baz

    \n````````````````````````````````" - "\n\n\nThis is strong emphasis, even though the closing delimiter is\nboth left- and right-flanking, because it is followed by\npunctuation:\n\n```````````````````````````````` example\n__(bar)__.\n.\n

    (bar).

    \n````````````````````````````````\n\n\nRule 9:\n\nAny nonempty sequence of inline elements can be the contents of an\nemphasized span.\n\n```````````````````````````````` example\n*foo [bar](/url)*\n.\n

    foo bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n*foo\nbar*\n.\n

    foo\nbar

    \n````````````````````````````````\n\n\nIn particular, emphasis and strong emphasis can be nested\ninside emphasis:\n" @@ -331,5 +333,6 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "#### *look for link or image*\n\nStarting at the top of the delimiter stack, we look backwards\nthrough the stack for an opening `[` or `![` delimiter.\n\n" - "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`.\n\n- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image.\n\n + If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`.\n\n + If we do, then\n\n * We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)\n\n" - "#### *process emphasis*\n\nParameter `stack_bottom` sets a lower bound to how far we\ndescend in the [delimiter stack]. If it is NULL, we can\ngo all the way to the bottom. Otherwise, we stop before\nvisiting `stack_bottom`.\n\nLet `current_position` point to the element on the [delimiter stack]\njust above `stack_bottom` (or the first element if `stack_bottom`\nis NULL).\n\nWe keep track of the `openers_bottom` for each delimiter\ntype (`*`, `_`), indexed to the length of the closing delimiter run\n(modulo 3) and to whether the closing delimiter can also be an\nopener. 
Initialize this to `stack_bottom`.\n\nThen we repeat the following until we run out of potential\nclosers:\n\n" -- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. 
If the closing node is removed, reset\n" -- " `current_position` to the next element in the stack.\n\n- If none is found:\n\n + Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack.\n" +- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n" +- "- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. 
If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:\n\n " +- "+ Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack.\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@commonmark_spec.md.snap index 24a1460..f5d8a5e 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@commonmark_spec.md.snap @@ -28,12 +28,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "articles, slide shows, letters, and lecture notes.\n\n" - "What distinguishes Markdown from many other lightweight markup\n" - "syntaxes, which are often easier to write, is its readability.\n" -- "As Gruber writes:\n\n" -- "> The overriding design goal for Markdown's formatting syntax is\n" -- "> to make it as readable as possible. The idea is that a" -- "\n> Markdown-formatted document should be publishable as-is, as" -- "\n> plain text, without looking like it's been marked up with tags" -- "\n> or formatting instructions.\n> (" +- "As Gruber writes:" +- "\n\n> The overriding design goal for Markdown's formatting syntax is\n> " +- "to make it as readable as possible. 
The idea is that a\n> " +- "Markdown-formatted document should be publishable as-is, as\n> " +- "plain text, without looking like it's been marked up with tags\n> " +- "or formatting instructions.\n> (" - ")\n\n" - "The point can be illustrated by comparing a sample of\n" - "[AsciiDoc](https://asciidoc.org/)" @@ -60,85 +60,84 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " This paragraph is part of the preceding list item.\n\n 2. " - "List item b.\n\n This paragraph belongs to item two of the outer list.\n" - "```\n\n" -- "The AsciiDoc version is, arguably, easier to write. " -- "You don't need\n" -- "to worry about indentation. But the Markdown version is much easier\n" +- "The AsciiDoc version is, arguably, easier to write. You don'" +- "t need\nto worry about indentation. But the Markdown version is much easier" +- "\n" - to read. The nesting of list items is apparent to the eye in the - "\nsource, not just in the processed document.\n\n" - "## Why is a spec needed?\n\n" -- "John Gruber's [canonical description of Markdown's\n" -- "syntax](https://daringfireball.net/projects/markdown/syntax)\n" +- "John Gruber's " +- "[canonical description of Markdown's\nsyntax" +- "](https://daringfireball.net/projects/markdown/syntax)\n" - does not specify the syntax unambiguously. Here are some examples of -- "\nquestions it does not answer:\n\n" -- "1. How much indentation is needed for a sublist? " -- "The spec says that\n " +- "\nquestions it does not answer:" +- "\n\n1. " +- "How much indentation is needed for a sublist? The spec says that\n " - "continuation paragraphs need to be indented four spaces, but is\n " - "not fully explicit about sublists. It is natural to think that\n " - "they, too, must be indented four spaces, but `Markdown.pl`" - " does\n not require that. This is hardly a \"corner case,\"" - " and divergences\n between implementations on this issue often lead to surprises for" - "\n users in real documents. 
(See " -- "[this comment by John\n" -- " Gruber](https://web.archive.org/web/20170611172104" -- "/http://article.gmane.org/gmane.text.markdown.general/" -- "1997).)\n\n" -- "2. Is a blank line needed before a block quote or heading?\n" -- " Most implementations do not require the blank line. However,\n " +- "[this comment by John\n Gruber" +- "](https://web.archive.org/web/20170611172104/http://" +- article.gmane.org/gmane.text.markdown.general/1997). +- ")\n\n2. Is a blank line needed before a block quote or heading?" +- "\n Most implementations do not require the blank line. However,\n " - "this can lead to unexpected results in hard-wrapped text, and\n " - "also to ambiguities in parsing (note that some implementations\n " - "put the heading inside the blockquote, while others do not).\n " - "(John Gruber has also spoken " -- "[in favor of requiring the blank\n" -- " lines](https://web.archive.org/web/20170611172104/http" -- "://article.gmane.org/gmane.text.markdown.general/2146" -- ").)\n\n" -- "3. Is a blank line needed before an indented code block?\n" -- " (`Markdown.pl`" +- "[in favor of requiring the blank\n lines" +- "](https://web.archive.org/web/20170611172104/http://" +- article.gmane.org/gmane.text.markdown.general/2146). +- ")\n\n3. Is a blank line needed before an indented code block?" +- "\n (`Markdown.pl`" - " requires it, but this is not mentioned in the\n " - "documentation, and some implementations do not require it.)\n\n " - "``` markdown\n paragraph\n code?\n ```\n\n" -- "4. What is the exact rule for determining when list items get\n" -- " wrapped in `

    `" +- "4. What is the exact rule for determining when list items get\n " +- "wrapped in `

    `" - " tags? Can a list be partially \"loose\" and partially\n " - "\"tight\"? What should we do with a list like this?\n\n " - "``` markdown\n 1. one\n\n 2. two\n" - " 3. three\n ```\n\n Or this?\n" - "\n ``` markdown\n 1. one\n - a\n\n" - " - b\n 2. two\n ```\n\n " -- "(There are some relevant comments by John Gruber\n" -- " [here](https://web.archive.org/web/20170611172104" -- "/http://article.gmane.org/gmane.text.markdown.general/" -- "2554).)\n\n" +- "(There are some relevant comments by John Gruber\n " +- "[here](https://web.archive.org/web/20170611172104/http" +- "://article.gmane.org/gmane.text.markdown.general/2554" +- ").)\n\n" - "5. Can list markers be indented? " - "Can ordered list markers be right-aligned?\n\n " - "``` markdown\n 8. item 1\n" - " 9. item 2\n " - "10. item 2a\n ```\n\n" -- "6. Is this one list with a thematic break in its second item,\n" -- " or two lists separated by a thematic break?\n\n " +- "6. Is this one list with a thematic break in its second item," +- "\n or two lists separated by a thematic break?\n\n " - "``` markdown\n * a\n * * * * *\n" - " * b\n ```\n\n" -- "7. When list markers change from numbers to bullets, do we have\n" -- " two lists or one? (The Markdown syntax description suggests two,\n " -- "but the perl scripts and many other implementations produce one.)\n\n " +- "7. When list markers change from numbers to bullets, do we have" +- "\n two lists or one? (The Markdown syntax description suggests two," +- "\n but the perl scripts and many other implementations produce one.)\n\n " - "``` markdown\n 1. fee\n 2. fie\n" - " - foe\n - fum\n ```\n\n" -- "8. What are the precedence rules for the markers of inline structure?\n" -- " For example, is the following a valid link, or does the code span" +- "8. 
What are the precedence rules for the markers of inline structure?\n " +- "For example, is the following a valid link, or does the code span" - "\n take precedence ?\n\n " - "``` markdown\n" - " [a backtick (`)](/url) and [another backtick (`" - ")](/url).\n ```\n\n" -- "9. What are the precedence rules for markers of emphasis and strong\n" -- " emphasis? For example, how should the following be parsed?\n\n " +- "9. What are the precedence rules for markers of emphasis and strong\n " +- "emphasis? For example, how should the following be parsed?\n\n " - "``` markdown\n *foo *bar* baz*\n ```\n\n" -- "10. What are the precedence rules between block-level and inline-level\n" -- " structure? For example, how should the following be parsed?\n\n " +- "10. What are the precedence rules between block-level and inline-level\n " +- "structure? For example, how should the following be parsed?\n\n " - "``` markdown\n" - " - `a long code span can contain a hyphen like this\n " - " - and it can screw things up`\n ```\n\n" -- "11. Can list items include section headings? " -- "(`Markdown.pl`" +- "11. " +- "Can list items include section headings? (`Markdown.pl`" - " does not\n allow this, but does allow blockquotes to include headings.)" - "\n\n ``` markdown\n - # Heading\n ```\n\n" - "12. Can list items be empty?\n\n ``` markdown\n * a\n" @@ -146,12 +145,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "13. Can link references be defined inside block quotes or list items?\n\n" - " ``` markdown\n > Blockquote [foo].\n >\n" - " > [foo]: /url\n ```\n\n" -- "14. If there are multiple definitions for the same reference, which takes\n" -- " precedence?\n\n " +- "14. If there are multiple definitions for the same reference, which takes\n " +- "precedence?\n\n " - "``` markdown\n [foo]: /url1\n" - " [foo]: /url2\n\n [foo][]\n ```\n\n" -- "In the absence of a spec, early implementers consulted `Markdown.pl`\n" -- "to resolve these ambiguities. 
But `Markdown.pl`" +- "In the absence of a spec, early implementers consulted `Markdown.pl`" +- "\nto resolve these ambiguities. But `Markdown.pl`" - " was quite buggy, and\n" - "gave manifestly bad results in many cases, so it was not a\n" - "satisfactory replacement for a spec.\n\n" @@ -182,52 +181,53 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the expectations of the spec examples (percent-encoding\n" - "non-ASCII characters in URLs). But a conforming implementation\n" - "can use a different renderer and may choose not to\n" -- "percent-encode non-ASCII characters in URLs.\n\n" -- "This document is generated from a text file, `spec.txt`, written\n" -- "in Markdown with a small extension for the side-by-side tests.\nThe script " -- "`tools/makespec.py` can be used to convert `spec.txt`" -- " into\nHTML or CommonMark (which can then be converted into other formats).\n\n" -- "In the examples, the `→` character is used to represent tabs.\n\n" +- percent-encode non-ASCII characters in URLs. +- "\n\nThis document is generated from a text file, `spec.txt`" +- ", written\nin Markdown with a small extension for the side-by-side tests.\n" +- "The script `tools/makespec.py` can be used to convert `" +- "spec.txt` into\n" +- HTML or CommonMark (which can then be converted into other formats). +- "\n\nIn the examples, the `→` character is used to represent tabs.\n\n" - "# Preliminaries\n\n" - "## Characters and lines\n\n" - "Any sequence of [characters] is a valid CommonMark\ndocument.\n" -- "\nA [character](@) is a Unicode code point. " -- "Although some\ncode points (for example, combining accents) do not correspond to" -- "\ncharacters in an intuitive sense, all code points count as characters\n" +- "\nA [character](@)" +- " is a Unicode code point. 
Although some\n" +- "code points (for example, combining accents) do not correspond to\n" +- "characters in an intuitive sense, all code points count as characters\n" - "for purposes of this spec.\n\n" - "This spec does not specify an encoding; it thinks of lines as composed\n" - "of [characters] rather than bytes. A conforming parser may be limited" - "\nto a certain encoding.\n\n" -- "A [line](@) is a sequence of zero or more [characters]\n" -- "other than line feed (`U+000A`) or carriage return (`U+" -- "000D`),\nfollowed by a [line ending]" -- " or by the end of file.\n\n" -- "A [line ending](@) is a line feed (`U+000A" -- "`), a carriage return\n(`U+000D`" -- ") not followed by a line feed, or a carriage return and a\n" -- "following line feed.\n\n" -- "A line containing no characters, or a line containing only spaces\n" -- "(`U+0020`) or tabs (`U+0009`), is" +- "A [line](@)" +- " is a sequence of zero or more [characters]\nother than line feed (" +- "`U+000A`) or carriage return (`U+000D`" +- "),\nfollowed by a [line ending] or by the end of file." +- "\n\nA [line ending](@) is a line feed (" +- "`U+000A`), a carriage return\n(" +- "`U+000D`) not followed by a line feed, or a carriage" +- " return and a\nfollowing line feed." 
+- "\n\nA line containing no characters, or a line containing only spaces\n(" +- "`U+0020`) or tabs (`U+0009`), is" - " called a [blank line](@).\n\n" - "The following definitions of character classes will be used in this spec:\n" -- "\n" -- "A [Unicode whitespace character](@) is a character in the Unicode `" -- "Zs` general\ncategory, or a tab (`U+0009`" -- "), line feed (`U+000A`), form feed (`U+" -- "000C`), or\ncarriage return (`U+000D`).\n\n" -- "[Unicode whitespace](@) is a sequence of one or more\n" -- "[Unicode whitespace characters].\n\n" +- "\nA [Unicode whitespace character](@) is a character in the Unicode " +- "`Zs` general\ncategory, or a tab (" +- "`U+0009`), line feed (`U+000A`)," +- " form feed (`U+000C`), or\ncarriage return (" +- "`U+000D`).\n\n" +- "[Unicode whitespace](@)" +- " is a sequence of one or more\n[Unicode whitespace characters].\n\n" - "A [tab](@) is `U+0009`.\n" - "\nA [space](@) is `U+0020`.\n" -- "\n" -- "An [ASCII control character](@) is a character between `U+" -- "0000–1F` (both\nincluding) or " +- "\nAn [ASCII control character](@) is a character between " +- "`U+0000–1F` (both\nincluding) or " - "`U+007F`.\n\n" -- "An [ASCII punctuation character](@)\nis `!" -- "`, `\"`, `#`, `$`, `%`, `&`, `'`, `(" -- "`, `)`,\n`*`, `+`, `,`, `-`, `.`" -- ", `/` (U+0021–2F), \n`:`, " -- "`;`, `<`, `=`, `>`, `?`, `@`" +- "An [ASCII punctuation character](@)\nis `!`, `\"`, " +- "`#`, `$`, `%`, `&`, `'`, `(`, `" +- ")`,\n`*`, `+`, `,`, `-`, `.`, `/`" +- " (U+0021–2F), \n`:`, `;`, " +- "`<`, `=`, `>`, `?`, `@`" - " (U+003A–0040),\n`[`, `\\`, " - "`]`, `^`, `_`, `` ` `` (U+005B–" - "0060), \n`{`, `|`, `}`, or `~`" @@ -278,7 +278,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n
      bar\n"
     - "
    \n\n\n" - "````````````````````````````````\n" -- "\nNormally the `>` that begins a block quote may be followed\n" +- "\nNormally the `>`" +- " that begins a block quote may be followed\n" - "optionally by a space, which is not considered part of the\n" - "content. In the following case `>`" - " is followed by a tab,\n" @@ -427,10 +428,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Entity and numeric character references\n\n" - "Valid HTML entity references and numeric character references\n" - "can be used in place of the corresponding Unicode character,\nwith the following exceptions:\n\n" -- "- Entity and character references are not recognized in code\n" -- " blocks and code spans.\n\n" -- "- Entity and character references cannot stand in place of\n" -- " special characters that define structural elements in\n " +- "- Entity and character references are not recognized in code\n " +- blocks and code spans. +- "\n\n- Entity and character references cannot stand in place of\n " +- "special characters that define structural elements in\n " - "CommonMark. For example, although `*`" - " can be used\n in place of a literal `*` character, " - "`*` cannot replace\n `*`" @@ -452,9 +453,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

      & © Æ Ď\n" - "¾ ℋ ⅆ\n∲ ≧̸

    \n" - "````````````````````````````````\n" -- "\n\n[Decimal numeric character\nreferences](@)\n" -- "consist of `&#` + a string of 1--7 arabic" -- " digits + `;`" +- "\n\n[Decimal numeric character\nreferences](@)\nconsist of `&#`" +- " + a string of 1--7 arabic digits + `;`" - ". A\nnumeric character reference is parsed as the corresponding\n" - "Unicode character. Invalid Unicode code points will be replaced by\n" - "the REPLACEMENT CHARACTER (`U+FFFD`" @@ -465,10 +465,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "# Ӓ Ϡ �\n.\n" - "

    # Ӓ Ϡ �

    \n" - "````````````````````````````````\n" -- "\n\n[Hexadecimal numeric character\n" -- "references](@) consist of `&#` +\neither `X` or " -- "`x` + a string of 1-6 hexadecimal digits + `;" -- "`.\nThey too are parsed as the corresponding Unicode character (this\n" +- "\n\n[Hexadecimal numeric character\nreferences](@) consist of `&#`" +- " +\neither `X` or `x`" +- " + a string of 1-6 hexadecimal digits + `;`" +- ".\nThey too are parsed as the corresponding Unicode character (this\n" - "time specified with a hexadecimal numeral instead of decimal).\n\n" - "````````````````````````````````" - " example\n" @@ -576,8 +576,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n\n" - "# Blocks and inlines\n\n" -- "We can think of a document as a sequence of\n" -- "[blocks](@)" +- "We can think of a document as a sequence of\n[blocks](@)" - "---structural elements like paragraphs, block\n" - "quotations, lists, headings, rules, and code blocks. " - "Some blocks (like\n" @@ -586,17 +585,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " content---text,\n" - "links, emphasized text, images, code spans, and so on.\n\n" - "## Precedence\n\n" -- "Indicators of block structure always take precedence over indicators\nof inline structure. " -- "So, for example, the following is a list with\n" -- "two items, not a list with one item containing a code span:\n\n" +- "Indicators of block structure always take precedence over indicators\n" +- "of inline structure. So, for example, the following is a list with" +- "\ntwo items, not a list with one item containing a code span:\n\n" - "````````````````````````````````" - " example\n" - "- `one\n- two`\n.\n
      \n
    • `one
    • \n" - "
    • two`
    • \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "This means that parsing can proceed in two steps: first, the block\n" -- "structure of the document can be discerned; second, text lines inside\n" +- "\n\nThis means that parsing can proceed in two steps: first, the block" +- "\nstructure of the document can be discerned; second, text lines inside\n" - "paragraphs, headings, and other block constructs can be parsed for inline\n" - "structure. The second step requires information about link reference\n" - "definitions that will be available only at the end of the first\n" @@ -726,10 +724,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n" - "## ATX headings\n\n" -- "An [ATX heading](@)\n" -- "consists of a string of characters, parsed as inline content, between an\n" -- "opening sequence of 1--6 unescaped `#` characters and an optional" -- "\nclosing sequence of any number of unescaped `#`" +- "An [ATX heading](@)" +- "\nconsists of a string of characters, parsed as inline content, between an" +- "\nopening sequence of 1--6 unescaped `#`" +- " characters and an optional\nclosing sequence of any number of unescaped `#`" - " characters.\nThe opening sequence of `#`" - " characters must be followed by spaces or tabs, or\n" - "by the end of line. The optional closing sequence of `#`" @@ -754,9 +752,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n####### foo\n.\n

    ####### foo

    \n" - "````````````````````````````````\n" -- "\n\n" -- "At least one space or tab is required between the `#` characters and the" -- "\nheading's contents, unless the heading is empty. Note that many\n" +- "\n\nAt least one space or tab is required between the `#`" +- " characters and the\nheading'" +- "s contents, unless the heading is empty. Note that many\n" - "implementations currently do not require the space. However, the\n" - "space was required by the\n" - "[original ATX implementation](http://www.aaronsw.com/2002" @@ -823,8 +821,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n### foo ### \n.\n

    foo

    \n" - "````````````````````````````````\n" -- "\n\n" -- "A sequence of `#` characters with anything but spaces or tabs following it\n" +- "\n\nA sequence of `#`" +- " characters with anything but spaces or tabs following it\n" - "is not a closing sequence, but counts as part of the contents of the\n" - "heading:\n\n" - "````````````````````````````````" @@ -836,8 +834,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n# foo#\n.\n

    foo#

    \n" - "````````````````````````````````\n" -- "\n\nBackslash-escaped `#` characters do not count as part\n" -- "of the closing sequence:\n\n" +- "\n\nBackslash-escaped `#`" +- " characters do not count as part\nof the closing sequence:\n\n" - "````````````````````````````````" - " example\n" - "### foo \\###\n## foo #\\##\n# foo \\#\n.\n" @@ -868,7 +866,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n" - "## Setext headings\n\n" -- "A [setext heading](@) consists of one or more\n" +- "A [setext heading](@)" +- " consists of one or more\n" - "lines of text, not interrupted by a blank line, of which the first line" - " does not\nhave more than 3 spaces of indentation, followed by\n" - "a [setext heading underline]. The lines of text must be such\n" @@ -877,11 +876,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "interpretable as a [code fence], [ATX heading][ATX headings" - "],\n[block quote][block quotes], [thematic break][thematic breaks],\n" - "[list item][list items], or [HTML block][HTML blocks].\n\n" -- "A [setext heading underline](@) is a sequence of\n" -- "`=` characters or a sequence of `-` characters, with no more than " -- "3\nspaces of indentation and any number of trailing spaces or tabs.\n\n" -- "The heading is a level 1 heading if `=` characters are used in\n" -- "the [setext heading underline], and a level 2 heading if `-`" +- "A [setext heading underline](@) is a sequence of\n`=`" +- " characters or a sequence of `-`" +- " characters, with no more than 3\n" +- spaces of indentation and any number of trailing spaces or tabs. +- "\n\nThe heading is a level 1 heading if `=`" +- " characters are used in\nthe [setext heading underline]" +- ", and a level 2 heading if `-`" - "\ncharacters are used. 
The contents of the heading are the result\n" - "of parsing the preceding lines of text as CommonMark inline\ncontent.\n\n" - "In general, a setext heading need not be preceded or followed by a\n" @@ -917,9 +918,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Foo\n-------------------------\n\nFoo\n=\n.\n

    Foo

    \n" - "

    Foo

    \n" - "````````````````````````````````\n" -- "\n\n" -- "The heading content can be preceded by up to three spaces of indentation, and\n" -- "need not line up with the underlining:\n\n" +- "\n\nThe heading content can be preceded by up to three spaces of indentation, and" +- "\nneed not line up with the underlining:\n\n" - "````````````````````````````````" - " example\n" - " Foo\n---\n\n Foo\n-----\n\n Foo\n ===\n.\n" @@ -970,8 +970,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    <a title="a lot

    \n" - "

    of dashes"/>

    \n" - "````````````````````````````````\n" -- "\n\nThe setext heading underline cannot be a [lazy continuation\n" -- "line] in a list item or block quote:\n\n" +- "\n\nThe setext heading underline cannot be a [lazy continuation\nline]" +- " in a list item or block quote:\n\n" - "````````````````````````````````" - " example\n" - "> Foo\n---\n.\n
    \n

    Foo

    \n" @@ -1033,14 +1033,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "> foo\n-----\n.\n
    \n

    foo

    \n" - "
    \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "If you want a heading with `> foo` as its literal text, you" -- " can\nuse backslash escapes:\n\n" +- "\n\nIf you want a heading with `> foo`" +- " as its literal text, you can\nuse backslash escapes:\n\n" - "````````````````````````````````" - " example\n" - "\\> foo\n------\n.\n

    > foo

    \n" - "````````````````````````````````\n" -- "\n\n**Compatibility note:** Most existing Markdown implementations\n" +- "\n\n**Compatibility note:**" +- " Most existing Markdown implementations\n" - "do not allow the text of setext headings to span multiple lines.\n" - "But there is no consensus about how to interpret\n\n" - "``` markdown\nFoo\nbar\n---\nbaz\n```" @@ -1081,12 +1081,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n" - "## Indented code blocks\n\n" -- "An [indented code block](@) is composed of one or more\n" -- "[indented chunks] separated by blank lines.\nAn " -- "[indented chunk](@)" -- " is a sequence of non-blank lines,\n" -- "each preceded by four or more spaces of indentation. The contents of the code\n" -- "block are the literal contents of the lines, including trailing\n[line endings]" +- "An [indented code block](@)" +- " is composed of one or more\n[indented chunks] separated by blank lines.\n" +- "An [indented chunk](@) is a sequence of non-blank lines" +- ",\neach preceded by four or more spaces of indentation. The contents of the code" +- "\nblock are the literal contents of the lines, including trailing\n[line endings]" - ", minus four spaces of indentation.\nAn indented code block has no [" - "info string].\n\n" - "An indented code block cannot interrupt a paragraph, so there must be\n" @@ -1115,9 +1114,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n
      \n
    • bar
    • \n" - "
    \n\n\n" - "````````````````````````````````\n" -- "\n\n\n" -- "The contents of a code block are literal text, and do not get parsed\n" -- "as Markdown:\n\n" +- "\n\n\nThe contents of a code block are literal text, and do not get parsed" +- "\nas Markdown:\n\n" - "````````````````````````````````" - " example\n" - " \n *hi*\n\n - one\n.\n" @@ -1144,9 +1142,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\nFoo\n bar\n\n.\n

    Foo\nbar

    \n" - "````````````````````````````````\n" -- "\n\n" -- "However, any non-blank line with fewer than four spaces of indentation ends\n" -- "the code block immediately. So a paragraph may occur immediately\n" +- "\n\nHowever, any non-blank line with fewer than four spaces of indentation ends" +- "\nthe code block immediately. So a paragraph may occur immediately\n" - "after indented code:\n\n" - "````````````````````````````````" - " example\n" @@ -1183,14 +1180,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n\n" - "## Fenced code blocks\n\n" -- "A [code fence](@) is a sequence\n" -- "of at least three consecutive backtick characters (`` ` ``" +- "A [code fence](@)" +- " is a sequence\nof at least three consecutive backtick characters (`` ` ``" - ") or\ntildes (`~`" - "). (Tildes and backticks cannot be mixed.)\nA " - "[fenced code block](@)" - "\n" -- "begins with a code fence, preceded by up to three spaces of indentation.\n\n" -- "The line with the opening code fence may optionally contain some text\n" +- "begins with a code fence, preceded by up to three spaces of indentation." +- "\n\nThe line with the opening code fence may optionally contain some text\n" - "following the code fence; this is trimmed of leading and trailing\n" - "spaces or tabs and called the [info string](@)" - ". If the [info string] comes\n" @@ -1223,8 +1220,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "much less efficient, and there seems to be no real downside to the\n" - "behavior described here.)\n\n" - "A fenced code block may interrupt a paragraph, and does not require\n" -- "a blank line either before or after.\n\n" -- "The content of a code fence is treated as literal text, not parsed\n" +- a blank line either before or after. +- "\n\nThe content of a code fence is treated as literal text, not parsed\n" - "as inlines. 
The first word of the [info string]" - " is typically used to\n" - "specify the language of the code sample, and rendered in the `class`" @@ -1307,8 +1304,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n```\n```\n.\n
    \n" - "````````````````````````````````\n" -- "\n\nFences can be indented. " -- "If the opening fence is indented,\n" +- "\n\n" +- "Fences can be indented. If the opening fence is indented,\n" - "content lines will have equivalent opening indentation removed,\nif present:\n\n" - "````````````````````````````````" - " example\n" @@ -1334,9 +1331,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " ```\n aaa\n ```\n.\n
    ```\naaa\n"
     - "```\n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "Closing fences may be preceded by up to three spaces of indentation, and their\n" -- "indentation need not match that of the opening fence:\n\n" +- "\n\nClosing fences may be preceded by up to three spaces of indentation, and their" +- "\nindentation need not match that of the opening fence:\n\n" - "````````````````````````````````" - " example\n" - "```\naaa\n ```\n.\n
    aaa\n"
    @@ -1434,19 +1430,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````````````````````\n"
     - "\n\n\n"
     - "## HTML blocks\n\n"
    -- "An [HTML block](@) is a group of lines that is treated\n"
    -- "as raw HTML (and will not be escaped in HTML output).\n\n"
    -- "There are seven kinds of [HTML block], which can be defined by their\n"
    -- "start and end conditions.  The block begins with a line that meets a\n"
    -- "[start condition](@)"
    +- "An [HTML block](@)"
    +- " is a group of lines that is treated\n"
    +- as raw HTML (and will not be escaped in HTML output).
    +- "\n\nThere are seven kinds of [HTML block], which can be defined by their"
    +- "\nstart and end conditions.  The block begins with a line that meets a"
    +- "\n[start condition](@)"
     - " (after up to three optional spaces of indentation).\n"
     - "It ends with the first subsequent line that meets a matching\n"
     - "[end condition](@), or the last line of the document, or the last"
     - " line of\nthe [container block](#container-blocks)"
     - " containing the current HTML\nblock, if no line is encountered that meets the ["
     - "end condition].  If\nthe first line meets both the [start condition]"
    -- " and the [end\ncondition], the block will contain just that line.\n\n"
    -- "1.  **Start condition:**  line begins with the string ``"
     - ", or the end of the line.\\\n**End condition:**"
    @@ -1457,14 +1455,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "**End condition:**  line contains the string `-->`.\n\n"
     - "3.  **Start condition:** line begins with the string ``.\n\n"
    -- "4.  **Start condition:** line begins with the string ``.\n\n"
    -- "5.  **Start condition:**  line begins with the string\n``.\n\n"
    +- "5.  "
    +- "**Start condition:**  line begins with the string\n``.\n\n"
     - "6.  "
    -- "**Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:**"
     - " line is followed by a [blank line].\n\n"
     - "7.  "
    -- "**Start condition:**  line begins with a complete [open tag]\n"
    -- "(with any [tag name] other than `pre`, `script`,\n"
    -- "`style`, or `textarea`"
    +- "**Start condition:**"
    +- "  line begins with a complete [open tag]\n(with any [tag name]"
    +- " other than `pre`, `script`,\n`style`, or `textarea`"
     - ") or a complete [closing tag],\n"
     - "followed by zero or more spaces and tabs, followed by the end of the"
     - " line.\\\n**End condition:** line is followed by a [blank line].\n\n"
    -- "HTML blocks continue until they are closed by their appropriate\n"
    -- "[end condition], or the last line of the document or other "
    +- "HTML blocks continue until they are closed by their appropriate\n[end condition]"
    +- ", or the last line of the document or other "
     - "[container\nblock](#container-blocks).  This means any HTML "
     - "**within an HTML\nblock**"
     - " that might otherwise be recognised as a start condition will\n"
    @@ -1606,9 +1606,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "
    \n``` c\nint x = 33;\n```\n.\n" - "
    \n``` c\nint x = 33;\n```\n" - "````````````````````````````````\n" -- "\n\n" -- "To start an [HTML block] with a tag that is *not* in" -- " the\n" +- "\n\nTo start an [HTML block] with a tag that is *not*" +- " in the\n" - "list of block-level tags in (6), you must put the tag by\n" - "itself on the first line (and it must be complete):\n\n" - "````````````````````````````````" @@ -1653,8 +1652,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n*foo*\n\n\n.\n\n" - "

    foo

    \n
    \n" - "````````````````````````````````\n" -- "\n\nFinally, in this case, the `` tags are interpreted\n" -- "as [raw HTML] *inside*" +- "\n\nFinally, in this case, the ``" +- " tags are interpreted\nas [raw HTML] *inside*" - " the CommonMark paragraph. (Because\n" - "the tag is not on a line by itself, we get inline HTML\n" - "rather than an [HTML block].)\n\n" @@ -1663,9 +1662,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo*\n.\n" - "

    foo

    \n" - "````````````````````````````````\n" -- "\n\nHTML tags designed to contain literal content\n" -- "(`pre`, `script`, `style`, `textarea`), comments, processing" -- " instructions,\nand declarations are treated somewhat differently.\n" +- "\n\nHTML tags designed to contain literal content\n(`pre`, `script`, " +- "`style`, `textarea`" +- "), comments, processing instructions,\nand declarations are treated somewhat differently.\n" - "Instead of ending at the first blank line, these blocks\n" - "end at the first line containing a corresponding end tag.\n" - "As a result, these blocks can contain blank lines:\n\n" @@ -1793,9 +1792,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n\n
    \n.\n
    \n" - "
    <div>\n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "An HTML block of types 1--6 can interrupt a paragraph, and need" -- " not be\npreceded by a blank line.\n\n" +- "\n\nAn HTML block of types 1--" +- "6 can interrupt a paragraph, and need not be\n" +- "preceded by a blank line.\n\n" - "````````````````````````````````" - " example\n" - "Foo\n
    \nbar\n
    \n.\n

    Foo

    \n" @@ -1817,19 +1816,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \nbaz

    \n" - "````````````````````````````````\n" - "\n\nThis rule differs from John Gruber's original Markdown syntax\n" -- "specification, which says:\n\n" -- "> The only restrictions are that block-level HTML elements —\n" -- "> e.g. `
    `, ``, `
    `,"
    -- " `

    `" -- ", etc. — must be separated from\n> " +- "specification, which says:" +- "\n\n> The only restrictions are that block-level HTML elements —\n> " +- "e.g. `

    `, `
    `, `
    `, `<"
    +- "p>`, etc. — must be separated from\n> "
     - "surrounding content by blank lines, and the start and end tags of the"
     - "\n> block should not be indented with spaces or tabs.\n\n"
     - "In some ways Gruber's rule is more restrictive than the one given\n"
     - "here:\n\n"
    -- "- It requires that an HTML block be preceded by a blank line.\n"
    -- "- It does not allow the start tag to be indented.\n"
    -- "- It requires a matching end tag, which it also does not allow to\n"
    -- "  be indented.\n\n"
    +- "- It requires that an HTML block be preceded by a blank line."
    +- "\n- It does not allow the start tag to be indented.\n"
    +- "- It requires a matching end tag, which it also does not allow to"
    +- "\n  be indented.\n\n"
     - "Most Markdown implementations (including some of Gruber's own) do not\n"
     - "respect all of these restrictions.\n\n"
     - "There is one respect, however, in which Gruber's rule is more liberal"
    @@ -1858,8 +1856,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "`markdown=1`"
     - ".  The rule given above seems a simpler and\n"
     - "more elegant way of achieving the same expressive power, which is also\n"
    -- "much simpler to parse.\n\n"
    -- "The main potential drawback is that one can no longer paste HTML\n"
    +- much simpler to parse.
    +- "\n\nThe main potential drawback is that one can no longer paste HTML\n"
     - "blocks into Markdown documents with 100% reliability.  However,\n"
     - "*in most cases*"
     - " this will work fine, because the blank lines in\n"
    @@ -1882,14 +1880,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "</td>\n
    \n \n" - "
    \n" - "````````````````````````````````\n" -- "\n\nFortunately, blank lines are usually not necessary and can be\ndeleted. " -- "The exception is inside `
    `"
    +- "\n\nFortunately, blank lines are usually not necessary and can be\n"
    +- "deleted.  The exception is inside `
    `"
     - " tags, but as described\n[above][HTML blocks]"
     - ", raw HTML blocks starting with `
    `\n*can* contain blank lines.\n\n"
     - "## Link reference definitions\n\n"
    -- "A [link reference definition](@)\n"
    -- "consists of a [link label], optionally preceded by up to three spaces of"
    -- "\nindentation, followed\nby a colon (`:`"
    +- "A [link reference definition](@)"
    +- "\nconsists of a [link label]"
    +- ", optionally preceded by up to three spaces of\nindentation, followed\n"
    +- "by a colon (`:`"
     - "), optional spaces or tabs (including up to one\n[line ending]), a ["
     - "link destination],\noptional spaces or tabs (including up to one\n[line ending]"
     - "), and an optional [link\ntitle]"
    @@ -2183,31 +2182,32 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - " are meta-containers for [list items].\n\n"
     - "We define the syntax for container blocks recursively.  The general\n"
     - "form of the definition is:\n\n"
    -- "> If X is a sequence of blocks, then the result of\n"
    -- "> transforming X in such-and-such a way is a container of type Y"
    -- "\n> with these blocks as its content.\n\n"
    -- "So, we explain what counts as a block quote or list item by explaining\n"
    -- how these can be *generated*
    +- "> If X is a sequence of blocks, then the result of\n> "
    +- transforming X in such-and-such a way is a container of type Y
    +- "\n> with these blocks as its content."
    +- "\n\nSo, we explain what counts as a block quote or list item by explaining"
    +- "\nhow these can be *generated*"
     - " from their contents. This should suffice\n"
     - "to define the syntax, although it does not give a recipe for *parsing"
     - "*\nthese constructions.  (A recipe is provided below in the section entitled\n"
     - "[A parsing strategy](#appendix-a-parsing-strategy).)\n\n"
     - "## Block quotes\n\n"
    -- "A [block quote marker](@),\n"
    -- "optionally preceded by up to three spaces of indentation,\n"
    +- "A [block quote marker](@)"
    +- ",\noptionally preceded by up to three spaces of indentation,\n"
     - "consists of (a) the character `>` together with a following space of"
     - "\nindentation, or (b) a single character `>`"
     - " not followed by a space of\nindentation.\n\n"
     - "The following rules define [block quotes]:\n\n"
    -- 1.  **Basic case.
    -- "**  If a string of lines *Ls*"
    +- "1.  "
    +- "**Basic case.**  If a string of lines *Ls*"
     - " constitute a sequence\n    of blocks *Bs*"
     - ", then the result of prepending a [block quote\n    marker]"
     - " to the beginning of each line in *Ls*\n    is a "
     - "[block quote](#block-quotes) containing *Bs*.\n\n"
    -- 2.  **Laziness.
    -- "**  If a string of lines *Ls* constitute a "
    -- "[block\n    quote](#block-quotes) with contents *Bs*"
    +- "2.  "
    +- "**Laziness.**  If a string of lines *Ls*"
    +- " constitute a [block\n    quote](#block-quotes) with contents "
    +- "*Bs*"
     - ", then the result of deleting\n    the initial [block quote marker]"
     - " from one or\n    "
     - more lines in which the next character other than a space or tab after the
    @@ -2217,10 +2217,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - " is text\n    "
     - "that will be parsed as part of the content of a paragraph, but does"
     - "\n    not occur at the beginning of the paragraph.\n\n"
    -- 3.  **Consecutiveness.
    -- "**  A document cannot contain two [block\n    quotes]"
    -- " in a row unless there is a [blank line] between them.\n\n"
    -- "Nothing else counts as a [block quote](#block-quotes).\n"
    +- "3.  "
    +- "**Consecutiveness.**"
    +- "  A document cannot contain two [block\n    quotes]"
    +- " in a row unless there is a [blank line] between them."
    +- "\n\nNothing else counts as a [block quote](#block-quotes).\n"
     - "\nHere is a simple example:\n"
     - "\n"
     - "````````````````````````````````"
    @@ -2253,8 +2254,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "
    > # Foo\n> bar\n"
     - "> baz\n
    \n" - "````````````````````````````````\n" -- "\n\nThe Laziness clause allows us to omit the `>` before\n" -- "[paragraph continuation text]:\n\n" +- "\n\nThe Laziness clause allows us to omit the `>`" +- " before\n[paragraph continuation text]:\n\n" - "````````````````````````````````" - " example\n" - "> # Foo\n> bar\nbaz\n.\n
    \n" @@ -2288,9 +2289,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
  • foo
  • \n\n
    \n
      \n" - "
    • bar
    • \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "For the same reason, we can't omit the `> ` in front of" -- "\nsubsequent lines of an indented or fenced code block:\n\n" +- "\n\nFor the same reason, we can't omit the `> `" +- " in front of\nsubsequent lines of an indented or fenced code block:\n\n" - "````````````````````````````````" - " example\n" - "> foo\n bar\n.\n
    \n" @@ -2313,9 +2313,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\nTo see why, note that in\n" - "\n```markdown\n> foo\n> - bar\n```" -- "\n\n" -- "the `- bar` is indented too far to start a list, and " -- "can't\nbe an indented code block because indented code blocks cannot\n" +- "\n\nthe `- bar`" +- " is indented too far to start a list, and can't\n" +- "be an indented code block because indented code blocks cannot\n" - "interrupt paragraphs, so it is [paragraph continuation text].\n\n" - "A block quote can be empty:\n" - "\n" @@ -2340,8 +2340,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "> foo\n\n> bar\n.\n
    \n

    foo

    \n" - "
    \n
    \n

    bar

    \n
    \n" - "````````````````````````````````\n" -- "\n\n(Most current Markdown implementations, including John Gruber's\n" -- "original `Markdown.pl`" +- "\n\n(Most current Markdown implementations, including John Gruber's\noriginal " +- "`Markdown.pl`" - ", will parse this example as a single block quote\n" - "with two paragraphs. But it seems better to allow the author to decide\n" - "whether two block quotes or one are wanted.)\n\n" @@ -2421,18 +2421,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n\n" - "## List items\n\n" -- "A [list marker](@) is a\n" -- "[bullet list marker] or an [ordered list marker].\n\n" -- "A [bullet list marker](@)\n" -- "is a `-`, `+`, or `*` character.\n\n" -- "An [ordered list marker](@)\n" -- "is a sequence of 1--9 arabic digits (`0-9`)" -- ", followed by either a\n`.` character or a `)`" +- "A [list marker](@)" +- " is a\n[bullet list marker] or an [ordered list marker]." +- "\n\nA [bullet list marker](@)\nis a `-`, `+`" +- ", or `*` character.\n\n" +- "An [ordered list marker](@)" +- "\nis a sequence of 1--9 arabic digits (" +- "`0-9`), followed by either a\n`.` character or a " +- "`)`" - " character. (The reason for the length\n" - "limit is that with 10 digits we start seeing integer overflows\n" - "in some browsers.)\n\nThe following rules define [list items]:\n\n" -- 1. **Basic case. -- "** If a sequence of lines *Ls*" +- "1. " +- "**Basic case.** If a sequence of lines *Ls*" - " constitute a sequence of\n blocks *Bs*" - " starting with a character other than a space or tab, and *M* is" - "\n a list marker of width *W* followed by 1 ≤ " @@ -2445,15 +2446,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "(bullet or ordered) is determined by the type of its list marker.\n " - "If the list item is ordered, then it is also assigned a start\n " - "number, based on the ordered list marker.\n\n Exceptions:\n\n " -- "1. 
When the first list item in a [list] interrupts\n" -- " a paragraph---that is, when it starts on a line that would" -- "\n otherwise count as [paragraph continuation text]---then (a)\n " +- "1. When the first list item in a [list] interrupts\n " +- "a paragraph---that is, when it starts on a line that would\n " +- "otherwise count as [paragraph continuation text]---then (a)\n " - "the lines *Ls* must not begin with a blank line, and (" - "b) if\n " - "the list item is ordered, the start number must be 1.\n " -- "2. " -- "If any line is a [thematic break][thematic breaks] then" -- "\n that line is not a list item.\n\n" +- "2. If any line is a [thematic break][thematic breaks]" +- " then\n that line is not a list item.\n\n" - "For example, let *Ls* be the lines\n" - "\n" - "````````````````````````````````" @@ -2463,8 +2463,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    indented code\n
    \n
    \n" - "

    A block quote.

    \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "And let *M* be the marker `1.`, and *N*" +- "\n\nAnd let *M* be the marker `1.`, and " +- "*N*" - " = 2. Then rule #1 says\n" - "that the following is an ordered list item with start number 1,\n" - "and the same contents as *Ls*:\n\n" @@ -2510,8 +2510,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    one

    \n

    two

    \n\n" - "\n" - "````````````````````````````````\n" -- "\n\n" -- "It is tempting to think of this in terms of columns: the continuation\n" +- "\n\nIt is tempting to think of this in terms of columns: the continuation" +- "\n" - blocks must be indented at least to the column of the first character other than - "\n" - "a space or tab after the list marker. " @@ -2527,12 +2527,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    one

    \n

    two

    \n\n" - "\n
    \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "Here `two` occurs in the same column as the list marker `1.`,\n" -- "but is actually contained in the list item, because there is\n" -- "sufficient indentation after the last containing blockquote marker.\n\n" -- "The converse is also possible. " -- "In the following example, the word `two`" +- "\n\nHere `two` occurs in the same column as the list marker " +- "`1.`" +- ",\nbut is actually contained in the list item, because there is\n" +- sufficient indentation after the last containing blockquote marker. +- "\n\nThe converse is also possible. In the following example, the word " +- "`two`" - "\n" - "occurs far to the right of the initial text of the list item, `" - "one`, but\n" @@ -2544,9 +2544,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n
      \n
    • one
    • \n
    \n" - "

    two

    \n
    \n\n" - "````````````````````````````````\n" -- "\n\n" -- "Note that at least one space or tab is needed between the list marker and\n" -- "any following content, so these are not list items:\n\n" +- "\n\nNote that at least one space or tab is needed between the list marker and" +- "\nany following content, so these are not list items:\n\n" - "````````````````````````````````" - " example\n" - "-one\n\n2.two\n.\n

    -one

    \n" @@ -2610,8 +2609,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " example\n" - "-1. not ok\n.\n

    -1. not ok

    \n" - "````````````````````````````````\n" -- "\n\n\n2. **Item starting with indented code." -- "** If a sequence of lines *Ls*" +- "\n\n\n2. **Item starting with indented code.**" +- " If a sequence of lines *Ls*" - "\n constitute a sequence of blocks *Bs*" - " starting with an indented code\n block, and *M*" - " is a list marker of width *W*" @@ -2644,10 +2643,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    bar\n
    \n\n" - "\n" - "````````````````````````````````\n" -- "\n\n" -- If the *first* block in the list item is an indented code block -- ",\nthen by rule #2, the contents must be preceded by *one*" -- " space of indentation\nafter the list marker:\n\n" +- "\n\nIf the *first*" +- " block in the list item is an indented code block,\n" +- "then by rule #2, the contents must be preceded by *one* space" +- " of indentation\nafter the list marker:\n\n" - "````````````````````````````````" - " example\n" - " indented code\n\nparagraph\n\n more code\n.\n" @@ -2706,8 +2705,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n

    bar

    \n\n" - "\n" - "````````````````````````````````\n" -- "\n\n3. **Item starting with a blank line." -- "** If a sequence of lines *Ls*" +- "\n\n3. **Item starting with a blank line.**" +- " If a sequence of lines *Ls*" - "\n starting with a single [blank line] constitute a (possibly empty)" - "\n sequence of blocks *Bs*, and *M*" - " is a list marker of width *W*" @@ -2784,8 +2783,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo\n*\n\nfoo\n1.\n.\n

    foo\n*

    \n" - "

    foo\n1.

    \n" - "````````````````````````````````\n" -- "\n\n4. **Indentation." -- "** If a sequence of lines *Ls*" +- "\n\n4. **Indentation.** If a sequence of lines " +- "*Ls*" - " constitutes a list item\n " - "according to rule #1, #2, or #3, then the result" - " of preceding each line\n of *Ls*" @@ -2833,8 +2832,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " indented code\n\n > A block quote.\n" - "
    \n" - "````````````````````````````````\n" -- "\n\n\n5. **Laziness." -- "** If a string of lines *Ls* constitute a " +- "\n\n\n5. **Laziness.**" +- " If a string of lines *Ls* constitute a " - "[list\n item](#list-items) with contents *Bs*" - ", then the result of deleting\n " - "some or all of the indentation from one or more lines in which the\n " @@ -2876,14 +2875,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "continued here.

    \n\n\n\n" - "\n" - "````````````````````````````````\n" -- "\n\n\n6. **That's all." -- "** Nothing that is not counted as a list item by rules\n #1" -- "--5 counts as a [list item](#list-items).\n\n" -- "The rules for sublists follow from the general rules\n" -- "[above][List items]. A sublist must be indented the same number" -- "\nof spaces of indentation a paragraph would need to be in order to be included" -- "\nin the list item.\n\n" -- "So, in this case we need two spaces indent:\n" +- "\n\n\n6. **That's all.**" +- " Nothing that is not counted as a list item by rules\n #1--" +- "5 counts as a [list item](#list-items).\n\n" +- "The rules for sublists follow from the general rules\n[above][List items" +- "]. A sublist must be indented the same number\n" +- "of spaces of indentation a paragraph would need to be in order to be included\n" +- "in the list item.\n\nSo, in this case we need two spaces indent:\n" - "\n" - "````````````````````````````````" - " example\n" @@ -2942,25 +2940,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n" - "### Motivation\n\n" - "John Gruber's Markdown spec says the following about list items:\n\n" -- "1. " -- "\"List markers typically start at the left margin, but may be indented\n" -- " by up to three spaces. List markers must be followed by one or more" +- "1. \"" +- "List markers typically start at the left margin, but may be indented\n " +- by up to three spaces. List markers must be followed by one or more - "\n spaces or a tab.\"\n\n" -- "2. " -- "\"To make lists look nice, you can wrap items with hanging indents....\n" -- " But if you don't want to, you don't have to.\"\n\n" -- "3. \"List items may consist of multiple paragraphs. Each subsequent\n" -- " paragraph in a list item must be indented by either 4 spaces or" -- " one\n tab.\"\n\n" -- "4. \"It looks nice if you indent every line of the subsequent paragraphs,\n" -- " but here again, Markdown will allow you to be lazy.\"\n\n" -- "5. 
" -- "\"To put a blockquote within a list item, the blockquote's `>`\n" -- " delimiters need to be indented.\"\n\n" -- "6. " -- "\"To put a code block within a list item, the code block needs to be" -- "\n indented twice — 8 spaces or two tabs.\"\n\n" -- "These rules specify that a paragraph under a list item must be indented\n" +- "2. \"" +- "To make lists look nice, you can wrap items with hanging indents....\n " +- "But if you don't want to, you don't have to.\"\n\n" +- "3. \"List items may consist of multiple paragraphs. Each subsequent\n " +- paragraph in a list item must be indented by either 4 spaces or one +- "\n tab.\"\n\n" +- "4. \"It looks nice if you indent every line of the subsequent paragraphs," +- "\n but here again, Markdown will allow you to be lazy.\"" +- "\n\n5. \"" +- "To put a blockquote within a list item, the blockquote's `>`" +- "\n delimiters need to be indented.\"\n\n" +- "6. \"" +- "To put a code block within a list item, the code block needs to be" +- "\n indented twice — 8 spaces or two tabs.\"" +- "\n\nThese rules specify that a paragraph under a list item must be indented\n" - "four spaces (presumably, from the left margin, rather than the start of" - "\n" - "the list marker, but this is not said), and that code under a list" @@ -3019,12 +3017,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```\n\n" - "The choice of four spaces is arbitrary. " - "It can be learned, but it is\n" -- "not likely to be guessed, and it trips up beginners regularly.\n\n" -- "Would it help to adopt a two-space rule? " -- "The problem is that such\n" -- "a rule, together with the rule allowing up to three spaces of indentation for\n" -- "the initial list marker, allows text that is indented *less than* the" -- "\noriginal list marker to be included in the list item. For example,\n" +- "not likely to be guessed, and it trips up beginners regularly." +- "\n\n" +- Would it help to adopt a two-space rule? 
The problem is that such +- "\na rule, together with the rule allowing up to three spaces of indentation for" +- "\nthe initial list marker, allows text that is indented *less than*" +- " the\noriginal list marker to be included in the list item. For example,\n" - "`Markdown.pl` parses\n\n" - "``` markdown\n - one\n\n two\n```" - "\n\nas a single list item, with `two` a continuation paragraph:\n" @@ -3044,19 +3042,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "as a list item with a subparagraph, even though the paragraph `bar`" - "\nis not indented as far as the first paragraph `foo`:\n\n" - "``` markdown\n 10. foo\n\n bar \n```" -- "\n\n" -- "Arguably this text does read like a list item with `bar` as a" -- " subparagraph,\n" +- "\n\nArguably this text does read like a list item with `bar`" +- " as a subparagraph,\n" - "which may count in favor of the proposal. " - "However, on this proposal indented\n" - "code would have to be indented six spaces after the list marker. " - "And this\nwould break a lot of existing Markdown, which has the pattern:\n\n" - "``` markdown\n1. foo\n\n indented code\n```" -- "\n\nwhere the code is indented eight spaces. " +- "\n\n" +- "where the code is indented eight spaces. " - "The spec above, by contrast, will\n" - "parse this text as expected, since the code block's indentation is measured\n" - "from the beginning of `foo`.\n\n" -- "The one case that needs special treatment is a list item that *starts*\n" +- The one case that needs special treatment is a list item that *starts* +- "\n" - "with indented code. How much indentation is required in that case, since" - "\nwe don't have a \"first paragraph\"" - " to measure from? 
Rule #2 simply stipulates\n" @@ -3066,24 +3065,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "four-space rule in cases where the list marker plus its initial indentation\n" - "takes four spaces (a common case), but diverge in other cases.\n\n" - "## Lists\n\n" -- "A [list](@) is a sequence of one or more\n" -- "list items [of the same type]. The list items\n" -- "may be separated by any number of blank lines.\n\n" -- "Two list items are [of the same type](@)\n" -- "if they begin with a [list marker] of the same type.\n" +- "A [list](@)" +- " is a sequence of one or more\nlist items [of the same type]" +- ". The list items\nmay be separated by any number of blank lines." +- "\n\nTwo list items are [of the same type](@)" +- "\nif they begin with a [list marker] of the same type.\n" - "Two list markers are of the\n" - "same type if (a) they are bullet list markers using the same character\n" - "(`-`, `+`, or `*`) or (b) they are" - " ordered list numbers with the same\ndelimiter (either `.` or `)`).\n\n" -- "A list is an [ordered list](@)\n" -- "if its constituent list items begin with\n[ordered list markers], and a\n" -- "[bullet list](@)" +- "A list is an [ordered list](@)" +- "\nif its constituent list items begin with\n[ordered list markers], and a" +- "\n[bullet list](@)" - " if its constituent list\nitems begin with [bullet list markers].\n\n" -- "The [start number](@)\n" -- "of an [ordered list] is determined by the list number of\n" +- "The [start number](@)" +- "\nof an [ordered list] is determined by the list number of\n" - "its initial list item. The numbers of subsequent list items are\n" - "disregarded.\n\n" -- "A list is [loose](@) if any of its constituent\n" +- "A list is [loose](@)" +- " if any of its constituent\n" - "list items are separated by blank lines, or if any of its constituent\n" - "list items directly contain two block-level elements with a blank line\n" - "between them. 
Otherwise a list is [tight](@)" @@ -3112,8 +3112,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
      \n
    • bar
    • \n
    • baz
    • \n" - "
    \n" - "````````````````````````````````\n" -- "\n" -- "`Markdown.pl` does not allow this, through fear of triggering a list\n" +- "\n`Markdown.pl`" +- " does not allow this, through fear of triggering a list\n" - "via a numeral in a hard-wrapped line:\n\n" - "``` markdown\nThe number of windows in my house is\n14. " - "The number of doors is 6.\n```\n\n" @@ -3124,8 +3124,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "to start lists without blank lines:\n\n" - "``` markdown\nI need to buy\n- new shoes\n- a coat\n" - "- a plane ticket\n```\n\nSecond, we are attracted to a\n\n" -- "> [principle of uniformity](@):\n" -- "> if a chunk of text has a certain\n> " +- "> " +- "[principle of uniformity](@)" +- ":\n> if a chunk of text has a certain\n> " - "meaning, it will continue to have the same meaning when put into a" - "\n> container block (such as a list item or blockquote).\n\n" - "(Indeed, the spec for [list items] and [block quotes] " @@ -3144,8 +3145,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " requires us to allow this outside list items as\nwell. (" - "[reStructuredText](https://docutils.sourceforge.net/rst.html)" - "\ntakes a different approach, requiring blank lines before lists\n" -- "even inside other list items.)\n\n" -- "In order to solve the problem of unwanted lists in paragraphs with\n" +- even inside other list items.) +- "\n\nIn order to solve the problem of unwanted lists in paragraphs with\n" - "hard-wrapped numerals, we allow only lists starting with `1` to" - "\ninterrupt paragraphs. Thus,\n\n" - "````````````````````````````````" @@ -3202,8 +3203,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n\n\n\n" - "
    code\n
    \n" - "````````````````````````````````\n" -- "\n\nList items need not be indented to the same level. " -- "The following\nlist items will be treated as items at the same list level,\n" +- "\n\nList items need not be indented to the same level. The following" +- "\nlist items will be treated as items at the same list level,\n" - "since none is indented enough to belong to the previous list\nitem:\n\n" - "````````````````````````````````" - " example\n" @@ -3232,9 +3233,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
  • b
  • \n
  • c
  • \n
  • d\n" - "- e
  • \n\n" - "````````````````````````````````\n" -- "\n" -- "And here, `3. c` is treated as in indented code block" -- ",\nbecause it is indented four spaces and preceded by a\nblank line.\n\n" +- "\nAnd here, `3. c`" +- " is treated as in indented code block,\n" +- "because it is indented four spaces and preceded by a\nblank line.\n\n" - "````````````````````````````````" - " example\n" - "1. a\n\n 2. b\n\n 3. c\n" @@ -3360,24 +3361,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`hi`lo`\n.\n" - "

    hilo`

    \n" - "````````````````````````````````\n" -- "\n" -- "`hi` is parsed as code, leaving the backtick at the end as" -- " a literal\nbacktick.\n\n\n\n" +- "\n`hi`" +- " is parsed as code, leaving the backtick at the end as a literal\n" +- "backtick.\n\n\n\n" - "## Code spans\n\n" -- "A [backtick string](@)\n" -- "is a string of one or more backtick characters (`` ` ``) that" -- " is neither\npreceded nor followed by a backtick.\n\n" +- "A [backtick string](@)" +- "\nis a string of one or more backtick characters (`` ` ``" +- ") that is neither\npreceded nor followed by a backtick.\n\n" - "A [code span](@) begins with a backtick string and ends with" - "\n" - a backtick string of equal length. The contents of the code span are - "\nthe characters between these two backtick strings, normalized in the\n" - "following ways:\n\n" - "- First, [line endings] are converted to [spaces].\n" -- "- If the resulting string both begins *and* ends with a [space]\n" -- " character, but does not consist entirely of [space]\n " -- "characters, a single [space] character is removed from the\n " -- "front and back. This allows you to include code that begins\n " -- "or ends with backtick characters, which must be separated by\n " +- "- If the resulting string both begins *and*" +- " ends with a [space]\n character, but does not consist entirely of [" +- "space]\n characters, a single [space] character is removed from the" +- "\n front and back. This allows you to include code that begins" +- "\n or ends with backtick characters, which must be separated by\n " - "whitespace from the opening or closing backtick strings.\n\n" - "This is a simple code span:\n" - "\n" @@ -3440,12 +3441,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`foo bar \nbaz`\n.\n" - "

    foo bar baz

    \n" - "````````````````````````````````\n" -- "\nNote that browsers will typically collapse consecutive spaces\n" -- "when rendering ``" +- "\nNote that browsers will typically collapse consecutive spaces\nwhen rendering ``" - " elements, so it is recommended that\nthe following CSS be used:\n\n " - "code{white-space: pre-wrap;}\n" -- "\n\nNote that backslash escapes do not work in code spans. " -- "All backslashes\nare treated literally:\n\n" +- "\n\nNote that backslash escapes do not work in code spans. All backslashes" +- "\nare treated literally:\n\n" - "````````````````````````````````" - " example\n" - "`foo\\`bar`\n.\n" @@ -3531,9 +3531,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n" - "## Emphasis and strong emphasis\n\n" -- "John Gruber's original [Markdown syntax\n" -- "description](https://daringfireball.net/projects/markdown/syntax#em" -- ") says:\n\n" +- "John Gruber's original " +- "[Markdown syntax\ndescription" +- "](https://daringfireball.net/projects/markdown/syntax#em)" +- " says:\n\n" - "> Markdown treats asterisks (`*`) and underscores (`_`) as indicators of" - "\n> emphasis. Text wrapped with one `*` or `_`" - " will be wrapped with an HTML\n> `` tag; double " @@ -3551,36 +3552,35 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "is clear and they are useful (especially in contexts like bibliography\nentries):\n\n" - "``` markdown\n*emph *with emph* in it*\n" - "**strong **with strong** in it**\n```\n\n" -- "Many implementations have also restricted intraword emphasis to\n" -- "the `*`" +- "Many implementations have also restricted intraword emphasis to\nthe `*`" - " forms, to avoid unwanted emphasis in words containing\n" - "internal underscores. 
(It is best practice to put these in code\n" - "spans, but users often do not.)\n\n" - "``` markdown\ninternal emphasis: foo*bar*baz\n" - "no emphasis: foo_bar_baz\n```\n\n" - "The rules given below capture all of these patterns, while allowing\n" -- "for efficient parsing strategies that do not backtrack.\n\n" -- "First, some definitions. A [delimiter run](@) is either\n" -- "a sequence of one or more `*`" +- for efficient parsing strategies that do not backtrack. +- "\n\nFirst, some definitions. A [delimiter run](@)" +- " is either\na sequence of one or more `*`" - " characters that is not preceded or\n" - "followed by a non-backslash-escaped `*` character, or a" - " sequence\nof one or more `_`" - " characters that is not preceded or followed by\na non-backslash-escaped " - "`_` character.\n\n" -- "A [left-flanking delimiter run](@) is\n" -- "a [delimiter run] that is (1) not followed by [Unicode whitespace" -- "],\nand either (2a) not followed by a [Unicode punctuation character]" -- ", or\n(2b) followed by a [Unicode punctuation character] and" -- "\npreceded by [Unicode whitespace] or a [Unicode punctuation character].\n" -- "For purposes of this definition, the beginning and the end of\n" -- "the line count as Unicode whitespace.\n\n" -- "A [right-flanking delimiter run](@) is\n" -- "a [delimiter run] that is (1) not preceded by [Unicode whitespace" -- "],\nand either (2a) not preceded by a [Unicode punctuation character]" -- ", or\n(2b) preceded by a [Unicode punctuation character] and" -- "\nfollowed by [Unicode whitespace] or a [Unicode punctuation character].\n" -- "For purposes of this definition, the beginning and the end of\n" -- "the line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n" +- "A [left-flanking delimiter run](@)" +- " is\na [delimiter run] that is (1) not followed by [" +- "Unicode whitespace],\nand either (2a) not followed by a [" +- "Unicode punctuation character], or\n(2b) followed by a [" +- "Unicode 
punctuation character] and\npreceded by [Unicode whitespace] or a [" +- "Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of" +- "\nthe line count as Unicode whitespace.\n\n" +- "A [right-flanking delimiter run](@)" +- " is\na [delimiter run] that is (1) not preceded by [" +- "Unicode whitespace],\nand either (2a) not preceded by a [" +- "Unicode punctuation character], or\n(2b) preceded by a [" +- "Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [" +- "Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of" +- "\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n" - " - left-flanking but not right-flanking:\n\n ```\n" - " ***abc\n _abc\n **\"abc\"\n " - " _\"abc\"\n ```\n\n" @@ -3600,48 +3600,59 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " and its rules for distinguishing left- and right-flanking runs\n" - "are a bit more complex than the ones given here.)\n\n" - "The following rules define emphasis and strong emphasis:\n\n" -- "1. A single `*` character [can open emphasis](@)\n" -- " iff (if and only if) it is part of a [left-" -- "flanking delimiter run].\n\n" -- "2. A single `_` character [can open emphasis] iff\n" -- " it is part of a [left-flanking delimiter run]\n " +- "1. " +- "A single `*` character [can open emphasis](@)" +- "\n iff (if and only if) it is part of a [" +- "left-flanking delimiter run].\n\n" +- "2. " +- "A single `_`" +- " character [can open emphasis] iff\n it is part of a [" +- "left-flanking delimiter run]\n " - "and either (a) not part of a [right-flanking delimiter run]" - "\n or (b) part of a [right-flanking delimiter run]" - "\n preceded by a [Unicode punctuation character].\n\n" -- "3. A single `*` character [can close emphasis](@)\n" -- " iff it is part of a [right-flanking delimiter run].\n\n" -- "4. 
A single `_` character [can close emphasis] iff\n" -- " it is part of a [right-flanking delimiter run]\n " +- "3. " +- "A single `*` character [can close emphasis](@)" +- "\n iff it is part of a [right-flanking delimiter run]." +- "\n\n4. " +- "A single `_`" +- " character [can close emphasis] iff\n it is part of a [" +- "right-flanking delimiter run]\n " - "and either (a) not part of a [left-flanking delimiter run]" - "\n or (b) part of a [left-flanking delimiter run]" - "\n followed by a [Unicode punctuation character].\n\n" -- "5. A double `**` [can open strong emphasis](@)\n" -- " iff it is part of a [left-flanking delimiter run].\n\n" -- "6. A double `__` [can open strong emphasis] iff\n" -- " it is part of a [left-flanking delimiter run]\n " +- "5. " +- "A double `**` [can open strong emphasis](@)" +- "\n iff it is part of a [left-flanking delimiter run]." +- "\n\n6. " +- "A double `__`" +- " [can open strong emphasis] iff\n it is part of a [" +- "left-flanking delimiter run]\n " - "and either (a) not part of a [right-flanking delimiter run]" - "\n or (b) part of a [right-flanking delimiter run]" - "\n preceded by a [Unicode punctuation character].\n\n" -- "7. A double `**` [can close strong emphasis](@)\n" -- " iff it is part of a [right-flanking delimiter run].\n\n" -- "8. A double `__` [can close strong emphasis] iff\n" -- " it is part of a [right-flanking delimiter run]\n " +- "7. " +- "A double `**` [can close strong emphasis](@)" +- "\n iff it is part of a [right-flanking delimiter run]." +- "\n\n8. " +- "A double `__`" +- " [can close strong emphasis] iff\n it is part of a [" +- "right-flanking delimiter run]\n " - "and either (a) not part of a [left-flanking delimiter run]" - "\n or (b) part of a [left-flanking delimiter run]" -- "\n followed by a [Unicode punctuation character].\n\n" -- "9. 
" -- "Emphasis begins with a delimiter that [can open emphasis] and ends\n" -- " with a delimiter that [can close emphasis], and that uses the same" -- "\n character (`_` or `*`" +- "\n followed by a [Unicode punctuation character]." +- "\n\n9. Emphasis begins with a delimiter that [can open emphasis]" +- " and ends\n with a delimiter that [can close emphasis]" +- ", and that uses the same\n character (`_` or `*`" - ") as the opening delimiter. The\n " - "opening and closing delimiters must belong to separate\n [delimiter runs]" - ". If one of the delimiters can both\n " - "open and close emphasis, then the sum of the lengths of the\n " - "delimiter runs containing the opening and closing delimiters\n " - "must not be a multiple of 3 unless both lengths are\n " -- "multiples of 3.\n\n" -- "10. Strong emphasis begins with a delimiter that\n" -- " [can open strong emphasis] and ends with a delimiter that\n [" +- multiples of 3. +- "\n\n10. Strong emphasis begins with a delimiter that\n [" +- "can open strong emphasis] and ends with a delimiter that\n [" - "can close strong emphasis], and that uses the same character\n (`_`" - " or `*`" - ") as the opening delimiter. The\n " @@ -3652,35 +3663,37 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "delimiters must not be a multiple of 3 unless both lengths\n " - "are multiples of 3.\n\n" - "11. " -- "A literal `*` character cannot occur at the beginning or end of\n" -- " `*`-delimited emphasis or `**`-delimited strong" -- " emphasis, unless it\n is backslash-escaped.\n\n" -- "12. A literal `_` character cannot occur at the beginning or end of\n" -- " `_`-delimited emphasis or `__`-delimited strong emphasis" +- "A literal `*` character cannot occur at the beginning or end of\n " +- "`*`-delimited emphasis or `**`-delimited strong emphasis" +- ", unless it\n is backslash-escaped.\n\n" +- "12. 
" +- "A literal `_` character cannot occur at the beginning or end of\n " +- "`_`-delimited emphasis or `__`-delimited strong emphasis" - ", unless it\n is backslash-escaped.\n\n" - "Where rules 1--12 above are compatible with multiple parsings,\n" -- "the following principles resolve ambiguity:\n\n" -- "13. The number of nestings should be minimized. Thus, for example,\n" -- " an interpretation `...` is always preferred to\n " +- "the following principles resolve ambiguity:" +- "\n\n13. " +- "The number of nestings should be minimized. Thus, for example,\n " +- "an interpretation `...` is always preferred to\n " - "`...`.\n\n" - "14. " -- "An interpretation `...` is always\n" -- " preferred to `...`.\n\n" -- "15. When two potential emphasis or strong emphasis spans overlap,\n" -- " so that the second begins before the first ends and ends after\n " +- "An interpretation `...`" +- " is always\n preferred to `...`" +- "." +- "\n\n15. When two potential emphasis or strong emphasis spans overlap,\n " +- "so that the second begins before the first ends and ends after\n " - "the first ends, the first takes precedence. Thus, for example,\n " - "`*foo _bar* baz_` is parsed as `foo" - " _bar baz_` rather\n than " - "`*foo bar* baz`.\n\n" -- "16. When there are two potential emphasis or strong emphasis spans\n" -- " with the same closing delimiter, the shorter one (the one that\n " +- "16. When there are two potential emphasis or strong emphasis spans\n " +- "with the same closing delimiter, the shorter one (the one that\n " - "opens later) takes precedence. Thus, for example,\n " - "`**foo **bar baz**` is parsed as `**foo bar baz
    `\n rather than " - "`foo **bar baz`.\n\n" -- "17. " -- "Inline code spans, links, images, and HTML tags group more tightly\n" -- " than emphasis. So, when there is a choice between an interpretation" +- "17. Inline code spans, links, images, and HTML tags group more tightly" +- "\n than emphasis. So, when there is a choice between an interpretation" - "\n that contains one of these elements and one that does not, the" - "\n former always wins. Thus, for example, " - "`*[foo*](bar)` is\n parsed as " @@ -3692,14 +3705,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " example\n" - "*foo bar*\n.\n

    foo bar

    \n" - "````````````````````````````````\n" -- "\n\nThis is not emphasis, because the opening `*` is followed by\n" -- "whitespace, and hence not part of a [left-flanking delimiter run]:\n\n" +- "\n\nThis is not emphasis, because the opening `*`" +- " is followed by\nwhitespace, and hence not part of a [" +- "left-flanking delimiter run]:\n\n" - "````````````````````````````````" - " example\na * foo bar*\n.\n

    a * foo bar*

    \n" - "````````````````````````````````\n" -- "\n\nThis is not emphasis, because the opening `*` is preceded\n" -- "by an alphanumeric and followed by punctuation, and hence\nnot part of a [" -- "left-flanking delimiter run]:\n\n" +- "\n\nThis is not emphasis, because the opening `*`" +- " is preceded\nby an alphanumeric and followed by punctuation, and hence\n" +- "not part of a [left-flanking delimiter run]:\n\n" - "````````````````````````````````" - " example\n" - "a*\"foo\"*\n.\n" @@ -3738,13 +3752,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " example\n" - "_foo bar_\n.\n

    foo bar

    \n" - "````````````````````````````````\n" -- "\n\nThis is not emphasis, because the opening `_` is followed by\n" -- "whitespace:\n\n" +- "\n\nThis is not emphasis, because the opening `_`" +- " is followed by\nwhitespace:\n\n" - "````````````````````````````````" - " example\n_ foo bar_\n.\n

    _ foo bar_

    \n" - "````````````````````````````````\n" -- "\n\nThis is not emphasis, because the opening `_` is preceded\n" -- "by an alphanumeric and followed by punctuation:\n\n" +- "\n\nThis is not emphasis, because the opening `_`" +- " is preceded\nby an alphanumeric and followed by punctuation:\n\n" - "````````````````````````````````" - " example\n" - "a_\"foo\"_\n.\n" @@ -3767,7 +3781,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "пристаням_стремятся_\n.\n" - "

    пристаням_стремятся_

    \n" - "````````````````````````````````\n" -- "\n\nHere `_` does not generate emphasis, because the first delimiter run\n" +- "\n\nHere `_`" +- " does not generate emphasis, because the first delimiter run\n" - "is right-flanking and the second left-flanking:\n\n" - "````````````````````````````````" - " example\n" @@ -3788,8 +3803,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n_foo*\n.\n

    _foo*

    \n" - "````````````````````````````````\n" -- "\n\nThis is not emphasis, because the closing `*` is preceded by\n" -- "whitespace:\n\n" +- "\n\nThis is not emphasis, because the closing `*`" +- " is preceded by\nwhitespace:\n\n" - "````````````````````````````````" - " example\n*foo bar *\n.\n

    *foo bar *

    \n" - "````````````````````````````````\n" @@ -3798,8 +3813,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n*foo bar\n*\n.\n

    *foo bar\n*

    \n" - "````````````````````````````````\n" -- "\n\nThis is not emphasis, because the second `*` is\n" -- "preceded by punctuation and followed by an alphanumeric\n" +- "\n\nThis is not emphasis, because the second `*`" +- " is\npreceded by punctuation and followed by an alphanumeric\n" - "(hence it is not part of a [right-flanking delimiter run]:\n\n" - "````````````````````````````````" - " example\n*(*foo)\n.\n

    *(*foo)

    \n" @@ -3819,13 +3834,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foobar

    \n" - "````````````````````````````````\n" - "\n\n\nRule 4:\n" -- "\nThis is not emphasis, because the closing `_` is preceded by\n" -- "whitespace:\n\n" +- "\nThis is not emphasis, because the closing `_`" +- " is preceded by\nwhitespace:\n\n" - "````````````````````````````````" - " example\n_foo bar _\n.\n

    _foo bar _

    \n" - "````````````````````````````````\n" -- "\n\nThis is not emphasis, because the second `_` is\n" -- "preceded by punctuation and followed by an alphanumeric:\n\n" +- "\n\nThis is not emphasis, because the second `_`" +- " is\npreceded by punctuation and followed by an alphanumeric:\n\n" - "````````````````````````````````" - " example\n_(_foo)\n.\n

    _(_foo)

    \n" - "````````````````````````````````\n" @@ -3871,9 +3886,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n** foo bar**\n.\n

    ** foo bar**

    \n" - "````````````````````````````````\n" -- "\n\nThis is not strong emphasis, because the opening `**` is preceded\n" -- "by an alphanumeric and followed by punctuation, and hence\nnot part of a [" -- "left-flanking delimiter run]:\n\n" +- "\n\nThis is not strong emphasis, because the opening `**`" +- " is preceded\nby an alphanumeric and followed by punctuation, and hence\n" +- "not part of a [left-flanking delimiter run]:\n\n" - "````````````````````````````````" - " example\n" - "a**\"foo\"**\n.\n" @@ -3900,8 +3915,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n__\nfoo bar__\n.\n

    __\nfoo bar__

    \n" - "````````````````````````````````\n" -- "\n\nThis is not strong emphasis, because the opening `__` is preceded\n" -- "by an alphanumeric and followed by punctuation:\n\n" +- "\n\nThis is not strong emphasis, because the opening `__`" +- " is preceded\nby an alphanumeric and followed by punctuation:\n\n" - "````````````````````````````````" - " example\n" - "a__\"foo\"__\n.\n" @@ -3945,11 +3960,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n**foo bar **\n.\n

    **foo bar **

    \n" - "````````````````````````````````\n" -- "\n\n" -- "(Nor can it be interpreted as an emphasized `*foo bar *`, because" -- " of\nRule 11.)\n\n" -- "This is not strong emphasis, because the second `**` is\n" -- "preceded by punctuation and followed by an alphanumeric:\n\n" +- "\n\n(Nor can it be interpreted as an emphasized `*foo bar *`" +- ", because of\nRule 11.)" +- "\n\nThis is not strong emphasis, because the second `**`" +- " is\npreceded by punctuation and followed by an alphanumeric:\n\n" - "````````````````````````````````" - " example\n**(**foo)\n.\n

    **(**foo)

    \n" - "````````````````````````````````\n" @@ -3992,8 +4006,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " example\n" - "__foo bar __\n.\n

    __foo bar __

    \n" - "````````````````````````````````\n" -- "\n\nThis is not strong emphasis, because the second `__` is\n" -- "preceded by punctuation and followed by an alphanumeric:\n\n" +- "\n\nThis is not strong emphasis, because the second `__`" +- " is\npreceded by punctuation and followed by an alphanumeric:\n\n" - "````````````````````````````````" - " example\n__(__foo)\n.\n

    __(__foo)

    \n" - "````````````````````````````````\n" @@ -4102,9 +4116,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo**bar*\n.\n" - "

    foo**bar

    \n" - "````````````````````````````````\n" -- "\n\nThe same condition ensures that the following\n" -- "cases are all strong emphasis nested inside\nemphasis, even when the interior whitespace is" -- "\nomitted:\n\n\n" +- "\n\nThe same condition ensures that the following\ncases are all strong emphasis nested inside" +- "\nemphasis, even when the interior whitespace is\nomitted:\n\n\n" - "````````````````````````````````" - " example\n" - "***foo** bar*\n.\n" @@ -4122,8 +4135,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo**bar***\n.\n" - "

    foobar

    \n" - "````````````````````````````````\n" -- "\n\nWhen the lengths of the interior closing and opening\n" -- delimiter runs are *both* +- "\n\nWhen the lengths of the interior closing and opening\ndelimiter runs are " +- "*both*" - " multiples of 3, though,\nthey can match to create emphasis:\n\n" - "````````````````````````````````" - " example\n" @@ -4530,15 +4543,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "]\n(the URI that is the link destination), and optionally a [link title].\n" - "There are two basic kinds of links in Markdown. In [inline links]" - " the\ndestination and title are given immediately after the link text. In\n" -- "[reference links] the destination and title are defined elsewhere in\nthe document.\n\n" -- "A [link text](@) consists of a sequence of zero or more\n" -- "inline elements enclosed by square brackets (`[` and `]`" -- "). The\nfollowing rules apply:\n\n" -- "- Links may not contain other links, at any level of nesting. If\n" -- " multiple otherwise valid link definitions appear nested inside each\n " -- "other, the inner-most definition is used.\n\n" -- "- Brackets are allowed in the [link text] only if (a)" -- " they\n " +- "[reference links] the destination and title are defined elsewhere in\nthe document." +- "\n\nA [link text](@)" +- " consists of a sequence of zero or more\ninline elements enclosed by square brackets (" +- "`[` and `]`). The\nfollowing rules apply:\n\n" +- "- Links may not contain other links, at any level of nesting. If" +- "\n multiple otherwise valid link definitions appear nested inside each\n " +- "other, the inner-most definition is used." +- "\n\n- Brackets are allowed in the [link text]" +- " only if (a) they\n " - are backslash-escaped or (b) they appear as a matched pair of - " brackets,\n with an open bracket `[`" - ", a sequence of zero or more inlines, and\n " @@ -4548,30 +4561,36 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "than the brackets in link text. 
Thus, for example,\n " - "`` [foo`]` `` could not be a link text, since the second" - " `]`\n is part of a code span.\n\n" -- "- The brackets in link text bind more tightly than markers for\n" -- " [emphasis and strong emphasis]. Thus, for example, " +- "- The brackets in link text bind more tightly than markers for\n [" +- "emphasis and strong emphasis]. Thus, for example, " - "`*[foo*](url)` is a link.\n\n" - "A [link destination](@) consists of either\n\n" -- "- a sequence of zero or more characters between an opening `<` and a\n" -- " closing `>` that contains no line endings or unescaped\n `<`" -- " or `>` characters, or\n\n" -- "- a nonempty sequence of characters that does not start with `<`,\n" -- " does not include [ASCII control characters][ASCII control character]\n or [" -- "space] character, and includes parentheses only if (a) they are\n " +- "- " +- "a sequence of zero or more characters between an opening `<`" +- " and a\n closing `>`" +- " that contains no line endings or unescaped\n `<` or `>`" +- " characters, or" +- "\n\n- " +- "a nonempty sequence of characters that does not start with `<`" +- ",\n does not include [ASCII control characters][ASCII control character]\n " +- "or [space] character, and includes parentheses only if (a) they are" +- "\n " - backslash-escaped or (b) they are part of a balanced pair of - "\n unescaped parentheses.\n " - "(Implementations may impose limits on parentheses nesting to\n " - "avoid performance issues, but at least three levels of nesting\n " - "should be supported.)\n\nA [link title](@) consists of either\n\n" -- "- a sequence of zero or more characters between straight double-quote\n" -- " characters (`\"`), including a `\"`" -- " character only if it is\n backslash-escaped, or\n\n" -- "- a sequence of zero or more characters between straight single-quote\n" -- " characters (`'`), including a `'`" -- " character only if it is\n backslash-escaped, or\n\n" -- "- a sequence of zero or more characters between 
matching parentheses\n" -- " (`(...)`), including a `(` or `)` character only if it" -- " is\n backslash-escaped.\n\n" +- "- " +- "a sequence of zero or more characters between straight double-quote\n characters (`\"`" +- "), including a `\"`" +- " character only if it is\n backslash-escaped, or" +- "\n\n- a sequence of zero or more characters between straight single-quote\n " +- "characters (`'`), including a `'`" +- " character only if it is\n backslash-escaped, or" +- "\n\n- " +- "a sequence of zero or more characters between matching parentheses\n (`(...)`" +- "), including a `(` or `)`" +- " character only if it is\n backslash-escaped.\n\n" - "Although [link titles] may span multiple lines, they may not contain\n" - "a [blank line].\n\n" - "An [inline link](@) consists of a [link text] followed immediately" @@ -4582,9 +4601,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "These four components may be separated by spaces, tabs, and up to one line" - "\nending.\nIf both [link destination] and [link title]" - " are present, they *must*" -- " be\nseparated by spaces, tabs, and up to one line ending.\n\n" -- "The link's text consists of the inlines contained\n" -- "in the [link text] (excluding the enclosing square brackets).\nThe link'" +- " be\nseparated by spaces, tabs, and up to one line ending." +- "\n\nThe link's text consists of the inlines contained\nin the [" +- "link text] (excluding the enclosing square brackets).\nThe link'" - "s URI consists of the link destination, excluding enclosing\n`<...>`" - " if present, with backslash-escapes in effect as described\n" - "above. The link's title consists of the link title, excluding its\n" @@ -4651,8 +4670,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[link]()\n.\n

    [link]()

    \n" - "````````````````````````````````\n" -- "\nThe destination can contain `)` if it is enclosed\n" -- "in pointy brackets:\n\n" +- "\nThe destination can contain `)`" +- " if it is enclosed\nin pointy brackets:\n\n" - "````````````````````````````````" - " example\n" - "[a]()\n.\n" @@ -4799,9 +4818,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    link

    \n" - "````````````````````````````````\n" -- "\n\n" -- "(Note: `Markdown.pl` did allow double quotes inside a double-quoted" -- "\ntitle, and its test suite included a test demonstrating this.\n" +- "\n\n(Note: `Markdown.pl`" +- " did allow double quotes inside a double-quoted\n" +- "title, and its test suite included a test demonstrating this.\n" - "But it is hard to see a good rationale for the extra complexity this\n" - "brings, since there are already many ways---backslash escaping,\n" - "entity and numeric character references, or using a different\n" @@ -4908,8 +4927,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[foo *bar](baz*)\n.\n" - "

    foo *bar

    \n" - "````````````````````````````````\n" -- "\n\nNote that brackets that *aren't* part of links do not take\n" -- "precedence:\n\n" +- "\n\nNote that brackets that *aren't*" +- " part of links do not take\nprecedence:\n\n" - "````````````````````````````````" - " example\n" - "*foo [bar* baz]\n.\n" @@ -4939,18 +4958,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\nThere are three kinds of [reference link](@)s:\n" - "[full](#full-reference-link), [collapsed](#collapsed-reference-link)" - ",\nand [shortcut](#shortcut-reference-link).\n\n" -- "A [full reference link](@)\n" -- "consists of a [link text] immediately followed by a [link label]\n" -- "that [matches] a [link reference definition] elsewhere in the document.\n\n" -- "A [link label](@) begins with a left bracket (`[`)" -- " and ends\nwith the first right bracket (`]`" +- "A [full reference link](@)" +- "\nconsists of a [link text] immediately followed by a [link label" +- "]\nthat [matches] a [link reference definition] elsewhere in the document." +- "\n\nA [link label](@) begins with a left bracket (`[`" +- ") and ends\nwith the first right bracket (`]`" - ") that is not backslash-escaped.\n" - "Between these brackets there must be at least one character that is not a space,\n" - "tab, or line ending.\nUnescaped square bracket characters are not allowed inside the" - "\nopening and closing square brackets of [link labels]. A link\n" -- "label can have at most 999 characters inside the square\nbrackets.\n\n" -- "One label [matches](@)\n" -- "another just in case their normalized forms are equal. To normalize a\n" +- "label can have at most 999 characters inside the square\nbrackets." +- "\n\nOne label [matches](@)" +- "\nanother just in case their normalized forms are equal. 
To normalize a\n" - "label, strip off the opening and closing brackets,\nperform the " - "*Unicode case fold*" - ", strip leading and trailing\n" @@ -4960,8 +4979,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "matching reference link definitions, the one that comes first in the\n" - "document is used. " - "(It is desirable in such cases to emit a warning.)\n\n" -- "The link's URI and title are provided by the matching [link\n" -- "reference definition].\n\nHere is a simple example:\n" +- "The link's URI and title are provided by the matching [link\nreference definition" +- "].\n\nHere is a simple example:\n" - "\n" - "````````````````````````````````" - " example\n" @@ -4969,8 +4988,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo\n" - "````````````````````````````````\n" -- "\n\nThe rules for the [link text] are the same as with\n" -- "[inline links]. Thus:\n\n" +- "\n\nThe rules for the [link text] are the same as with\n[" +- "inline links]. Thus:\n\n" - "The link text may contain balanced brackets, but not unbalanced ones,\n" - "unless they are escaped:\n\n" - "````````````````````````````````" @@ -5079,9 +5098,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[Foo\n bar]: /url\n\n[Baz][Foo bar]\n.\n" - "

    Baz

    \n" - "````````````````````````````````\n" -- "\n\n" -- "No spaces, tabs, or line endings are allowed between the [link text]" -- " and the\n[link label]:\n\n" +- "\n\nNo spaces, tabs, or line endings are allowed between the [link text" +- "] and the\n[link label]:\n\n" - "````````````````````````````````" - " example\n" - "[foo] [bar]\n\n[bar]: /url \"title\"\n.\n" @@ -5105,9 +5123,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "link text and the link label, then in the following we will have\n" - "a single reference link, not two shortcut reference links, as\nintended:\n\n" - "``` markdown\n[foo]\n[bar]\n\n[foo]: /url1\n" -- "[bar]: /url2\n```\n\n" -- "(Note that [shortcut reference links] were introduced by Gruber\n" -- "himself in a beta version of `Markdown.pl`" +- "[bar]: /url2\n```\n\n(Note that [shortcut reference links]" +- " were introduced by Gruber\nhimself in a beta version of " +- "`Markdown.pl`" - ", but never included\nin the official syntax description. Without shortcut reference\n" - "links, it is harmless to allow space between the link text and\n" - "link label; but once shortcut references are introduced, it is\n" @@ -5164,9 +5182,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[bar\\\\]: /uri\n\n[bar\\\\]\n.\n" - "

    bar\\

    \n" - "````````````````````````````````\n" -- "\n\n" -- "A [link label] must contain at least one character that is not a space" -- ", tab, or\nline ending:\n\n" +- "\n\nA [link label]" +- " must contain at least one character that is not a space, tab, or\n" +- "line ending:\n\n" - "````````````````````````````````" - " example\n" - "[]\n\n[]: /uri\n.\n

    []

    \n" @@ -5178,8 +5196,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[\n ]\n\n[\n ]: /uri\n.\n

    [\n]

    \n

    [\n" - "]: /uri

    \n" - "````````````````````````````````\n" -- "\n\nA [collapsed reference link](@)\n" -- "consists of a [link label] that [matches] a\n[" +- "\n\nA [collapsed reference link](@)" +- "\nconsists of a [link label] that [matches] a\n[" - "link reference definition] elsewhere in the\ndocument, followed by the string " - "`[]`" - ".\nThe contents of the link label are parsed as inlines,\n" @@ -5209,17 +5227,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    Foo\n" - "````````````````````````````````\n" -- "\n\n\n" -- "As with full reference links, spaces, tabs, or line endings are not\n" -- "allowed between the two sets of brackets:\n\n" +- "\n\n\nAs with full reference links, spaces, tabs, or line endings are not" +- "\nallowed between the two sets of brackets:\n\n" - "````````````````````````````````" - " example\n" - "[foo] \n[]\n\n[foo]: /url \"title\"\n.\n" - "

    foo\n" - "[]

    \n" - "````````````````````````````````\n" -- "\n\nA [shortcut reference link](@)\n" -- "consists of a [link label] that [matches] a\n[" +- "\n\nA [shortcut reference link](@)" +- "\nconsists of a [link label] that [matches] a\n[" - "link reference definition] elsewhere in the\ndocument and is not followed by " - "`[]`" - " or a link label.\n" @@ -5269,17 +5286,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[foo] bar\n\n[foo]: /url\n.\n" - "

    foo bar

    \n" - "````````````````````````````````\n" -- "\n\n" -- "If you just want bracketed text, you can backslash-escape the\n" -- "opening bracket to avoid links:\n\n" +- "\n\nIf you just want bracketed text, you can backslash-escape the" +- "\nopening bracket to avoid links:\n\n" - "````````````````````````````````" - " example\n" - "\\[foo]\n\n[foo]: /url \"title\"\n.\n" - "

    [foo]

    \n" - "````````````````````````````````\n" -- "\n\n" -- "Note that this is a link, because a link label ends with the first\n" -- "following closing bracket:\n\n" +- "\n\nNote that this is a link, because a link label ends with the first" +- "\nfollowing closing bracket:\n\n" - "````````````````````````````````" - " example\n" - "[foo*]: /url\n\n*[foo*]\n.\n" @@ -5313,17 +5328,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo(not a link)\n" - "````````````````````````````````\n" -- "\n" -- "In the following case `[bar][baz]` is parsed as a reference,\n" -- "`[foo]` as normal text:\n\n" +- "\nIn the following case `[bar][baz]`" +- " is parsed as a reference,\n`[foo]` as normal text:\n\n" - "````````````````````````````````" - " example\n" - "[foo][bar][baz]\n\n[baz]: /url\n.\n" - "

    [foo]bar

    \n" - "````````````````````````````````\n" -- "\n\n" -- "Here, though, `[foo][bar]` is parsed as a reference," -- " since\n`[bar]` is defined:\n\n" +- "\n\nHere, though, `[foo][bar]`" +- " is parsed as a reference, since\n`[bar]` is defined:\n\n" - "````````````````````````````````" - " example\n" - "[foo][bar][baz]\n\n[baz]: /url1\n" @@ -5331,8 +5344,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foobaz

    \n" - "````````````````````````````````\n" -- "\n\n" -- "Here `[foo]` is not parsed as a shortcut reference, because it\n" +- "\n\nHere `[foo]`" +- " is not parsed as a shortcut reference, because it\n" - "is followed by a link label (even though `[bar]` is not defined" - "):\n\n" - "````````````````````````````````" @@ -5343,8 +5356,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n\n" - "## Images\n\n" -- "Syntax for images is like the syntax for links, with one\ndifference. " -- "Instead of [link text], we have an\n[image description](@)" +- "Syntax for images is like the syntax for links, with one\n" +- "difference. Instead of [link text], we have an\n" +- "[image description](@)" - ". The rules for this are the\nsame as for [link text]" - ", except that (a) an\nimage description starts with `![`" - " rather than `[`" @@ -5504,16 +5518,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    \"Foo\"\n" - "````````````````````````````````\n" -- "\n\nIf you just want a literal `!" -- "` followed by bracketed text, you can\n" -- "backslash-escape the opening `[`:\n\n" +- "\n\nIf you just want a literal `!`" +- " followed by bracketed text, you can\nbackslash-escape the opening " +- "`[`:\n\n" - "````````````````````````````````" - " example\n" - "!\\[foo]\n\n[foo]: /url \"title\"\n.\n

    ![" - "foo]

    \n" - "````````````````````````````````\n" -- "\n\nIf you want a link after a literal `!" -- "`, backslash-escape the\n`!`:\n\n" +- "\n\nIf you want a link after a literal `!`" +- ", backslash-escape the\n`!`:\n\n" - "````````````````````````````````" - " example\n" - "\\![foo]\n\n[foo]: /url \"title\"\n.\n

    !" @@ -5528,15 +5542,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A [URI autolink](@) consists of `<`, followed by an" - "\n[absolute URI] followed by `>`" - ". It is parsed as\n" -- "a link to the URI, with the URI as the link's label.\n\n" -- "An [absolute URI](@),\n" -- "for these purposes, consists of a [scheme] followed by a colon (`:`" +- "a link to the URI, with the URI as the link's label." +- "\n\nAn [absolute URI](@)" +- ",\nfor these purposes, consists of a [scheme] followed by a colon (" +- "`:`" - ")\nfollowed by zero or more characters other than [ASCII control\ncharacters][" - "ASCII control character], [space], `<`, and `>`" - ".\nIf the URI includes these characters, they must be percent-encoded\n" - "(e.g. `%20` for a space).\n\n" -- "For purposes of this spec, a [scheme](@) is any sequence\n" -- "of 2--32 characters beginning with an ASCII letter and followed\n" +- "For purposes of this spec, a [scheme](@)" +- " is any sequence\nof 2--" +- "32 characters beginning with an ASCII letter and followed\n" - "by any combination of ASCII letters, digits, or the symbols plus\n(\"+\"" - "), period (\".\"), or hyphen (\"-\").\n\n" - "Here are some valid autolinks:\n" @@ -5617,16 +5633,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    https://example.com/\\[\\

    \n" - "````````````````````````````````\n" -- "\n\nAn [email autolink](@)\n" -- "consists of `<`, followed by an [email address],\nfollowed by " -- "`>`" +- "\n\nAn [email autolink](@)\nconsists of `<`" +- ", followed by an [email address],\nfollowed by `>`" - ". The link's label is the email address,\nand the URL is " - "`mailto:` followed by the email address.\n\n" -- "An [email address](@),\nfor these purposes, is anything that matches\n" -- "the " -- "[non-normative regex from the HTML5\n" -- "spec](https://html.spec.whatwg.org/multipage/forms.html#e" -- "-mail-state-(type=email)):\n\n " +- "An [email address](@)" +- ",\nfor these purposes, is anything that matches\nthe " +- "[non-normative regex from the HTML5\nspec" +- "](https://html.spec.whatwg.org/multipage/forms.html#e-mail" +- "-state-(type=email)):\n\n " - "/^[a-zA-Z0-9.!#$%&'*+/=?" - "^_`{|}~-]+@[a-zA-Z0-9](?:" - "[a-zA-Z0-9-]{0,61}[a-zA-Z0" @@ -5696,55 +5711,55 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Tag and attribute names are not limited to current HTML tags,\n" - "so custom tags (and even, say, DocBook tags) may be used" - ".\n\nHere is the grammar for tags:\n" -- "\nA [tag name](@) consists of an ASCII letter\n" +- "\nA [tag name](@)" +- " consists of an ASCII letter\n" - "followed by zero or more ASCII letters, digits, or\n" - "hyphens (`-`).\n\n" - "An [attribute](@) consists of spaces, tabs, and up to one" - " line ending,\nan [attribute name], and an optional\n[attribute value specification" - "].\n\n" -- "An [attribute name](@)\n" -- "consists of an ASCII letter, `_`, or `:`, followed by zero" -- " or more ASCII\nletters, digits, `_`, `.`, `:`" -- ", or `-`" +- "An [attribute name](@)\nconsists of an ASCII letter, `_`" +- ", or `:`" +- ", followed by zero or more ASCII\nletters, digits, `_`, `.`" +- ", `:`, or `-`" - ". (Note: This is the XML\n" - "specification restricted to ASCII. 
HTML5 is laxer.)\n\n" -- "An [attribute value specification](@)\n" -- "consists of optional spaces, tabs, and up to one line ending,\n" +- "An [attribute value specification](@)" +- "\nconsists of optional spaces, tabs, and up to one line ending,\n" - "a `=` character, optional spaces, tabs, and up to one line ending" - ",\nand an [attribute value].\n\n" -- "An [attribute value](@)\n" -- "consists of an [unquoted attribute value],\na [" -- "single-quoted attribute value], or a [double-quoted attribute value].\n\n" -- "An [unquoted attribute value](@)\n" -- "is a nonempty string of characters not\n" +- "An [attribute value](@)" +- "\nconsists of an [unquoted attribute value],\na [" +- "single-quoted attribute value], or a [double-quoted attribute value]." +- "\n\nAn [unquoted attribute value](@)" +- "\nis a nonempty string of characters not\n" - "including spaces, tabs, line endings, `\"`, `'`, `=`, `<" - "`, `>`, or `` ` ``.\n\n" -- "A [single-quoted attribute value](@)\n" -- "consists of `'`, zero or more\ncharacters not including `'`" -- ", and a final `'`.\n\n" -- "A [double-quoted attribute value](@)\n" -- "consists of `\"`, zero or more\ncharacters not including `\"`" -- ", and a final `\"`.\n\n" +- "A [single-quoted attribute value](@)\nconsists of `'`" +- ", zero or more\ncharacters not including `'`, and a final `'`.\n\n" +- "A [double-quoted attribute value](@)\nconsists of `\"`" +- ", zero or more\ncharacters not including `\"`, and a final `\"`.\n\n" - "An [open tag](@) consists of a `<` character, a [" - "tag name],\nzero or more [attributes]" - ", optional spaces, tabs, and up to one line ending,\nan optional " - "`/` character, and a `>` character.\n\n" -- "A [closing tag](@) consists of the string ``.\n\n" +- "A [closing tag](@) consists of the string ``.\n\n" - "An [HTML comment](@) consists of ``, `" - "`, or ``, and `-->` (see the\n" - "[HTML spec](https://html.spec.whatwg.org/multipage/" - "parsing.html#markup-declaration-open-state)).\n\n" -- "A 
[processing instruction](@)\nconsists of the string ``" +- "A [processing instruction](@)\nconsists of the string ``" - ", and the string\n`?>`.\n\n" -- "A [declaration](@) consists of the string ``" +- "A [declaration](@) consists of the string ``" - ", and the character `>`.\n\n" -- "A [CDATA section](@) consists of\nthe string ``" +- "A [CDATA section](@) consists of\nthe string ``" - ", and the string `]]>`.\n\n" - "An [HTML tag](@) consists of an [open tag], a [" - "closing tag],\nan [HTML comment], a [processing instruction], a [declaration" @@ -5901,8 +5916,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\nfoo \nbaz\n.\n

    foo
    \nbaz

    \n" - "````````````````````````````````\n" -- "\n\nFor a more visible alternative, a backslash before the\n" -- "[line ending] may be used instead of two or more spaces:\n\n" +- "\n\nFor a more visible alternative, a backslash before the\n[line ending]" +- " may be used instead of two or more spaces:\n\n" - "````````````````````````````````" - " example\nfoo\\\nbaz\n.\n

    foo
    \nbaz

    \n" - "````````````````````````````````\n" @@ -5997,9 +6012,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\nfoo \n baz\n.\n

    foo\nbaz

    \n" - "````````````````````````````````\n" -- "\n\n" -- "A conforming parser may render a soft line break in HTML either as a\n" -- "line ending or as a space.\n\n" +- "\n\nA conforming parser may render a soft line break in HTML either as a" +- "\nline ending or as a space.\n\n" - "A renderer may also provide an option to render soft line breaks\n" - "as hard line breaks.\n\n" - "## Textual content\n\n" @@ -6033,8 +6047,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "2. In the second phase, the raw text contents of paragraphs and headings\n" - "are parsed into sequences of Markdown inline elements (strings,\n" - "code spans, links, emphasis, and so on), using the map of link" -- "\nreferences constructed in phase 1.\n\n" -- "At each point in processing, the document is represented as a tree of\n" +- "\nreferences constructed in phase 1." +- "\n\nAt each point in processing, the document is represented as a tree of\n" - "**blocks**. The root of the tree is a `document`" - " block. The `document`\nmay have any number of other blocks as " - "**children**" @@ -6053,16 +6067,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " -> list_item\n -> paragraph\n \"aliquando id\"\n" - "```\n\n" - "## Phase 1: block structure\n\n" -- "Each line that is processed has an effect on this tree. " -- "The line is\n" -- "analyzed and, depending on its contents, the document may be altered\n" +- Each line that is processed has an effect on this tree. The line is +- "\nanalyzed and, depending on its contents, the document may be altered\n" - "in one or more of the following ways:\n\n" -- "1. One or more open blocks may be closed.\n2. " -- "One or more new blocks may be created as children of the\n " -- "last open block.\n" -- "3. " -- "Text may be added to the last (deepest) open block remaining\n " -- "on the tree.\n\n" +- 1. One or more open blocks may be closed. +- "\n2. 
One or more new blocks may be created as children of the" +- "\n last open block.\n" +- 3. Text may be added to the last (deepest) open block remaining +- "\n on the tree.\n\n" - "Once a line has been incorporated into the tree in this way,\n" - "it can be discarded, so input can be read in a stream.\n\n" - "For each line, we follow this procedure:\n\n" @@ -6074,13 +6086,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " character. A paragraph requires a non-blank line.\n" - "In this phase we may match all or just some of the open\n" - "blocks. But we cannot close unmatched blocks yet, because we may have a" -- "\n[lazy continuation line].\n\n" -- "2. Next, after consuming the continuation markers for existing\n" +- "\n[lazy continuation line]." +- "\n\n2. Next, after consuming the continuation markers for existing\n" - "blocks, we look for new block starts (e.g. `>` for a" - " block quote).\nIf we encounter a new block start, we close any blocks unmatched" - "\nin step 1 before creating the new block as a child of the last" -- "\nmatched container block.\n\n" -- "3. " +- "\nmatched container block." +- "\n\n3. " - "Finally, we look at the remainder of the line (after block\n" - "markers like `>`" - ", list markers, and indentation have been consumed).\n" @@ -6099,18 +6111,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "At the outset, our document model is just\n" - "\n``` tree\n-> document\n```\n\nThe first line of our text,\n" - "\n``` markdown\n> Lorem ipsum dolor\n```" -- "\n\n" -- "causes a `block_quote` block to be created as a child of our" -- "\nopen `document` block, and a `paragraph`" -- " block as a child of\nthe `block_quote`" +- "\n\ncauses a `block_quote`" +- " block to be created as a child of our\nopen `document`" +- " block, and a `paragraph` block as a child of\nthe " +- "`block_quote`" - ". 
Then the text is added to the last open\nblock, the " - "`paragraph`:\n\n" - "``` tree\n-> document\n -> block_quote\n -> paragraph\n" - " \"Lorem ipsum dolor\"\n```\n\nThe next line,\n" - "\n``` markdown\nsit amet.\n```" -- "\n\n" -- "is a \"lazy continuation\" of the open `paragraph`, so it gets added" -- "\nto the paragraph's text:\n\n" +- "\n\nis a \"lazy continuation\" of the open `paragraph`" +- ", so it gets added\nto the paragraph's text:\n\n" - "``` tree\n-> document\n -> block_quote\n -> paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n```\n\nThe third line,\n" - "\n``` markdown\n> - Qui *quodsi iracundia*\n" @@ -6126,9 +6137,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " -> list_item\n -> paragraph\n" - " \"Qui *quodsi iracundia*\"\n```\n\n" - "The fourth line,\n\n``` markdown\n> - aliquando id\n```" -- "\n\n" -- "causes the `list_item` (and its child the `paragraph`) to" -- " be closed,\nand a new `list_item`" +- "\n\ncauses the `list_item` (and its child the `paragraph`" +- ") to be closed,\nand a new `list_item`" - " opened up as child of the `list`. A `paragraph`" - "\nis added as a child of the new `list_item`" - ", to contain the text.\nWe thus obtain the final tree:\n\n" @@ -6164,17 +6174,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "we insert a text node with these symbols as its literal content, and we\n" - "add a pointer to this text node to the [delimiter stack](@).\n\n" - "The [delimiter stack] is a doubly linked list. 
Each\n" -- "element contains a pointer to a text node, plus information about\n\n" -- "- the type of delimiter (`[`, `![`, `*`, `_`)\n" -- "- the number of delimiters,\n" +- "element contains a pointer to a text node, plus information about" +- "\n\n- the type of delimiter (`[`, `![" +- "`, `*`, `_`)\n- the number of delimiters,\n" - "- whether the delimiter is \"active\" (all are active to start), and" -- "\n- whether the delimiter is a potential opener, a potential closer,\n" -- " or both (which depends on what sort of characters precede\n " +- "\n- whether the delimiter is a potential opener, a potential closer,\n " +- "or both (which depends on what sort of characters precede\n " - "and follow the delimiters).\n\n" - "When we hit a `]` character, we call the *look for link" -- " or image*\nprocedure (see below).\n\n" -- "When we hit the end of the input, we call the *process emphasis*\n" -- "procedure (see below), with `stack_bottom` = NULL.\n\n" +- " or image*\nprocedure (see below)." 
+- "\n\nWhen we hit the end of the input, we call the " +- "*process emphasis*\nprocedure (see below), with `stack_bottom`" +- " = NULL.\n\n" - "#### *look for link or image*\n\n" - "Starting at the top of the delimiter stack, we look backwards\n" - "through the stack for an opening `[` or `![` delimiter.\n\n" @@ -6185,58 +6196,69 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`]`.\n\n" - "- If we find one and it's active, then we parse ahead to see" - " if\n we have an inline link/image, reference link/image, collapsed reference" -- "\n link/image, or shortcut reference link/image.\n\n " -- "+ If we don't, then we remove the opening delimiter from the\n" -- " delimiter stack and return a literal text node `]`.\n\n " -- "+ If we do, then\n\n" -- " * We return a link or image node whose children are the inlines\n" -- " after the text node pointed to by the opening delimiter.\n\n " -- "* We run *process emphasis* on these inlines, with the `[`" -- " opener\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n" +- "\n link/image, or shortcut reference link/image." +- "\n\n + If we don't, then we remove the opening delimiter from the" +- "\n delimiter stack and return a literal text node `]`.\n\n " +- "+ If we do, then\n" +- "\n * We return a link or image node whose children are the inlines" +- "\n after the text node pointed to by the opening delimiter." +- "\n\n * " +- "We run *process emphasis* on these inlines, with the `[` opener" +- "\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n" - " * If we have a link (and not an image), we also set" - " all\n `[` delimiters before the opening delimiter to *inactive*" - ". (This\n will prevent us from getting links within links.)\n\n" - "#### *process emphasis*\n\n" -- "Parameter `stack_bottom` sets a lower bound to how far we\n" -- "descend in the [delimiter stack]. If it is NULL, we can" -- "\ngo all the way to the bottom. 
Otherwise, we stop before\n" +- "Parameter `stack_bottom`" +- " sets a lower bound to how far we\ndescend in the [delimiter stack" +- "]. If it is NULL, we can\n" +- "go all the way to the bottom. Otherwise, we stop before\n" - "visiting `stack_bottom`.\n\n" -- "Let `current_position` point to the element on the [delimiter stack]\n" -- "just above `stack_bottom` (or the first element if `stack_bottom`" -- "\nis NULL).\n\n" -- "We keep track of the `openers_bottom` for each delimiter\n" -- "type (`*`, `_`), indexed to the length of the closing delimiter run" -- "\n(modulo 3) and to whether the closing delimiter can also be an" -- "\nopener. Initialize this to `stack_bottom`.\n\n" +- "Let `current_position`" +- " point to the element on the [delimiter stack]\njust above `stack_bottom`" +- " (or the first element if `stack_bottom`\nis NULL).\n\n" +- "We keep track of the `openers_bottom` for each delimiter\ntype (" +- "`*`, `_`" +- "), indexed to the length of the closing delimiter run\n" +- "(modulo 3) and to whether the closing delimiter can also be an\n" +- "opener. Initialize this to `stack_bottom`.\n\n" - "Then we repeat the following until we run out of potential\nclosers:\n\n" -- "- Move `current_position` forward in the delimiter stack (if needed)\n" -- " until we find the first potential closer with delimiter `*` or `_`" +- "- " +- "Move `current_position`" +- " forward in the delimiter stack (if needed)\n " +- "until we find the first potential closer with delimiter `*` or `_`" - ".\n (This will be the potential closer closest\n " -- "to the beginning of the input -- the first one in parse order.)\n\n" -- "- Now, look back in the stack (staying above `stack_bottom`" -- " and\n the `openers_bottom`" +- to the beginning of the input -- the first one in parse order.) 
+- "\n\n- " +- "Now, look back in the stack (staying above `stack_bottom` and" +- "\n the `openers_bottom`" - " for this delimiter type) for the\n first matching potential opener (\"matching\"" - " means same delimiter).\n\n- If one is found:\n\n " -- "+ Figure out whether we have emphasis or strong emphasis:\n" -- " if both closer and opener spans have length >= 2, we have" -- "\n strong, otherwise regular.\n\n " -- "+ Insert an emph or strong emph node accordingly, after\n" -- " the text node corresponding to the opener.\n\n " -- "+ Remove any delimiters between the opener and closer from\n" -- " the delimiter stack.\n\n " -- + Remove 1 (for regular emph) or 2 (for strong emph -- ") delimiters\n " +- "+ Figure out whether we have emphasis or strong emphasis:\n " +- "if both closer and opener spans have length >= 2, we have\n " +- "strong, otherwise regular.\n\n " +- "+ Insert an emph or strong emph node accordingly, after\n " +- "the text node corresponding to the opener.\n\n " +- "+ Remove any delimiters between the opener and closer from\n " +- the delimiter stack. +- "\n\n + " +- Remove 1 (for regular emph) or 2 (for strong emph) +- " delimiters\n " - "from the opening and closing text nodes. If they become empty\n " - "as a result, remove them and remove the corresponding element\n " - "of the delimiter stack. 
If the closing node is removed, reset\n " - "`current_position` to the next element in the stack.\n\n" -- "- If none is found:\n\n" -- " + Set `openers_bottom` to the element before `current_position`.\n" -- " (We know that there are no openers for this kind of closer up" -- " to and\n " -- "including this point, so this puts a lower bound on future searches.)\n\n " -- "+ If the closer at `current_position` is not a potential opener,\n" -- " remove it from the delimiter stack (since we know it can't\n " +- "- If none is found:\n" +- "\n + " +- "Set `openers_bottom` to the element before `current_position`" +- ".\n " +- (We know that there are no openers for this kind of closer up to +- " and\n " +- "including this point, so this puts a lower bound on future searches.)" +- "\n\n + " +- "If the closer at `current_position`" +- " is not a potential opener,\n " +- "remove it from the delimiter stack (since we know it can't\n " - "be a closer either).\n\n " - "+ Advance `current_position` to the next element in the stack.\n\n" - "After we're done, we remove all delimiters above `stack_bottom` from" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md.snap index 0d0d1be..6dc958a 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md.snap @@ -65,8 +65,8 @@ input_file: tests/inputs/markdown/github_flavored.md - " + Facilisis in pretium nisl aliquet\n" - " - Nulla volutpat aliquam velit\n+ Very easy!\n" - "```\n\n" -- "1. First ordered list item\n" -- "2. Another item\n⋅⋅*" +- 1. First ordered list item +- "\n2. Another item\n⋅⋅*" - " Unordered sub-list.\n" - "1. Actual numbers don't matter, just that it's a number\n" - "⋅⋅1. 
Ordered sub-list\n" @@ -167,7 +167,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "[id]: https://octodex.github.com/images/dojocat.jpg " - "\"The Dojocat\"\n```\n\n" - "Here's our logo (hover to see the title text):\n" -- "\nInline-style:\n![" +- "\nInline-style:\n" +- "![" - "alt text](https://github.com/adam-p/markdown-here/raw/master" - "/src/common/images/icon48.png \"Logo Title Text 1\")\n\n" - "Reference-style:\n![alt text][logo]\n" @@ -191,8 +192,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "Inline footnote^[Text of inline footnote] definition.\n\n" - "Duplicated footnote reference[^second].\n\n" - "[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n" -- "[^second]: Footnote text.\n```\n\n" -- "Footnote 1 link[^first].\n\nFootnote 2 link[^second].\n" +- "[^second]: Footnote text.\n```\n\nFootnote 1 link" +- "[^first].\n\nFootnote 2 link[^second].\n" - "\nInline footnote^[Text of inline footnote] definition.\n" - "\nDuplicated footnote reference[^second].\n" - "\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n" @@ -313,18 +314,18 @@ input_file: tests/inputs/markdown/github_flavored.md - "> Blockquotes can also be nested...\n" - ">> ...by using additional greater-than signs right next to each other...\n" - "> > > ...or with spaces between arrows.\n```\n\n" -- "> Blockquotes are very handy in email to emulate reply text.\n" -- "> This line is part of the same quote.\n\nQuote break.\n\n" +- "> Blockquotes are very handy in email to emulate reply text.\n> " +- "This line is part of the same quote.\n\nQuote break.\n\n" - "> This is a very long line that will still be quoted properly when it wraps" - ". Oh boy let'" - s keep writing to make sure this is long enough to actually wrap for everyone. 
- " Oh, you can *put* **Markdown** into a blockquote.\n\n" -- "> Blockquotes can also be nested...\n" -- ">" -- "> ...by using additional greater-than signs right next to each other...\n" -- "> > > ...or with spaces between arrows.\n\n------\n\n" +- "> Blockquotes can also be nested...\n>" +- "> ...by using additional greater-than signs right next to each other..." +- "\n> > > ...or with spaces between arrows.\n\n------\n\n" - "# Inline HTML\n\n" -- "```\n
    \n
    Definition list
    \n" +- "```\n" +- "
    \n
    Definition list
    \n" - "
    Is something people use sometimes.
    \n\n" - "
    Markdown in HTML
    \n" - "
    Does *not* work **very** well. " @@ -335,11 +336,13 @@ input_file: tests/inputs/markdown/github_flavored.md - "
    Does *not* work **very** well. " - "Use HTML tags.
    \n
    \n\n------\n\n" - "# Horizontal Rules\n\n" -- "```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n" -- "___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n" -- "***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n" +- "```\n" +- "Three or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\n" +- "Underscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n" +- "\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n" - "# YouTube Videos\n\n" -- "```\n\n" - "\"IMAGE\n\n" -- "\n```\n[![" +- "\n```\n" +- "[![" - "IMAGE ALT TEXT HERE](http://img.youtube.com/vi/" - "YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com" - "/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```\n\n" -- "[![" -- "IMAGE ALT TEXT HERE](https://upload.wikimedia.org/wikipedia/commons/thumb/e" -- /ef/YouTube_logo_2015.svg/1200px- -- "YouTube_logo_2015.svg.png)](https://www.youtube.com/watch?" +- "[![IMAGE ALT TEXT HERE" +- "](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/" +- YouTube_logo_2015.svg/1200px-YouTube_logo_2015 +- ".svg.png)](https://www.youtube.com/watch?" - "v=ciawICBvQoE)\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md-2.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md-2.snap index dfb8d71..fc5a101 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md-2.snap @@ -10,10 +10,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The point can be illustrated by comparing a sample of\n[AsciiDoc](https://asciidoc.org/) with\nan equivalent sample of Markdown. Here is a sample of\nAsciiDoc from the AsciiDoc manual:\n\n```\n1. 
List item one.\n+\nList item one continued with a second paragraph followed by an\nIndented block.\n+\n.................\n$ ls *.sh\n$ mv *.sh ~/tmp\n.................\n+\nList item continued with a third paragraph.\n\n2. List item two continued with an open block.\n+\n--\nThis paragraph is part of the preceding list item.\n\na. This list is nested and does not require explicit item\ncontinuation.\n+\nThis paragraph is part of the preceding list item.\n\nb. List item b.\n\nThis paragraph belongs to item two of the outer list.\n--\n```\n\nAnd here is the equivalent in Markdown:" - "```\n1. List item one.\n\n List item one continued with a second paragraph followed by an\n Indented block.\n\n $ ls *.sh\n $ mv *.sh ~/tmp\n\n List item continued with a third paragraph.\n\n2. List item two continued with an open block.\n\n This paragraph is part of the preceding list item.\n\n 1. This list is nested and does not require explicit item continuation.\n\n This paragraph is part of the preceding list item.\n\n 2. List item b.\n\n This paragraph belongs to item two of the outer list.\n```\n\nThe AsciiDoc version is, arguably, easier to write. You don't need\nto worry about indentation. But the Markdown version is much easier\nto read. The nesting of list items is apparent to the eye in the\nsource, not just in the processed document." - "## Why is a spec needed?\n\nJohn Gruber's [canonical description of Markdown's\nsyntax](https://daringfireball.net/projects/markdown/syntax)\ndoes not specify the syntax unambiguously. Here are some examples of\nquestions it does not answer:" -- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. 
This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See [this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)\n\n2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank" -- " lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```\n\n4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)" -- "5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```\n\n7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```" -- "9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```\n\n11. Can list items include section headings? (`Markdown.pl` does not\n allow this, but does allow blockquotes to include headings.)\n\n ``` markdown\n - # Heading\n ```\n\n12. Can list items be empty?\n\n ``` markdown\n * a\n *\n * b\n ```\n\n13. 
Can link references be defined inside block quotes or list items?\n\n ``` markdown\n > Blockquote [foo].\n >\n > [foo]: /url\n ```\n\n14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n ``` markdown\n [foo]: /url1\n [foo]: /url2\n\n [foo][]\n ```" +- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See [this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)" +- "2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank\n lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```" +- "4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)\n\n5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```" +- "7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```\n\n9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```" +- "11. Can list items include section headings? (`Markdown.pl` does not\n allow this, but does allow blockquotes to include headings.)\n\n ``` markdown\n - # Heading\n ```\n\n12. Can list items be empty?\n\n ``` markdown\n * a\n *\n * b\n ```\n\n13. 
Can link references be defined inside block quotes or list items?\n\n ``` markdown\n > Blockquote [foo].\n >\n > [foo]: /url\n ```\n\n14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n ``` markdown\n [foo]: /url1\n [foo]: /url2\n\n [foo][]\n ```" - "In the absence of a spec, early implementers consulted `Markdown.pl`\nto resolve these ambiguities. But `Markdown.pl` was quite buggy, and\ngave manifestly bad results in many cases, so it was not a\nsatisfactory replacement for a spec.\n\nBecause there is no unambiguous spec, implementations have diverged\nconsiderably. As a result, users are often surprised to find that\na document that renders one way on one system (say, a GitHub wiki)\nrenders differently on another (say, converting to docbook using\npandoc). To make matters worse, because nothing in Markdown counts\nas a \"syntax error,\" the divergence often isn't discovered right away." - "## About this document\n\nThis document attempts to specify Markdown syntax unambiguously.\nIt contains many examples with side-by-side Markdown and\nHTML. These are intended to double as conformance tests. An\naccompanying script `spec_tests.py` can be used to run the tests\nagainst any Markdown program:\n\n python test/spec_tests.py --spec spec.txt --program PROGRAM\n\nSince this document describes how Markdown is to be parsed into\nan abstract syntax tree, it would have made sense to use an abstract\nrepresentation of the syntax tree instead of HTML. But HTML is capable\nof representing the structural distinctions we need to make, and the\nchoice of HTML for the tests makes it possible to run the tests against\nan implementation without writing an abstract syntax tree renderer." - "Note that not every feature of the HTML samples is mandated by\nthe spec. For example, the spec says what counts as a link\ndestination, but it doesn't mandate that non-ASCII characters in\nthe URL be percent-encoded. 
To use the automatic tests,\nimplementers will need to provide a renderer that conforms to\nthe expectations of the spec examples (percent-encoding\nnon-ASCII characters in URLs). But a conforming implementation\ncan use a different renderer and may choose not to\npercent-encode non-ASCII characters in URLs.\n\nThis document is generated from a text file, `spec.txt`, written\nin Markdown with a small extension for the side-by-side tests.\nThe script `tools/makespec.py` can be used to convert `spec.txt` into\nHTML or CommonMark (which can then be converted into other formats).\n\nIn the examples, the `→` character is used to represent tabs." @@ -85,9 +86,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n````;\n````\n.\n

    \n````````````````````````````````\n\n\n[Info strings] for backtick code blocks cannot contain backticks:\n\n```````````````````````````````` example\n``` aa ```\nfoo\n.\n

    aa\nfoo

    \n````````````````````````````````\n\n\n[Info strings] for tilde code blocks can contain backticks and tildes:\n\n```````````````````````````````` example\n~~~ aa ``` ~~~\nfoo\n~~~\n.\n
    foo\n
    \n````````````````````````````````\n\n\nClosing code fences cannot have [info strings]:" - "```````````````````````````````` example\n```\n``` aaa\n```\n.\n
    ``` aaa\n
    \n````````````````````````````````" - "## HTML blocks\n\nAn [HTML block](@) is a group of lines that is treated\nas raw HTML (and will not be escaped in HTML output).\n\nThere are seven kinds of [HTML block], which can be defined by their\nstart and end conditions. The block begins with a line that meets a\n[start condition](@) (after up to three optional spaces of indentation).\nIt ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks) containing the current HTML\nblock, if no line is encountered that meets the [end condition]. If\nthe first line meets both the [start condition] and the [end\ncondition], the block will contain just that line." -- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n``, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n5. **Start condition:** line begins with the string\n``.\n\n6. **Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:** line is followed by a [blank line]." -- "7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line]." +- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n``, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. 
**Start condition:** line begins with the string ``.\n\n5. **Start condition:** line begins with the string\n``." +- "6." +- "**Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:**" +- " line is followed by a [blank line].\n\n7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line]." - "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\nblock** that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state.\n\nFor instance, `
    ` within an HTML block started by `` will not affect\nthe parser state; as the HTML block was started in by start condition 6, it\nwill end at any blank line. This can be surprising:\n\n```````````````````````````````` example\n
    \n
    \n**Hello**,\n\n_world_.\n
    \n
    \n.\n
    \n
    \n**Hello**,\n

    world.\n

    \n
    \n````````````````````````````````" - "In this case, the HTML block is terminated by the blank line — the `**Hello**`\ntext remains verbatim — and regular parsing resumes, with a paragraph,\nemphasised `world` and inline and block HTML following.\n\nAll types of [HTML blocks] except type 7 may interrupt\na paragraph. Blocks of type 7 may not interrupt a paragraph.\n(This restriction is intended to prevent unwanted interpretation\nof long tags inside a wrapped paragraph as starting HTML blocks.)\n\nSome simple examples follow. Here are some basic HTML blocks\nof type 6:\n\n```````````````````````````````` example\n\n \n \n \n
    \n hi\n
    \n\nokay.\n.\n\n \n \n \n
    \n hi\n
    \n

    okay.

    \n````````````````````````````````" - "```````````````````````````````` example\n \n*foo*\n````````````````````````````````\n\n\nHere we have two HTML blocks with a Markdown paragraph between them:\n\n```````````````````````````````` example\n
    \n\n*Markdown*\n\n
    \n.\n
    \n

    Markdown

    \n
    \n````````````````````````````````\n\n\nThe tag on the first line can be partial, as long\nas it is split where there would be whitespace:" @@ -136,8 +138,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n> > > foo\nbar\n.\n
    \n
    \n
    \n

    foo\nbar

    \n
    \n
    \n
    \n````````````````````````````````\n\n\n```````````````````````````````` example\n>>> foo\n> bar\n>>baz\n.\n
    \n
    \n
    \n

    foo\nbar\nbaz

    \n
    \n
    \n
    \n````````````````````````````````\n\n\nWhen including an indented code block in a block quote,\nremember that the [block quote marker] includes\nboth the `>` and a following space of indentation. So *five spaces* are needed\nafter the `>`:" - "```````````````````````````````` example\n> code\n\n> not code\n.\n
    \n
    code\n
    \n
    \n
    \n

    not code

    \n
    \n````````````````````````````````" - "## List items\n\nA [list marker](@) is a\n[bullet list marker] or an [ordered list marker].\n\nA [bullet list marker](@)\nis a `-`, `+`, or `*` character.\n\nAn [ordered list marker](@)\nis a sequence of 1--9 arabic digits (`0-9`), followed by either a\n`.` character or a `)` character. (The reason for the length\nlimit is that with 10 digits we start seeing integer overflows\nin some browsers.)\n\nThe following rules define [list items]:" -- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n\n 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if" -- " the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item." +- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. 
The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:" +- " 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item." - "For example, let *Ls* be the lines\n\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    \n````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:\n\n```````````````````````````````` example\n1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n````````````````````````````````" - "The most important thing to notice is that the position of\nthe text after the list marker determines how much indentation\nis needed in subsequent blocks in the list item. If the list\nmarker takes up two spaces of indentation, and there are three spaces between\nthe list marker and the next character other than a space or tab, then blocks\nmust be indented five spaces in order to fall under the list\nitem.\n\nHere are some examples showing how far content must be indented to be\nput under the list item:\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • one
    • \n
    \n

    two

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n````````````````````````````````" - "```````````````````````````````` example\n - one\n\n two\n.\n
      \n
    • one
    • \n
    \n
     two\n
    \n````````````````````````````````\n\n\n```````````````````````````````` example\n - one\n\n two\n.\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n````````````````````````````````\n\n\nIt is tempting to think of this in terms of columns: the continuation\nblocks must be indented at least to the column of the first character other than\na space or tab after the list marker. However, that is not quite right.\nThe spaces of indentation after the list marker determine how much relative\nindentation is needed. Which column this indentation reaches will depend on\nhow the list item is embedded in other constructions, as shown by\nthis example:" @@ -200,25 +202,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "First, some definitions. A [delimiter run](@) is either\na sequence of one or more `*` characters that is not preceded or\nfollowed by a non-backslash-escaped `*` character, or a sequence\nof one or more `_` characters that is not preceded or followed by\na non-backslash-escaped `_` character.\n\nA [left-flanking delimiter run](@) is\na [delimiter run] that is (1) not followed by [Unicode whitespace],\nand either (2a) not followed by a [Unicode punctuation character], or\n(2b) followed by a [Unicode punctuation character] and\npreceded by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace." 
- "A [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n - Both left and right-flanking:\n\n ```\n abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```" - "(The idea of distinguishing left-flanking and right-flanking\ndelimiter runs based on the character before and the character\nafter comes from Roopesh Chander's\n[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\" and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:" -- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. 
A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n6. A double `__` [can open strong emphasis] iff" -- " it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." +- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run]." +- "6. 
A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." - "9. Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are\n multiples of 3.\n\n10. Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing\n delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3." - "11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n12. 
A literal `_` character cannot occur at the beginning or end of\n `_`-delimited emphasis or `__`-delimited strong emphasis, unless it\n is backslash-escaped.\n\nWhere rules 1--12 above are compatible with multiple parsings,\nthe following principles resolve ambiguity:" -- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis." -- "So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*
    foo*` rather than as\n `[foo](bar)`.\n\nThese rules can be illustrated through a series of examples.\n\nRule 1:\n\n```````````````````````````````` example\n*foo bar*\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na * foo bar*\n.\n

    a * foo bar*

    \n````````````````````````````````" -- "This is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na*\"foo\"*\n.\n

    a*"foo"*

    \n````````````````````````````````\n\n\nUnicode nonbreaking spaces count as whitespace, too:\n\n```````````````````````````````` example\n* a *\n.\n

    * a *

    \n````````````````````````````````\n\n\nUnicode symbols count as punctuation, too:\n\n```````````````````````````````` example\n*$*alpha.\n\n*£*bravo.\n\n*€*charlie.\n.\n

    *$*alpha.

    \n

    *£*bravo.

    \n

    *€*charlie.

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is permitted:" -- "```````````````````````````````` example\nfoo*bar*\n.\n

    foobar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5*6*78\n.\n

    5678

    \n````````````````````````````````\n\n\nRule 2:\n\n```````````````````````````````` example\n_foo bar_\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is followed by\nwhitespace:\n\n```````````````````````````````` example\n_ foo bar_\n.\n

    _ foo bar_

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is preceded\nby an alphanumeric and followed by punctuation:" -- "```````````````````````````````` example\na_\"foo\"_\n.\n

    a_"foo"_

    \n````````````````````````````````\n\n\nEmphasis with `_` is not allowed inside words:\n\n```````````````````````````````` example\nfoo_bar_\n.\n

    foo_bar_

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5_6_78\n.\n

    5_6_78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням_стремятся_\n.\n

    пристаням_стремятся_

    \n````````````````````````````````\n\n\nHere `_` does not generate emphasis, because the first delimiter run\nis right-flanking and the second left-flanking:" -- "```````````````````````````````` example\naa_\"bb\"_cc\n.\n

    aa_"bb"_cc

    \n````````````````````````````````\n\n\nThis is emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:\n\n```````````````````````````````` example\nfoo-_(bar)_\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\nRule 3:\n\nThis is not emphasis, because the closing delimiter does\nnot match the opening delimiter:\n\n```````````````````````````````` example\n_foo*\n.\n

    _foo*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the closing `*` is preceded by\nwhitespace:" -- "```````````````````````````````` example\n*foo bar *\n.\n

    *foo bar *

    \n````````````````````````````````\n\n\nA line ending also counts as whitespace:\n\n```````````````````````````````` example\n*foo bar\n*\n.\n

    *foo bar\n*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the second `*` is\npreceded by punctuation and followed by an alphanumeric\n(hence it is not part of a [right-flanking delimiter run]:\n\n```````````````````````````````` example\n*(*foo)\n.\n

    *(*foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith this example:" -- "```````````````````````````````` example\n*(*foo*)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is allowed:\n\n```````````````````````````````` example\n*foo*bar\n.\n

    foobar

    \n````````````````````````````````\n\n\n\nRule 4:\n\nThis is not emphasis, because the closing `_` is preceded by\nwhitespace:\n\n```````````````````````````````` example\n_foo bar _\n.\n

    _foo bar _

    \n````````````````````````````````\n\n\nThis is not emphasis, because the second `_` is\npreceded by punctuation and followed by an alphanumeric:" -- "```````````````````````````````` example\n_(_foo)\n.\n

    _(_foo)

    \n````````````````````````````````\n\n\nThis is emphasis within emphasis:\n\n```````````````````````````````` example\n_(_foo_)_\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis is disallowed for `_`:\n\n```````````````````````````````` example\n_foo_bar\n.\n

    _foo_bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n_пристаням_стремятся\n.\n

    _пристаням_стремятся

    \n````````````````````````````````" -- "```````````````````````````````` example\n_foo_bar_baz_\n.\n

    foo_bar_baz

    \n````````````````````````````````\n\n\nThis is emphasis, even though the closing delimiter is\nboth left- and right-flanking, because it is followed by\npunctuation:\n\n```````````````````````````````` example\n_(bar)_.\n.\n

    (bar).

    \n````````````````````````````````\n\n\nRule 5:\n\n```````````````````````````````` example\n**foo bar**\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n\n```````````````````````````````` example\n** foo bar**\n.\n

    ** foo bar**

    \n````````````````````````````````" -- "This is not strong emphasis, because the opening `**` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na**\"foo\"**\n.\n

    a**"foo"**

    \n````````````````````````````````\n\n\nIntraword strong emphasis with `**` is permitted:\n\n```````````````````````````````` example\nfoo**bar**\n.\n

    foobar

    \n````````````````````````````````\n\n\nRule 6:\n\n```````````````````````````````` example\n__foo bar__\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:" -- "```````````````````````````````` example\n__ foo bar__\n.\n

    __ foo bar__

    \n````````````````````````````````\n\n\nA line ending counts as whitespace:\n```````````````````````````````` example\n__\nfoo bar__\n.\n

    __\nfoo bar__

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening `__` is preceded\nby an alphanumeric and followed by punctuation:\n\n```````````````````````````````` example\na__\"foo\"__\n.\n

    a__"foo"__

    \n````````````````````````````````\n\n\nIntraword strong emphasis is forbidden with `__`:\n\n```````````````````````````````` example\nfoo__bar__\n.\n

    foo__bar__

    \n````````````````````````````````" -- "```````````````````````````````` example\n5__6__78\n.\n

    5__6__78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням__стремятся__\n.\n

    пристаням__стремятся__

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__foo, __bar__, baz__\n.\n

    foo, bar, baz

    \n````````````````````````````````\n\n\nThis is strong emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:" -- "```````````````````````````````` example\nfoo-__(bar)__\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\n\nRule 7:\n\nThis is not strong emphasis, because the closing delimiter is preceded\nby whitespace:\n\n```````````````````````````````` example\n**foo bar **\n.\n

    **foo bar **

    \n````````````````````````````````\n\n\n(Nor can it be interpreted as an emphasized `*foo bar *`, because of\nRule 11.)\n\nThis is not strong emphasis, because the second `**` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n**(**foo)\n.\n

    **(**foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith these examples:" -- "```````````````````````````````` example\n*(**foo**)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**Gomphocarpus (*Gomphocarpus physocarpus*, syn.\n*Asclepias physocarpa*)**\n.\n

    Gomphocarpus (Gomphocarpus physocarpus, syn.\nAsclepias physocarpa)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**foo \"*bar*\" foo**\n.\n

    foo "bar" foo

    \n````````````````````````````````\n\n\nIntraword emphasis:" +- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`." +- "17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`." +- "These rules can be illustrated through a series of examples.\n\nRule 1:\n\n```````````````````````````````` example\n*foo bar*\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na * foo bar*\n.\n

    a * foo bar*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na*\"foo\"*\n.\n

    a*"foo"*

    \n````````````````````````````````\n\n\nUnicode nonbreaking spaces count as whitespace, too:" +- "```````````````````````````````` example\n* a *\n.\n

    * a *

    \n````````````````````````````````\n\n\nUnicode symbols count as punctuation, too:\n\n```````````````````````````````` example\n*$*alpha.\n\n*£*bravo.\n\n*€*charlie.\n.\n

    *$*alpha.

    \n

    *£*bravo.

    \n

    *€*charlie.

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is permitted:\n\n```````````````````````````````` example\nfoo*bar*\n.\n

    foobar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5*6*78\n.\n

    5678

    \n````````````````````````````````" +- "Rule 2:\n\n```````````````````````````````` example\n_foo bar_\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is followed by\nwhitespace:\n\n```````````````````````````````` example\n_ foo bar_\n.\n

    _ foo bar_

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is preceded\nby an alphanumeric and followed by punctuation:\n\n```````````````````````````````` example\na_\"foo\"_\n.\n

    a_"foo"_

    \n````````````````````````````````\n\n\nEmphasis with `_` is not allowed inside words:\n\n```````````````````````````````` example\nfoo_bar_\n.\n

    foo_bar_

    \n````````````````````````````````" +- "```````````````````````````````` example\n5_6_78\n.\n

    5_6_78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням_стремятся_\n.\n

    пристаням_стремятся_

    \n````````````````````````````````\n\n\nHere `_` does not generate emphasis, because the first delimiter run\nis right-flanking and the second left-flanking:\n\n```````````````````````````````` example\naa_\"bb\"_cc\n.\n

    aa_"bb"_cc

    \n````````````````````````````````\n\n\nThis is emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:" +- "```````````````````````````````` example\nfoo-_(bar)_\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\nRule 3:\n\nThis is not emphasis, because the closing delimiter does\nnot match the opening delimiter:\n\n```````````````````````````````` example\n_foo*\n.\n

    _foo*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the closing `*` is preceded by\nwhitespace:\n\n```````````````````````````````` example\n*foo bar *\n.\n

    *foo bar *

    \n````````````````````````````````\n\n\nA line ending also counts as whitespace:\n\n```````````````````````````````` example\n*foo bar\n*\n.\n

    *foo bar\n*

    \n````````````````````````````````" +- "This is not emphasis, because the second `*` is\npreceded by punctuation and followed by an alphanumeric\n(hence it is not part of a [right-flanking delimiter run]:\n\n```````````````````````````````` example\n*(*foo)\n.\n

    *(*foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith this example:\n\n```````````````````````````````` example\n*(*foo*)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is allowed:\n\n```````````````````````````````` example\n*foo*bar\n.\n

    foobar

    \n````````````````````````````````\n\n\n\nRule 4:\n\nThis is not emphasis, because the closing `_` is preceded by\nwhitespace:" +- "```````````````````````````````` example\n_foo bar _\n.\n

    _foo bar _

    \n````````````````````````````````\n\n\nThis is not emphasis, because the second `_` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n_(_foo)\n.\n

    _(_foo)

    \n````````````````````````````````\n\n\nThis is emphasis within emphasis:\n\n```````````````````````````````` example\n_(_foo_)_\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis is disallowed for `_`:\n\n```````````````````````````````` example\n_foo_bar\n.\n

    _foo_bar

    \n````````````````````````````````" +- "```````````````````````````````` example\n_пристаням_стремятся\n.\n

    _пристаням_стремятся

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n_foo_bar_baz_\n.\n

    foo_bar_baz

    \n````````````````````````````````\n\n\nThis is emphasis, even though the closing delimiter is\nboth left- and right-flanking, because it is followed by\npunctuation:\n\n```````````````````````````````` example\n_(bar)_.\n.\n

    (bar).

    \n````````````````````````````````\n\n\nRule 5:\n\n```````````````````````````````` example\n**foo bar**\n.\n

    foo bar

    \n````````````````````````````````" +- "This is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n\n```````````````````````````````` example\n** foo bar**\n.\n

    ** foo bar**

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening `**` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na**\"foo\"**\n.\n

    a**"foo"**

    \n````````````````````````````````\n\n\nIntraword strong emphasis with `**` is permitted:\n\n```````````````````````````````` example\nfoo**bar**\n.\n

    foobar

    \n````````````````````````````````\n\n\nRule 6:" +- "```````````````````````````````` example\n__foo bar__\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n\n```````````````````````````````` example\n__ foo bar__\n.\n

    __ foo bar__

    \n````````````````````````````````\n\n\nA line ending counts as whitespace:\n```````````````````````````````` example\n__\nfoo bar__\n.\n

    __\nfoo bar__

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening `__` is preceded\nby an alphanumeric and followed by punctuation:\n\n```````````````````````````````` example\na__\"foo\"__\n.\n

    a__"foo"__

    \n````````````````````````````````" +- "Intraword strong emphasis is forbidden with `__`:\n\n```````````````````````````````` example\nfoo__bar__\n.\n

    foo__bar__

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5__6__78\n.\n

    5__6__78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням__стремятся__\n.\n

    пристаням__стремятся__

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__foo, __bar__, baz__\n.\n

    foo, bar, baz

    \n````````````````````````````````" +- "This is strong emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:\n\n```````````````````````````````` example\nfoo-__(bar)__\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\n\nRule 7:\n\nThis is not strong emphasis, because the closing delimiter is preceded\nby whitespace:\n\n```````````````````````````````` example\n**foo bar **\n.\n

    **foo bar **

    \n````````````````````````````````\n\n\n(Nor can it be interpreted as an emphasized `*foo bar *`, because of\nRule 11.)\n\nThis is not strong emphasis, because the second `**` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n**(**foo)\n.\n

    **(**foo)

    \n````````````````````````````````" +- "The point of this restriction is more easily appreciated\nwith these examples:\n\n```````````````````````````````` example\n*(**foo**)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**Gomphocarpus (*Gomphocarpus physocarpus*, syn.\n*Asclepias physocarpa*)**\n.\n

    Gomphocarpus (Gomphocarpus physocarpus, syn.\nAsclepias physocarpa)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**foo \"*bar*\" foo**\n.\n

    foo "bar" foo

    \n````````````````````````````````\n\n\nIntraword emphasis:" - "```````````````````````````````` example\n**foo**bar\n.\n

    foobar

    \n````````````````````````````````\n\n\nRule 8:\n\nThis is not strong emphasis, because the closing delimiter is\npreceded by whitespace:\n\n```````````````````````````````` example\n__foo bar __\n.\n

    __foo bar __

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the second `__` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n__(__foo)\n.\n

    __(__foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith this example:" - "```````````````````````````````` example\n_(__foo__)_\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword strong emphasis is forbidden with `__`:\n\n```````````````````````````````` example\n__foo__bar\n.\n

    __foo__bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__пристаням__стремятся\n.\n

    __пристаням__стремятся

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__foo__bar__baz__\n.\n

    foo__bar__baz

    \n````````````````````````````````" - "This is strong emphasis, even though the closing delimiter is\nboth left- and right-flanking, because it is followed by\npunctuation:\n\n```````````````````````````````` example\n__(bar)__.\n.\n

    (bar).

    \n````````````````````````````````\n\n\nRule 9:\n\nAny nonempty sequence of inline elements can be the contents of an\nemphasized span.\n\n```````````````````````````````` example\n*foo [bar](/url)*\n.\n

    foo bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n*foo\nbar*\n.\n

    foo\nbar

    \n````````````````````````````````\n\n\nIn particular, emphasis and strong emphasis can be nested\ninside emphasis:" @@ -328,5 +330,6 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "#### *look for link or image*\n\nStarting at the top of the delimiter stack, we look backwards\nthrough the stack for an opening `[` or `![` delimiter." - "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`.\n\n- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image.\n\n + If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`.\n\n + If we do, then\n\n * We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)" - "#### *process emphasis*\n\nParameter `stack_bottom` sets a lower bound to how far we\ndescend in the [delimiter stack]. If it is NULL, we can\ngo all the way to the bottom. Otherwise, we stop before\nvisiting `stack_bottom`.\n\nLet `current_position` point to the element on the [delimiter stack]\njust above `stack_bottom` (or the first element if `stack_bottom`\nis NULL).\n\nWe keep track of the `openers_bottom` for each delimiter\ntype (`*`, `_`), indexed to the length of the closing delimiter run\n(modulo 3) and to whether the closing delimiter can also be an\nopener. 
Initialize this to `stack_bottom`.\n\nThen we repeat the following until we run out of potential\nclosers:" -- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. If the closing node is removed, reset" -- " `current_position` to the next element in the stack.\n\n- If none is found:\n\n + Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack." 
+- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter)." +- "- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:" +- "+ Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack." 
diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md.snap index ce42f0d..02ca633 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md.snap @@ -29,8 +29,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - What distinguishes Markdown from many other lightweight markup - "syntaxes, which are often easier to write, is its readability." - "As Gruber writes:" -- "> The overriding design goal for Markdown's formatting syntax is" -- "> to make it as readable as possible. The idea is that a\n>" +- "> The overriding design goal for Markdown's formatting syntax is\n>" +- "to make it as readable as possible. The idea is that a\n>" - "Markdown-formatted document should be publishable as-is, as\n>" - "plain text, without looking like it's been marked up with tags\n>" - "or formatting instructions.\n> (" @@ -60,39 +60,37 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " This paragraph is part of the preceding list item.\n\n 2." - "List item b.\n\n This paragraph belongs to item two of the outer list." - "```" -- "The AsciiDoc version is, arguably, easier to write." -- "You don't need" -- to worry about indentation. But the Markdown version is much easier +- "The AsciiDoc version is, arguably, easier to write. You don'" +- "t need\nto worry about indentation. But the Markdown version is much easier" - to read. The nesting of list items is apparent to the eye in the - "source, not just in the processed document." - "## Why is a spec needed?" 
-- "John Gruber's [canonical description of Markdown's" -- "syntax](https://daringfireball.net/projects/markdown/syntax)" +- "John Gruber's" +- "[canonical description of Markdown's\nsyntax" +- "](https://daringfireball.net/projects/markdown/syntax)" - does not specify the syntax unambiguously. Here are some examples of - "questions it does not answer:" -- 1. How much indentation is needed for a sublist? -- The spec says that +- "1." +- How much indentation is needed for a sublist? The spec says that - "continuation paragraphs need to be indented four spaces, but is" - not fully explicit about sublists. It is natural to think that - "they, too, must be indented four spaces, but `Markdown.pl`" - "does\n not require that. This is hardly a \"corner case,\"" - " and divergences\n between implementations on this issue often lead to surprises for" - users in real documents. (See -- "[this comment by John" -- "Gruber](https://web.archive.org/web/20170611172104/http" -- "://article.gmane.org/gmane.text.markdown.general/1997" -- ).) -- 2. Is a blank line needed before a block quote or heading? +- "[this comment by John\n Gruber" +- "](https://web.archive.org/web/20170611172104/http://" +- article.gmane.org/gmane.text.markdown.general/1997). +- ")\n\n2. Is a blank line needed before a block quote or heading?" - "Most implementations do not require the blank line. However," - "this can lead to unexpected results in hard-wrapped text, and" - also to ambiguities in parsing (note that some implementations - "put the heading inside the blockquote, while others do not)." - (John Gruber has also spoken -- "[in favor of requiring the blank" -- "lines](https://web.archive.org/web/20170611172104/http://" +- "[in favor of requiring the blank\n lines" +- "](https://web.archive.org/web/20170611172104/http://" - article.gmane.org/gmane.text.markdown.general/2146). -- ) -- 3. Is a blank line needed before an indented code block? +- ")\n\n3. 
Is a blank line needed before an indented code block?" - "(`Markdown.pl`" - "requires it, but this is not mentioned in the" - "documentation, and some implementations do not require it.)" @@ -137,8 +135,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` markdown" - "- `a long code span can contain a hyphen like this" - " - and it can screw things up`\n ```" -- "11. Can list items include section headings? (`Markdown.pl` does not" -- "allow this, but does allow blockquotes to include headings.)" +- "11." +- "Can list items include section headings? (`Markdown.pl`" +- " does not\n allow this, but does allow blockquotes to include headings.)" - "``` markdown\n - # Heading\n ```" - "12. Can list items be empty?\n\n ``` markdown\n * a" - " *\n * b\n ```" @@ -154,8 +153,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "was quite buggy, and" - "gave manifestly bad results in many cases, so it was not a" - satisfactory replacement for a spec. -- "Because there is no unambiguous spec, implementations have diverged\nconsiderably." -- "As a result, users are often surprised to find that" +- "Because there is no unambiguous spec, implementations have diverged" +- "considerably. As a result, users are often surprised to find that" - "a document that renders one way on one system (say, a GitHub wiki)" - "renders differently on another (say, converting to docbook using" - "pandoc). To make matters worse, because nothing in Markdown counts" @@ -173,8 +172,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "of representing the structural distinctions we need to make, and the" - choice of HTML for the tests makes it possible to run the tests against - an implementation without writing an abstract syntax tree renderer. -- "Note that not every feature of the HTML samples is mandated by\nthe spec." -- "For example, the spec says what counts as a link" +- Note that not every feature of the HTML samples is mandated by +- "the spec. 
For example, the spec says what counts as a link" - "destination, but it doesn't mandate that non-ASCII characters in" - "the URL be percent-encoded. To use the automatic tests," - implementers will need to provide a renderer that conforms to @@ -182,49 +181,52 @@ input_file: tests/inputs/markdown/commonmark_spec.md - non-ASCII characters in URLs). But a conforming implementation - can use a different renderer and may choose not to - percent-encode non-ASCII characters in URLs. -- "This document is generated from a text file, `spec.txt`, written" -- "in Markdown with a small extension for the side-by-side tests.\nThe script" -- "`tools/makespec.py` can be used to convert `spec.txt`" -- "into\nHTML or CommonMark (which can then be converted into other formats)." +- "This document is generated from a text file, `spec.txt`" +- ", written\nin Markdown with a small extension for the side-by-side tests." +- "The script `tools/makespec.py` can be used to convert `" +- "spec.txt` into" +- HTML or CommonMark (which can then be converted into other formats). - "In the examples, the `→` character is used to represent tabs." - "# Preliminaries" - "## Characters and lines" - "Any sequence of [characters] is a valid CommonMark\ndocument." -- "A [character](@) is a Unicode code point. Although some" +- "A [character](@)" +- is a Unicode code point. Although some - "code points (for example, combining accents) do not correspond to" - "characters in an intuitive sense, all code points count as characters" - for purposes of this spec. -- This spec does not specify an encoding; it thinks of lines as composed -- "of [characters] rather than bytes. A conforming parser may be limited" +- "This spec does not specify an encoding; it thinks of lines as composed\nof" +- "[characters] rather than bytes. A conforming parser may be limited" - to a certain encoding. 
-- "A [line](@) is a sequence of zero or more [characters]" -- "other than line feed (`U+000A`) or carriage return (`U+" -- "000D`),\nfollowed by a [line ending]" -- or by the end of file. -- "A [line ending](@) is a line feed (`U+000A" -- "`), a carriage return\n(`U+000D`" -- ") not followed by a line feed, or a carriage return and a" -- following line feed. -- "A line containing no characters, or a line containing only spaces" -- "(`U+0020`) or tabs (`U+0009`), is" +- "A [line](@)" +- " is a sequence of zero or more [characters]\nother than line feed (" +- "`U+000A`) or carriage return (`U+000D`" +- "),\nfollowed by a [line ending] or by the end of file." +- "A [line ending](@) is a line feed (" +- "`U+000A`), a carriage return\n(" +- "`U+000D`) not followed by a line feed, or a carriage" +- "return and a\nfollowing line feed." +- "A line containing no characters, or a line containing only spaces\n(" +- "`U+0020`) or tabs (`U+0009`), is" - "called a [blank line](@)." - "The following definitions of character classes will be used in this spec:" -- "A [Unicode whitespace character](@) is a character in the Unicode `" -- "Zs` general\ncategory, or a tab (`U+0009`" -- "), line feed (`U+000A`), form feed (`U+" -- "000C`), or\ncarriage return (`U+000D`)." -- "[Unicode whitespace](@) is a sequence of one or more" -- "[Unicode whitespace characters]." +- "A [Unicode whitespace character](@) is a character in the Unicode" +- "`Zs` general\ncategory, or a tab (" +- "`U+0009`), line feed (`U+000A`)," +- "form feed (`U+000C`), or\ncarriage return (" +- "`U+000D`)." +- "[Unicode whitespace](@)" +- " is a sequence of one or more\n[Unicode whitespace characters]." - "A [tab](@) is `U+0009`." - "A [space](@) is `U+0020`." -- "An [ASCII control character](@) is a character between `U+" -- "0000–1F` (both\nincluding) or" +- "An [ASCII control character](@) is a character between" +- "`U+0000–1F` (both\nincluding) or" - "`U+007F`." -- "An [ASCII punctuation character](@)\nis `!" 
-- "`, `\"`, `#`, `$`, `%`, `&`, `'`, `(" -- "`, `)`,\n`*`, `+`, `,`, `-`, `.`," -- "`/` (U+0021–2F), \n`:`," -- "`;`, `<`, `=`, `>`, `?`, `@`" +- "An [ASCII punctuation character](@)\nis `!`, `\"`," +- "`#`, `$`, `%`, `&`, `'`, `(`, `" +- ")`,\n`*`, `+`, `,`, `-`, `.`, `/`" +- " (U+0021–2F), \n`:`, `;`," +- "`<`, `=`, `>`, `?`, `@`" - " (U+003A–0040),\n`[`, `\\`," - "`]`, `^`, `_`, `` ` `` (U+005B–" - "0060), \n`{`, `|`, `}`, or `~`" @@ -272,7 +274,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n
      bar"
     - "
    \n\n" - "````````````````````````````````" -- "Normally the `>` that begins a block quote may be followed" +- "Normally the `>`" +- that begins a block quote may be followed - "optionally by a space, which is not considered part of the" - "content. In the following case `>`" - "is followed by a tab," @@ -310,8 +313,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "example\n*→*→*→\n.\n
    " - "````````````````````````````````" - "## Insecure characters" -- "For security reasons, the Unicode character `U+0000` must be replaced" -- "with the REPLACEMENT CHARACTER (`U+FFFD`)." +- "For security reasons, the Unicode character `U+0000`" +- " must be replaced\nwith the REPLACEMENT CHARACTER (`U+FFFD`)." - "## Backslash escapes\n\nAny ASCII punctuation character may be backslash-escaped:" - "````````````````````````````````" - example @@ -412,9 +415,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Conforming CommonMark parsers need not store information about - whether a particular character was represented in the source - using a Unicode character or an entity reference. -- "[Entity references](@) consist of `&` + any of the valid" -- "HTML5 entity names + `;`. The\ndocument" -- "" +- "[Entity references](@) consist of `&`" +- " + any of the valid\nHTML5 entity names + `;`" +- ". The\ndocument " - is used as an authoritative source for the valid entity - references and their corresponding code points. - "````````````````````````````````" @@ -426,9 +429,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

      & © Æ Ď" - "¾ ℋ ⅆ\n∲ ≧̸

    " - "````````````````````````````````" -- "[Decimal numeric character\nreferences](@)" -- "consist of `&#` + a string of 1--7 arabic" -- "digits + `;`" +- "[Decimal numeric character\nreferences](@)\nconsist of `&#`" +- "+ a string of 1--7 arabic digits + `;`" - ". A\nnumeric character reference is parsed as the corresponding" - Unicode character. Invalid Unicode code points will be replaced by - "the REPLACEMENT CHARACTER (`U+FFFD`" @@ -439,9 +441,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "# Ӓ Ϡ �\n." - "

    # Ӓ Ϡ �

    " - "````````````````````````````````" -- "[Hexadecimal numeric character\nreferences](@) consist of `&#` +" -- "either `X` or `x` + a string of 1-6" -- "hexadecimal digits + `;`" +- "[Hexadecimal numeric character\nreferences](@) consist of `&#`" +- " +\neither `X` or `x`" +- "+ a string of 1-6 hexadecimal digits + `;`" - ".\nThey too are parsed as the corresponding Unicode character (this" - time specified with a hexadecimal numeral instead of decimal). - "````````````````````````````````" @@ -540,8 +542,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    [a](url "tit")

    " - "````````````````````````````````" - "# Blocks and inlines" -- We can think of a document as a sequence of -- "[blocks](@)" +- "We can think of a document as a sequence of\n[blocks](@)" - "---structural elements like paragraphs, block" - "quotations, lists, headings, rules, and code blocks." - Some blocks (like @@ -550,8 +551,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "content---text," - "links, emphasized text, images, code spans, and so on." - "## Precedence" -- "Indicators of block structure always take precedence over indicators\nof inline structure." -- "So, for example, the following is a list with" +- Indicators of block structure always take precedence over indicators +- "of inline structure. So, for example, the following is a list with" - "two items, not a list with one item containing a code span:" - "````````````````````````````````" - example @@ -696,8 +697,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n####### foo\n.\n

    ####### foo

    " - "````````````````````````````````" -- "At least one space or tab is required between the `#` characters and the" -- "heading's contents, unless the heading is empty. Note that many" +- "At least one space or tab is required between the `#`" +- " characters and the\nheading'" +- "s contents, unless the heading is empty. Note that many" - "implementations currently do not require the space. However, the" - space was required by the - "[original ATX implementation](http://www.aaronsw.com/2002" @@ -755,7 +757,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n### foo ### \n.\n

    foo

    " - "````````````````````````````````" -- "A sequence of `#` characters with anything but spaces or tabs following it" +- "A sequence of `#`" +- characters with anything but spaces or tabs following it - "is not a closing sequence, but counts as part of the contents of the" - "heading:" - "````````````````````````````````" @@ -766,8 +769,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n# foo#\n.\n

    foo#

    " - "````````````````````````````````" -- "Backslash-escaped `#` characters do not count as part" -- "of the closing sequence:" +- "Backslash-escaped `#`" +- " characters do not count as part\nof the closing sequence:" - "````````````````````````````````" - example - "### foo \\###\n## foo #\\##\n# foo \\#\n." @@ -795,7 +798,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    \n

    " - "````````````````````````````````" - "## Setext headings" -- "A [setext heading](@) consists of one or more" +- "A [setext heading](@)" +- consists of one or more - "lines of text, not interrupted by a blank line, of which the first line" - "does not\nhave more than 3 spaces of indentation, followed by\na" - "[setext heading underline]. The lines of text must be such" @@ -804,11 +808,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "interpretable as a [code fence], [ATX heading][ATX headings" - "],\n[block quote][block quotes], [thematic break][thematic breaks]," - "[list item][list items], or [HTML block][HTML blocks]." -- "A [setext heading underline](@) is a sequence of" -- "`=` characters or a sequence of `-` characters, with no more than" -- "3\nspaces of indentation and any number of trailing spaces or tabs." -- "The heading is a level 1 heading if `=` characters are used in" -- "the [setext heading underline], and a level 2 heading if `-`" +- "A [setext heading underline](@) is a sequence of\n`=`" +- "characters or a sequence of `-`" +- "characters, with no more than 3" +- spaces of indentation and any number of trailing spaces or tabs. +- "The heading is a level 1 heading if `=`" +- " characters are used in\nthe [setext heading underline]" +- ", and a level 2 heading if `-`" - characters are used. The contents of the heading are the result - "of parsing the preceding lines of text as CommonMark inline\ncontent." - "In general, a setext heading need not be preceded or followed by a" @@ -888,8 +894,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    <a title="a lot

    " - "

    of dashes"/>

    " - "````````````````````````````````" -- "The setext heading underline cannot be a [lazy continuation" -- "line] in a list item or block quote:" +- "The setext heading underline cannot be a [lazy continuation\nline]" +- "in a list item or block quote:" - "````````````````````````````````" - example - "> Foo\n---\n.\n
    \n

    Foo

    " @@ -951,7 +957,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - example - "\\> foo\n------\n.\n

    > foo

    " - "````````````````````````````````" -- "**Compatibility note:** Most existing Markdown implementations" +- "**Compatibility note:**" +- Most existing Markdown implementations - do not allow the text of setext headings to span multiple lines. - But there is no consensus about how to interpret - "``` markdown\nFoo\nbar\n---\nbaz\n```" @@ -975,8 +982,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Foo\nbar\n\n---\n\nbaz\n.\n

    Foo\nbar

    " - "
    \n

    baz

    " - "````````````````````````````````" -- "or use a thematic break that cannot count as a [setext heading" -- "underline], such as" +- "or use a thematic break that cannot count as a [setext heading\nunderline" +- "], such as" - "````````````````````````````````" - example - "Foo\nbar\n* * *\nbaz\n.\n

    Foo" @@ -989,11 +996,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - baz

    - "````````````````````````````````" - "## Indented code blocks" -- "An [indented code block](@) is composed of one or more" -- "[indented chunks] separated by blank lines.\nAn" -- "[indented chunk](@)" -- "is a sequence of non-blank lines," -- each preceded by four or more spaces of indentation. The contents of the code +- "An [indented code block](@)" +- " is composed of one or more\n[indented chunks] separated by blank lines." +- "An [indented chunk](@) is a sequence of non-blank lines" +- ",\neach preceded by four or more spaces of indentation. The contents of the code" - "block are the literal contents of the lines, including trailing\n[line endings]" - ", minus four spaces of indentation.\nAn indented code block has no [" - "info string]." @@ -1082,8 +1088,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " foo \n.\n
    foo  \n
    " - "````````````````````````````````" - "## Fenced code blocks" -- "A [code fence](@) is a sequence" -- "of at least three consecutive backtick characters (`` ` ``" +- "A [code fence](@)" +- " is a sequence\nof at least three consecutive backtick characters (`` ` ``" - ") or\ntildes (`~`" - "). (Tildes and backticks cannot be mixed.)\nA" - "[fenced code block](@)" @@ -1096,8 +1102,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - characters. (The reason for this restriction is that otherwise - some inline code would be incorrectly interpreted as the - beginning of a fenced code block.) -- "The content of the code block consists of all subsequent lines, until" -- "a closing [code fence] of the same type as the code block" +- "The content of the code block consists of all subsequent lines, until\na closing" +- "[code fence] of the same type as the code block" - "began with (backticks or tildes), and with at least as" - many backticks - or tildes as the opening code fence. @@ -1166,9 +1172,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "~~~~\naaa\n~~~\n~~~~\n.\n
    aaa"
     - "~~~\n
    " - "````````````````````````````````" -- Unclosed code blocks are closed by the end of the document -- "(or the enclosing [block quote][block quotes] or [list item][list" -- "items]):" +- "Unclosed code blocks are closed by the end of the document\n(or the enclosing" +- "[block quote][block quotes] or [list item][list items]):" - "````````````````````````````````" - "example\n```\n.\n
    " - "````````````````````````````````" @@ -1302,7 +1307,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    " - "````````````````````````````````" - "## HTML blocks" -- "An [HTML block](@) is a group of lines that is treated" +- "An [HTML block](@)" +- is a group of lines that is treated - as raw HTML (and will not be escaped in HTML output). - "There are seven kinds of [HTML block], which can be defined by their" - start and end conditions. The block begins with a line that meets a @@ -1314,7 +1320,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " containing the current HTML\nblock, if no line is encountered that meets the [" - "end condition]. If\nthe first line meets both the [start condition]" - " and the [end\ncondition], the block will contain just that line." -- "1. **Start condition:** line begins with the string ``" - ", or the end of the line.\\\n**End condition:**" @@ -1325,11 +1332,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "**End condition:** line contains the string `-->`." - "3. **Start condition:** line begins with the string ``." -- "4. **Start condition:** line begins with the string ``." -- "5. **Start condition:** line begins with the string\n``." +- "5." +- "**Start condition:** line begins with the string\n``." - "6." - "**Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:**" - "line is followed by a [blank line]." - "7." -- "**Start condition:** line begins with a complete [open tag]" -- "(with any [tag name] other than `pre`, `script`," -- "`style`, or `textarea`" +- "**Start condition:**" +- " line begins with a complete [open tag]\n(with any [tag name]" +- " other than `pre`, `script`,\n`style`, or `textarea`" - ") or a complete [closing tag]," - "followed by zero or more spaces and tabs, followed by the end of the" - "line.\\\n**End condition:** line is followed by a [blank line]." 
-- HTML blocks continue until they are closed by their appropriate -- "[end condition], or the last line of the document or other" +- "HTML blocks continue until they are closed by their appropriate\n[end condition]" +- ", or the last line of the document or other" - "[container\nblock](#container-blocks). This means any HTML" - "**within an HTML\nblock**" - that might otherwise be recognised as a start condition will @@ -1500,8 +1509,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n*foo*\n\n.\n\n*foo*" - "" - "````````````````````````````````" -- "In this case, we get a raw HTML block that just includes" -- "the ``" +- "In this case, we get a raw HTML block that just includes\nthe" +- "``" - tag (because it ends with the following blank - "line). So the contents get interpreted as CommonMark:" - "````````````````````````````````" @@ -1509,8 +1518,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n*foo*\n\n\n.\n" - "

    foo

    \n
    " - "````````````````````````````````" -- "Finally, in this case, the `` tags are interpreted" -- "as [raw HTML] *inside*" +- "Finally, in this case, the ``" +- " tags are interpreted\nas [raw HTML] *inside*" - the CommonMark paragraph. (Because - "the tag is not on a line by itself, we get inline HTML" - "rather than an [HTML block].)" @@ -1519,9 +1528,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo*\n." - "

    foo

    " - "````````````````````````````````" -- HTML tags designed to contain literal content -- "(`pre`, `script`, `style`, `textarea`), comments, processing" -- "instructions,\nand declarations are treated somewhat differently." +- "HTML tags designed to contain literal content\n(`pre`, `script`," +- "`style`, `textarea`" +- "), comments, processing instructions,\nand declarations are treated somewhat differently." - "Instead of ending at the first blank line, these blocks" - end at the first line containing a corresponding end tag. - "As a result, these blocks can contain blank lines:" @@ -1658,9 +1667,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "This rule differs from John Gruber's original Markdown syntax" - "specification, which says:" -- "> The only restrictions are that block-level HTML elements —" -- "> e.g. `
    `, ``, `
    `,"
    -- "`

    `" +- ">" +- "The only restrictions are that block-level HTML elements —\n> e.g." +- "`

    `, `
    `, `
    `, `

    `" - ", etc. — must be separated from\n>" - "surrounding content by blank lines, and the start and end tags of the" - "> block should not be indented with spaces or tabs." @@ -1720,8 +1729,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "</td>\n

    \n " - "
    " - "````````````````````````````````" -- "Fortunately, blank lines are usually not necessary and can be\ndeleted." -- "The exception is inside `
    `"
    +- "Fortunately, blank lines are usually not necessary and can be"
    +- "deleted.  The exception is inside `
    `"
     - " tags, but as described\n[above][HTML blocks]"
     - ", raw HTML blocks starting with `
    `\n*can* contain blank lines."
     - "## Link reference definitions"
    @@ -1987,14 +1996,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "

    aaa

    " - "````````````````````````````````" - "# Container blocks" -- "A [container block](#container-blocks) is a block that has other" +- "A [container block](#container-blocks)" +- is a block that has other - "blocks as its contents. There are two basic kinds of container blocks:\n[" - "block quotes] and [list items].\n[Lists] are meta-containers for" - "[list items]." - We define the syntax for container blocks recursively. The general - "form of the definition is:" -- "> If X is a sequence of blocks, then the result of" -- "> transforming X in such-and-such a way is a container of type Y" +- "> If X is a sequence of blocks, then the result of\n>" +- transforming X in such-and-such a way is a container of type Y - "> with these blocks as its content." - "So, we explain what counts as a block quote or list item by explaining" - how these can be *generated* @@ -2003,21 +2013,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*\nthese constructions. (A recipe is provided below in the section entitled" - "[A parsing strategy](#appendix-a-parsing-strategy).)" - "## Block quotes" -- "A [block quote marker](@)," -- "optionally preceded by up to three spaces of indentation," +- "A [block quote marker](@)" +- ",\noptionally preceded by up to three spaces of indentation," - "consists of (a) the character `>`" - together with a following space of - "indentation, or (b) a single character `>` not followed by a" - "space of\nindentation.\n\nThe following rules define [block quotes]:" -- 1. **Basic case. -- "** If a string of lines *Ls*" +- "1." +- "**Basic case.** If a string of lines *Ls*" - " constitute a sequence\n of blocks *Bs*" - ", then the result of prepending a [block quote\n marker]" - " to the beginning of each line in *Ls*\n is a" - "[block quote](#block-quotes) containing *Bs*." -- 2. **Laziness. -- "** If a string of lines *Ls* constitute a" -- "[block\n quote](#block-quotes) with contents *Bs*" +- "2." 
+- "**Laziness.** If a string of lines *Ls*" +- " constitute a [block\n quote](#block-quotes) with contents" +- "*Bs*" - ", then the result of deleting\n the initial [block quote marker]" - from one or - more lines in which the next character other than a space or tab after the @@ -2027,8 +2038,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - is text - "that will be parsed as part of the content of a paragraph, but does" - not occur at the beginning of the paragraph. -- 3. **Consecutiveness. -- "** A document cannot contain two [block\n quotes]" +- "3." +- "**Consecutiveness.**" +- " A document cannot contain two [block\n quotes]" - "in a row unless there is a [blank line] between them." - "Nothing else counts as a [block quote](#block-quotes)." - "Here is a simple example:" @@ -2059,8 +2071,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    > # Foo\n> bar\n> baz"
     - "
    " - "````````````````````````````````" -- "The Laziness clause allows us to omit the `>` before" -- "[paragraph continuation text]:" +- "The Laziness clause allows us to omit the `>`" +- " before\n[paragraph continuation text]:" - "````````````````````````````````" - example - "> # Foo\n> bar\nbaz\n.\n
    " @@ -2092,8 +2104,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
  • foo
  • \n\n
    \n
      " - "
    • bar
    • \n
    " - "````````````````````````````````" -- "For the same reason, we can't omit the `> ` in front of" -- "subsequent lines of an indented or fenced code block:" +- "For the same reason, we can't omit the `> `" +- " in front of\nsubsequent lines of an indented or fenced code block:" - "````````````````````````````````" - example - "> foo\n bar\n.\n
    \n
    foo"
    @@ -2106,8 +2118,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "
    \n
    " - "

    foo

    \n
    " - "````````````````````````````````" -- "Note that in the following case, we have a [lazy" -- "continuation line]:" +- "Note that in the following case, we have a [lazy\ncontinuation line" +- "]:" - "````````````````````````````````" - example - "> foo\n - bar\n.\n
    \n

    foo" @@ -2115,8 +2127,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "To see why, note that in" - "```markdown\n> foo\n> - bar\n```" -- "the `- bar` is indented too far to start a list, and" -- "can't\nbe an indented code block because indented code blocks cannot" +- "the `- bar`" +- "is indented too far to start a list, and can't" +- be an indented code block because indented code blocks cannot - "interrupt paragraphs, so it is [paragraph continuation text]." - "A block quote can be empty:" - "````````````````````````````````" @@ -2137,8 +2150,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "> foo\n\n> bar\n.\n

    \n

    foo

    " - "
    \n
    \n

    bar

    \n
    " - "````````````````````````````````" -- "(Most current Markdown implementations, including John Gruber's" -- "original `Markdown.pl`" +- "(Most current Markdown implementations, including John Gruber's\noriginal" +- "`Markdown.pl`" - ", will parse this example as a single block quote" - with two paragraphs. But it seems better to allow the author to decide - whether two block quotes or one are wanted.) @@ -2185,8 +2198,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "> bar\n>\nbaz\n.\n
    \n

    bar

    " - "
    \n

    baz

    " - "````````````````````````````````" -- It is a consequence of the Laziness rule that any number -- "of initial `>`" +- "It is a consequence of the Laziness rule that any number\nof initial" +- "`>`" - "s may be omitted on a continuation line of a\nnested block quote:" - "````````````````````````````````" - example @@ -2200,8 +2213,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n

    foo\nbar\nbaz

    \n
    " - "
    \n" - "````````````````````````````````" -- "When including an indented code block in a block quote," -- "remember that the [block quote marker] includes\nboth the `>`" +- "When including an indented code block in a block quote,\nremember that the [" +- "block quote marker] includes\nboth the `>`" - and a following space of indentation. So *five spaces* - " are needed\nafter the `>`:" - "````````````````````````````````" @@ -2211,18 +2224,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n

    not code

    \n
    " - "````````````````````````````````" - "## List items" -- "A [list marker](@) is a" -- "[bullet list marker] or an [ordered list marker]." -- "A [bullet list marker](@)" -- "is a `-`, `+`, or `*` character." +- "A [list marker](@)" +- " is a\n[bullet list marker] or an [ordered list marker]." +- "A [bullet list marker](@)\nis a `-`, `+`, or" +- "`*` character." - "An [ordered list marker](@)" -- "is a sequence of 1--9 arabic digits (`0-9`)" -- ", followed by either a\n`.` character or a `)`" +- "is a sequence of 1--9 arabic digits (`0-9`" +- "), followed by either a\n`.` character or a `)`" - character. (The reason for the length - limit is that with 10 digits we start seeing integer overflows - "in some browsers.)\n\nThe following rules define [list items]:" -- 1. **Basic case. -- "** If a sequence of lines *Ls*" +- "1." +- "**Basic case.** If a sequence of lines *Ls*" - " constitute a sequence of\n blocks *Bs*" - "starting with a character other than a space or tab, and *M*" - " is\n a list marker of width *W* followed by 1 ≤" @@ -2240,9 +2253,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*Ls*" - "must not begin with a blank line, and (b) if" - "the list item is ordered, the start number must be 1." -- "2." -- "If any line is a [thematic break][thematic breaks] then" -- that line is not a list item. +- "2. If any line is a [thematic break][thematic breaks]" +- " then\n that line is not a list item." - "For example, let *Ls* be the lines" - "````````````````````````````````" - example @@ -2308,11 +2320,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    one

    \n

    two

    \n" - "\n\n" - "````````````````````````````````" -- "Here `two` occurs in the same column as the list marker `1." -- "`,\nbut is actually contained in the list item, because there is" +- "Here `two` occurs in the same column as the list marker `1.`" +- ",\nbut is actually contained in the list item, because there is" - sufficient indentation after the last containing blockquote marker. -- The converse is also possible. -- "In the following example, the word `two`" +- "The converse is also possible. In the following example, the word" +- "`two`" - "occurs far to the right of the initial text of the list item," - "`one`" - ", but" @@ -2383,11 +2395,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - example - "-1. not ok\n.\n

    -1. not ok

    " - "````````````````````````````````" -- 2. **Item starting with indented code. -- "** If a sequence of lines *Ls*" -- constitute a sequence of blocks *Bs* -- " starting with an indented code\n block, and *M*" -- is a list marker of width *W* +- 2. **Item starting with indented code.** +- " If a sequence of lines *Ls*\n constitute a sequence of blocks" +- "*Bs* starting with an indented code\n block, and" +- "*M* is a list marker of width *W*" - " followed by\n one space of indentation, then the result of prepending" - "*M* and the\n following space to the first line of" - "*Ls*, and indenting subsequent lines\n of *Ls* by" @@ -2471,8 +2482,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n

    bar

    \n" - "" - "````````````````````````````````" -- 3. **Item starting with a blank line. -- "** If a sequence of lines *Ls*" +- 3. **Item starting with a blank line.** +- If a sequence of lines *Ls* - "starting with a single [blank line] constitute a (possibly empty)" - "sequence of blocks *Bs*, and *M* is a list marker of width" - "*W*,\n then the result of prepending *M*" @@ -2538,8 +2549,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo\n*\n\nfoo\n1.\n.\n

    foo\n*

    " - "

    foo\n1.

    " - "````````````````````````````````" -- 4. **Indentation. -- "** If a sequence of lines *Ls*" +- 4. **Indentation.** If a sequence of lines +- "*Ls*" - constitutes a list item - "according to rule #1, #2, or #3, then the result" - "of preceding each line\n of *Ls*" @@ -2582,9 +2593,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " indented code\n\n > A block quote." - "
    " - "````````````````````````````````" -- 5. **Laziness. -- "** If a string of lines *Ls* constitute a" -- "[list\n item](#list-items) with contents *Bs*" +- 5. **Laziness.** If a string of lines +- "*Ls* constitute a [list\n item](#list-items)" +- with contents *Bs* - ", then the result of deleting" - some or all of the indentation from one or more lines in which the - "next character other than a space or tab after the indentation is\n [" @@ -2621,11 +2632,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "continued here.

    \n\n\n" - "" - "````````````````````````````````" -- "6. **That's all." -- "** Nothing that is not counted as a list item by rules\n #1" -- "--5 counts as a [list item](#list-items)." -- The rules for sublists follow from the general rules -- "[above][List items]. A sublist must be indented the same number" +- "6. **That's all.**" +- " Nothing that is not counted as a list item by rules\n #1--" +- "5 counts as a [list item](#list-items)." +- "The rules for sublists follow from the general rules\n[above][List items" +- "]. A sublist must be indented the same number" - of spaces of indentation a paragraph would need to be in order to be included - "in the list item.\n\nSo, in this case we need two spaces indent:" - "````````````````````````````````" @@ -2678,23 +2689,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "### Motivation" - "John Gruber's Markdown spec says the following about list items:" -- "1." -- "\"List markers typically start at the left margin, but may be indented" +- "1. \"" +- "List markers typically start at the left margin, but may be indented" - by up to three spaces. List markers must be followed by one or more - "spaces or a tab.\"" -- "2." -- "\"To make lists look nice, you can wrap items with hanging indents...." +- "2. \"" +- "To make lists look nice, you can wrap items with hanging indents...." - "But if you don't want to, you don't have to.\"" - "3. \"List items may consist of multiple paragraphs. Each subsequent" - paragraph in a list item must be indented by either 4 spaces or one - "tab.\"" - "4. \"It looks nice if you indent every line of the subsequent paragraphs," - "but here again, Markdown will allow you to be lazy.\"" -- "5." -- "\"To put a blockquote within a list item, the blockquote's `>`" -- "delimiters need to be indented.\"" -- "6." -- "\"To put a code block within a list item, the code block needs to be" +- "5. 
\"To put a blockquote within a list item, the blockquote" +- "'s `>`\n delimiters need to be indented.\"" +- "6. \"" +- "To put a code block within a list item, the code block needs to be" - "indented twice — 8 spaces or two tabs.\"" - These rules specify that a paragraph under a list item must be indented - "four spaces (presumably, from the left margin, rather than the start of" @@ -2708,8 +2718,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "block elements under a list item, including other" - "lists, must be indented four spaces. This principle has been called the" - "*four-space rule*." -- "The four-space rule is clear and principled, and if the reference" -- "implementation `Markdown.pl`" +- "The four-space rule is clear and principled, and if the reference\nimplementation" +- "`Markdown.pl`" - "had followed it, it probably would have" - "become the standard. However, `Markdown.pl`" - allowed paragraphs and @@ -2774,8 +2784,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "as a list item with a subparagraph, even though the paragraph `bar`" - "is not indented as far as the first paragraph `foo`:" - "``` markdown\n 10. foo\n\n bar \n```" -- "Arguably this text does read like a list item with `bar` as a" -- "subparagraph," +- "Arguably this text does read like a list item with `bar`" +- "as a subparagraph," - which may count in favor of the proposal. - "However, on this proposal indented" - code would have to be indented six spaces after the list marker. @@ -2795,9 +2805,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - four-space rule in cases where the list marker plus its initial indentation - "takes four spaces (a common case), but diverge in other cases." - "## Lists" -- "A [list](@) is a sequence of one or more" -- "list items [of the same type]. The list items" -- may be separated by any number of blank lines. +- "A [list](@)" +- " is a sequence of one or more\nlist items [of the same type]" +- ". 
The list items\nmay be separated by any number of blank lines." - "Two list items are [of the same type](@)" - "if they begin with a [list marker] of the same type." - Two list markers are of the @@ -2812,7 +2822,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "of an [ordered list] is determined by the list number of" - its initial list item. The numbers of subsequent list items are - disregarded. -- "A list is [loose](@) if any of its constituent" +- "A list is [loose](@)" +- if any of its constituent - "list items are separated by blank lines, or if any of its constituent" - list items directly contain two block-level elements with a blank line - "between them. Otherwise a list is [tight](@)" @@ -2839,7 +2850,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
      \n
    • bar
    • \n
    • baz
    • " - "
    " - "````````````````````````````````" -- "`Markdown.pl` does not allow this, through fear of triggering a list" +- "`Markdown.pl`" +- "does not allow this, through fear of triggering a list" - "via a numeral in a hard-wrapped line:" - "``` markdown\nThe number of windows in my house is\n14." - "The number of doors is 6.\n```" @@ -2850,8 +2862,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "to start lists without blank lines:" - "``` markdown\nI need to buy\n- new shoes\n- a coat" - "- a plane ticket\n```\n\nSecond, we are attracted to a" -- "> [principle of uniformity](@):" -- "> if a chunk of text has a certain\n>" +- ">" +- "[principle of uniformity](@)" +- ":\n> if a chunk of text has a certain\n>" - "meaning, it will continue to have the same meaning when put into a\n>" - container block (such as a list item or blockquote). - "(Indeed, the spec for [list items] and [block quotes]" @@ -3073,15 +3086,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "a literal\nbacktick." - "## Code spans" - "A [backtick string](@)" -- "is a string of one or more backtick characters (`` ` ``) that" -- "is neither\npreceded nor followed by a backtick." -- "A [code span](@) begins with a backtick string and ends with" +- "is a string of one or more backtick characters (`` ` ``" +- ") that is neither\npreceded nor followed by a backtick." +- "A [code span](@)" +- begins with a backtick string and ends with - a backtick string of equal length. The contents of the code span are - "the characters between these two backtick strings, normalized in the\nfollowing ways:" - "- First, [line endings] are converted to [spaces]." 
-- "- If the resulting string both begins *and* ends with a [space]" -- "character, but does not consist entirely of [space]" -- "characters, a single [space] character is removed from the" +- "- If the resulting string both begins *and*" +- " ends with a [space]\n character, but does not consist entirely of [" +- "space]\n characters, a single [space] character is removed from the" - front and back. This allows you to include code that begins - "or ends with backtick characters, which must be separated by" - whitespace from the opening or closing backtick strings. @@ -3139,8 +3153,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`foo bar \nbaz`\n." - "

    foo bar baz

    " - "````````````````````````````````" -- Note that browsers will typically collapse consecutive spaces -- "when rendering ``" +- "Note that browsers will typically collapse consecutive spaces\nwhen rendering ``" - " elements, so it is recommended that\nthe following CSS be used:" - "code{white-space: pre-wrap;}" - Note that backslash escapes do not work in code spans. All backslashes @@ -3150,8 +3163,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`foo\\`bar`\n." - "

    foo\\bar`

    " - "````````````````````````````````" -- "Backslash escapes are never needed, because one can always choose a" -- string of *n* +- "Backslash escapes are never needed, because one can always choose a\nstring of" +- "*n*" - "backtick characters as delimiters, where the code does" - not contain any strings of exactly *n* backtick characters. - "````````````````````````````````" @@ -3224,14 +3237,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    `foobar

    " - "````````````````````````````````" - "## Emphasis and strong emphasis" -- "John Gruber's original [Markdown syntax" -- "description](https://daringfireball.net/projects/markdown/syntax#em" -- ") says:" -- "> Markdown treats asterisks (`*`) and underscores (`_`) as indicators of" -- "> emphasis. Text wrapped with one `*` or `_`" -- " will be wrapped with an HTML\n> `` tag; double" -- "`*`'s or `_`'s will be wrapped with an HTML" -- "``\n> tag." +- "John Gruber's original" +- "[Markdown syntax\ndescription" +- "](https://daringfireball.net/projects/markdown/syntax#em)" +- "says:" +- ">" +- "Markdown treats asterisks (`*`) and underscores (`_`" +- ") as indicators of\n> emphasis. Text wrapped with one `*` or" +- "`_` will be wrapped with an HTML\n> ``" +- "tag; double `*`'s or `_`'s will be wrapped" +- "with an HTML ``\n> tag." - "This is enough for most users, but these rules leave much undecided," - "especially when it comes to nested emphasis. The original\n`Markdown.pl`" - " test suite makes it clear that triple `***` and\n`___`" @@ -3244,8 +3259,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "is clear and they are useful (especially in contexts like bibliography\nentries):" - "``` markdown\n*emph *with emph* in it*" - "**strong **with strong** in it**\n```" -- Many implementations have also restricted intraword emphasis to -- "the `*`" +- "Many implementations have also restricted intraword emphasis to\nthe `*`" - "forms, to avoid unwanted emphasis in words containing" - internal underscores. (It is best practice to put these in code - "spans, but users often do not.)" @@ -3253,25 +3267,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "no emphasis: foo_bar_baz\n```" - "The rules given below capture all of these patterns, while allowing" - for efficient parsing strategies that do not backtrack. -- "First, some definitions. A [delimiter run](@) is either" -- "a sequence of one or more `*`" +- "First, some definitions. 
A [delimiter run](@)" +- " is either\na sequence of one or more `*`" - " characters that is not preceded or\nfollowed by a non-backslash-escaped" - "`*` character, or a sequence\nof one or more `_`" - " characters that is not preceded or followed by\na non-backslash-escaped" - "`_` character." -- "A [left-flanking delimiter run](@) is" -- "a [delimiter run] that is (1) not followed by [Unicode whitespace" -- "],\nand either (2a) not followed by a [Unicode punctuation character]" -- ", or\n(2b) followed by a [Unicode punctuation character] and" -- "preceded by [Unicode whitespace] or a [Unicode punctuation character]." -- "For purposes of this definition, the beginning and the end of" +- "A [left-flanking delimiter run](@)" +- " is\na [delimiter run] that is (1) not followed by [" +- "Unicode whitespace],\nand either (2a) not followed by a [" +- "Unicode punctuation character], or\n(2b) followed by a [" +- "Unicode punctuation character] and\npreceded by [Unicode whitespace] or a [" +- "Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of" - the line count as Unicode whitespace. -- "A [right-flanking delimiter run](@) is" -- "a [delimiter run] that is (1) not preceded by [Unicode whitespace" -- "],\nand either (2a) not preceded by a [Unicode punctuation character]" -- ", or\n(2b) preceded by a [Unicode punctuation character] and" -- "followed by [Unicode whitespace] or a [Unicode punctuation character]." 
-- "For purposes of this definition, the beginning and the end of" +- "A [right-flanking delimiter run](@)" +- " is\na [delimiter run] that is (1) not preceded by [" +- "Unicode whitespace],\nand either (2a) not preceded by a [" +- "Unicode punctuation character], or\n(2b) preceded by a [" +- "Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [" +- "Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of" - "the line count as Unicode whitespace.\n\nHere are some examples of delimiter runs." - " - left-flanking but not right-flanking:\n\n ```\n ***abc" - " _abc\n **\"abc\"\n _\"abc\"\n ```" @@ -3291,39 +3305,46 @@ input_file: tests/inputs/markdown/commonmark_spec.md - and its rules for distinguishing left- and right-flanking runs - are a bit more complex than the ones given here.) - "The following rules define emphasis and strong emphasis:" -- "1. A single `*` character [can open emphasis](@)" -- "iff (if and only if) it is part of a [left-flanking" -- "delimiter run]." -- "2. A single `_` character [can open emphasis] iff" -- "it is part of a [left-flanking delimiter run]" -- "and either (a) not part of a [right-flanking delimiter run]" -- "or (b) part of a [right-flanking delimiter run]" -- "preceded by a [Unicode punctuation character]." -- "3. A single `*` character [can close emphasis](@)" +- "1." +- "A single `*` character [can open emphasis](@)" +- "iff (if and only if) it is part of a [" +- "left-flanking delimiter run]." +- "2." +- "A single `_`" +- " character [can open emphasis] iff\n it is part of a [" +- "left-flanking delimiter run]\n and either (a) not part of a" +- "[right-flanking delimiter run]\n or (b) part of a [" +- "right-flanking delimiter run]\n preceded by a [Unicode punctuation character]." +- "3." +- "A single `*` character [can close emphasis](@)" - "iff it is part of a [right-flanking delimiter run]." -- "4. 
A single `_` character [can close emphasis] iff" -- "it is part of a [right-flanking delimiter run]" -- "and either (a) not part of a [left-flanking delimiter run]" -- "or (b) part of a [left-flanking delimiter run]" -- "followed by a [Unicode punctuation character]." -- "5. A double `**` [can open strong emphasis](@)" +- "4." +- "A single `_`" +- " character [can close emphasis] iff\n it is part of a [" +- "right-flanking delimiter run]\n and either (a) not part of a" +- "[left-flanking delimiter run]\n or (b) part of a [" +- "left-flanking delimiter run]\n followed by a [Unicode punctuation character]." +- "5." +- "A double `**` [can open strong emphasis](@)" - "iff it is part of a [left-flanking delimiter run]." -- "6. A double `__` [can open strong emphasis] iff" -- "it is part of a [left-flanking delimiter run]" -- "and either (a) not part of a [right-flanking delimiter run]" -- "or (b) part of a [right-flanking delimiter run]" -- "preceded by a [Unicode punctuation character]." -- "7. A double `**` [can close strong emphasis](@)" +- "6." +- "A double `__`" +- " [can open strong emphasis] iff\n it is part of a [" +- "left-flanking delimiter run]\n and either (a) not part of a" +- "[right-flanking delimiter run]\n or (b) part of a [" +- "right-flanking delimiter run]\n preceded by a [Unicode punctuation character]." +- "7." +- "A double `**` [can close strong emphasis](@)" - "iff it is part of a [right-flanking delimiter run]." -- "8. A double `__` [can close strong emphasis] iff" -- "it is part of a [right-flanking delimiter run]" -- "and either (a) not part of a [left-flanking delimiter run]" -- "or (b) part of a [left-flanking delimiter run]" -- "followed by a [Unicode punctuation character]." -- "9." -- "Emphasis begins with a delimiter that [can open emphasis] and ends" -- "with a delimiter that [can close emphasis], and that uses the same" -- "character (`_` or `*`" +- "8." 
+- "A double `__`" +- " [can close strong emphasis] iff\n it is part of a [" +- "right-flanking delimiter run]\n and either (a) not part of a" +- "[left-flanking delimiter run]\n or (b) part of a [" +- "left-flanking delimiter run]\n followed by a [Unicode punctuation character]." +- "9. Emphasis begins with a delimiter that [can open emphasis]" +- " and ends\n with a delimiter that [can close emphasis]" +- ", and that uses the same\n character (`_` or `*`" - ) as the opening delimiter. The - "opening and closing delimiters must belong to separate\n [delimiter runs]" - ". If one of the delimiters can both" @@ -3331,10 +3352,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - delimiter runs containing the opening and closing delimiters - must not be a multiple of 3 unless both lengths are - multiples of 3. -- 10. Strong emphasis begins with a delimiter that -- " [can open strong emphasis] and ends with a delimiter that\n [" -- "can close strong emphasis], and that uses the same character\n (`_`" -- "or `*`" +- "10. Strong emphasis begins with a delimiter that\n [can open strong emphasis" +- "] and ends with a delimiter that\n [can close strong emphasis]" +- ", and that uses the same character\n (`_` or `*`" - ) as the opening delimiter. The - "opening and closing delimiters must belong to separate\n [delimiter runs]" - ". If one of the delimiters can both open" @@ -3342,10 +3362,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the delimiter runs containing the opening and closing - delimiters must not be a multiple of 3 unless both lengths - are multiples of 3. -- "11. A literal `*` character cannot occur at the beginning or end of" +- "11." +- "A literal `*` character cannot occur at the beginning or end of" - "`*`-delimited emphasis or `**`-delimited strong emphasis" - ", unless it\n is backslash-escaped." -- "12. A literal `_` character cannot occur at the beginning or end of" +- "12." 
+- "A literal `_` character cannot occur at the beginning or end of" - "`_`-delimited emphasis or `__`-delimited strong emphasis" - ", unless it\n is backslash-escaped." - "Where rules 1--12 above are compatible with multiple parsings," @@ -3353,8 +3375,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "13. The number of nestings should be minimized. Thus, for example," - "an interpretation `...` is always preferred to" - "`...`." -- "14. An interpretation `...` is always" -- "preferred to `...`." +- "14." +- "An interpretation `...`" +- " is always\n preferred to `...`" +- "." - "15. When two potential emphasis or strong emphasis spans overlap," - so that the second begins before the first ends and ends after - "the first ends, the first takes precedence. Thus, for example," @@ -3379,14 +3403,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - example - "*foo bar*\n.\n

    foo bar

    " - "````````````````````````````````" -- "This is not emphasis, because the opening `*` is followed by" -- "whitespace, and hence not part of a [left-flanking delimiter run]:" +- "This is not emphasis, because the opening `*`" +- " is followed by\nwhitespace, and hence not part of a [" +- "left-flanking delimiter run]:" - "````````````````````````````````" - "example\na * foo bar*\n.\n

    a * foo bar*

    " - "````````````````````````````````" -- "This is not emphasis, because the opening `*` is preceded" -- "by an alphanumeric and followed by punctuation, and hence\nnot part of a [" -- "left-flanking delimiter run]:" +- "This is not emphasis, because the opening `*`" +- " is preceded\nby an alphanumeric and followed by punctuation, and hence" +- "not part of a [left-flanking delimiter run]:" - "````````````````````````````````" - example - "a*\"foo\"*\n." @@ -3420,13 +3445,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - example - "_foo bar_\n.\n

    foo bar

    " - "````````````````````````````````" -- "This is not emphasis, because the opening `_` is followed by" -- "whitespace:" +- "This is not emphasis, because the opening `_`" +- " is followed by\nwhitespace:" - "````````````````````````````````" - "example\n_ foo bar_\n.\n

    _ foo bar_

    " - "````````````````````````````````" -- "This is not emphasis, because the opening `_` is preceded" -- "by an alphanumeric and followed by punctuation:" +- "This is not emphasis, because the opening `_`" +- " is preceded\nby an alphanumeric and followed by punctuation:" - "````````````````````````````````" - example - "a_\"foo\"_\n." @@ -3446,7 +3471,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "пристаням_стремятся_\n." - "

    пристаням_стремятся_

    " - "````````````````````````````````" -- "Here `_` does not generate emphasis, because the first delimiter run" +- "Here `_`" +- "does not generate emphasis, because the first delimiter run" - "is right-flanking and the second left-flanking:" - "````````````````````````````````" - example @@ -3467,8 +3493,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n_foo*\n.\n

    _foo*

    " - "````````````````````````````````" -- "This is not emphasis, because the closing `*` is preceded by" -- "whitespace:" +- "This is not emphasis, because the closing `*`" +- " is preceded by\nwhitespace:" - "````````````````````````````````" - "example\n*foo bar *\n.\n

    *foo bar *

    " - "````````````````````````````````" @@ -3476,8 +3502,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n*foo bar\n*\n.\n

    *foo bar\n*

    " - "````````````````````````````````" -- "This is not emphasis, because the second `*` is" -- preceded by punctuation and followed by an alphanumeric +- "This is not emphasis, because the second `*`" +- " is\npreceded by punctuation and followed by an alphanumeric" - "(hence it is not part of a [right-flanking delimiter run]:" - "````````````````````````````````" - "example\n*(*foo)\n.\n

    *(*foo)

    " @@ -3495,13 +3521,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foobar

    " - "````````````````````````````````" - "Rule 4:" -- "This is not emphasis, because the closing `_` is preceded by" -- "whitespace:" +- "This is not emphasis, because the closing `_`" +- " is preceded by\nwhitespace:" - "````````````````````````````````" - "example\n_foo bar _\n.\n

    _foo bar _

    " - "````````````````````````````````" -- "This is not emphasis, because the second `_` is" -- "preceded by punctuation and followed by an alphanumeric:" +- "This is not emphasis, because the second `_`" +- " is\npreceded by punctuation and followed by an alphanumeric:" - "````````````````````````````````" - "example\n_(_foo)\n.\n

    _(_foo)

    " - "````````````````````````````````" @@ -3542,9 +3568,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n** foo bar**\n.\n

    ** foo bar**

    " - "````````````````````````````````" -- "This is not strong emphasis, because the opening `**` is preceded" -- "by an alphanumeric and followed by punctuation, and hence\nnot part of a [" -- "left-flanking delimiter run]:" +- "This is not strong emphasis, because the opening `**`" +- " is preceded\nby an alphanumeric and followed by punctuation, and hence" +- "not part of a [left-flanking delimiter run]:" - "````````````````````````````````" - example - "a**\"foo\"**\n." @@ -3569,8 +3595,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n__\nfoo bar__\n.\n

    __\nfoo bar__

    " - "````````````````````````````````" -- "This is not strong emphasis, because the opening `__` is preceded" -- "by an alphanumeric and followed by punctuation:" +- "This is not strong emphasis, because the opening `__`" +- " is preceded\nby an alphanumeric and followed by punctuation:" - "````````````````````````````````" - example - "a__\"foo\"__\n." @@ -3611,8 +3637,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "(Nor can it be interpreted as an emphasized `*foo bar *`, because" - "of\nRule 11.)" -- "This is not strong emphasis, because the second `**` is" -- "preceded by punctuation and followed by an alphanumeric:" +- "This is not strong emphasis, because the second `**`" +- " is\npreceded by punctuation and followed by an alphanumeric:" - "````````````````````````````````" - "example\n**(**foo)\n.\n

    **(**foo)

    " - "````````````````````````````````" @@ -3651,8 +3677,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - example - "__foo bar __\n.\n

    __foo bar __

    " - "````````````````````````````````" -- "This is not strong emphasis, because the second `__` is" -- "preceded by punctuation and followed by an alphanumeric:" +- "This is not strong emphasis, because the second `__`" +- " is\npreceded by punctuation and followed by an alphanumeric:" - "````````````````````````````````" - "example\n__(__foo)\n.\n

    __(__foo)

    " - "````````````````````````````````" @@ -3767,8 +3793,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo**bar***\n." - "

    foobar

    " - "````````````````````````````````" -- When the lengths of the interior closing and opening -- delimiter runs are *both* +- "When the lengths of the interior closing and opening\ndelimiter runs are *both*" - " multiples of 3, though,\nthey can match to create emphasis:" - "````````````````````````````````" - example @@ -4114,14 +4139,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "There are two basic kinds of links in Markdown. In [inline links]" - " the\ndestination and title are given immediately after the link text. In" - "[reference links] the destination and title are defined elsewhere in\nthe document." -- "A [link text](@) consists of a sequence of zero or more" -- "inline elements enclosed by square brackets (`[` and `]`" -- "). The\nfollowing rules apply:" +- "A [link text](@)" +- " consists of a sequence of zero or more\ninline elements enclosed by square brackets (" +- "`[` and `]`). The\nfollowing rules apply:" - "- Links may not contain other links, at any level of nesting. If" - multiple otherwise valid link definitions appear nested inside each - "other, the inner-most definition is used." -- "- Brackets are allowed in the [link text] only if (a)" -- they +- "- Brackets are allowed in the [link text]" +- only if (a) they - are backslash-escaped or (b) they appear as a matched pair of - "brackets,\n with an open bracket `[`" - ", a sequence of zero or more inlines, and\n a close bracket" @@ -4131,33 +4156,38 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "than the brackets in link text. Thus, for example," - "`` [foo`]` `` could not be a link text, since the second" - "`]`\n is part of a code span." -- "- The brackets in link text bind more tightly than markers for" -- "[emphasis and strong emphasis]. Thus, for example," +- "- The brackets in link text bind more tightly than markers for\n [" +- "emphasis and strong emphasis]. Thus, for example," - "`*[foo*](url)` is a link." 
- "A [link destination](@) consists of either" -- "- a sequence of zero or more characters between an opening `<` and a" -- " closing `>` that contains no line endings or unescaped\n `<`" -- "or `>` characters, or" -- "- a nonempty sequence of characters that does not start with `<`," -- " does not include [ASCII control characters][ASCII control character]\n or [" -- "space] character, and includes parentheses only if (a) they are" +- "-" +- "a sequence of zero or more characters between an opening `<`" +- " and a\n closing `>` that contains no line endings or unescaped" +- "`<` or `>` characters, or" +- "-" +- "a nonempty sequence of characters that does not start with `<`" +- ",\n does not include [ASCII control characters][ASCII control character]\n or" +- "[space] character, and includes parentheses only if (a) they are" - backslash-escaped or (b) they are part of a balanced pair of - "unescaped parentheses.\n (Implementations may impose limits on parentheses nesting to" - "avoid performance issues, but at least three levels of nesting" - "should be supported.)\n\nA [link title](@) consists of either" -- "- a sequence of zero or more characters between straight double-quote" -- "characters (`\"`), including a `\"`" +- "-" +- "a sequence of zero or more characters between straight double-quote\n characters (`\"`" +- "), including a `\"`" - " character only if it is\n backslash-escaped, or" -- "- a sequence of zero or more characters between straight single-quote" -- "characters (`'`), including a `'`" +- "-" +- "a sequence of zero or more characters between straight single-quote\n characters (" +- "`'`), including a `'`" - " character only if it is\n backslash-escaped, or" -- "- a sequence of zero or more characters between matching parentheses" -- "(`(...)`), including a `(` or `)`" +- "-" +- "a sequence of zero or more characters between matching parentheses\n (`(...)`" +- "), including a `(` or `)`" - " character only if it is\n backslash-escaped." 
-- "Although [link titles] may span multiple lines, they may not contain" -- "a [blank line]." -- "An [inline link](@) consists of a [link text] followed immediately" -- "by a left parenthesis `(`" +- "Although [link titles] may span multiple lines, they may not contain\na" +- "[blank line]." +- "An [inline link](@)" +- " consists of a [link text] followed immediately\nby a left parenthesis `(`" - ", an optional [link destination], an optional\n[link title]" - ", and a right parenthesis `)`" - "." @@ -4165,8 +4195,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "ending.\nIf both [link destination] and [link title]" - "are present, they *must*" - " be\nseparated by spaces, tabs, and up to one line ending." -- "The link's text consists of the inlines contained" -- "in the [link text] (excluding the enclosing square brackets).\nThe link'" +- "The link's text consists of the inlines contained\nin the [link text" +- "] (excluding the enclosing square brackets).\nThe link'" - "s URI consists of the link destination, excluding enclosing\n`<...>`" - "if present, with backslash-escapes in effect as described" - "above. The link's title consists of the link title, excluding its" @@ -4359,7 +4389,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    link

    " - "````````````````````````````````" -- "(Note: `Markdown.pl` did allow double quotes inside a double-quoted" +- "(Note: `Markdown.pl`" +- did allow double quotes inside a double-quoted - "title, and its test suite included a test demonstrating this." - But it is hard to see a good rationale for the extra complexity this - "brings, since there are already many ways---backslash escaping," @@ -4456,8 +4487,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[foo *bar](baz*)\n." - "

    foo *bar

    " - "````````````````````````````````" -- "Note that brackets that *aren't* part of links do not take" -- "precedence:" +- "Note that brackets that *aren't*" +- " part of links do not take\nprecedence:" - "````````````````````````````````" - example - "*foo [bar* baz]\n." @@ -4506,16 +4537,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "matching reference link definitions, the one that comes first in the" - document is used. - (It is desirable in such cases to emit a warning.) -- "The link's URI and title are provided by the matching [link" -- "reference definition].\n\nHere is a simple example:" +- "The link's URI and title are provided by the matching [link\nreference definition" +- "].\n\nHere is a simple example:" - "````````````````````````````````" - example - "[foo][bar]\n\n[bar]: /url \"title\"\n." - "

    foo" - "````````````````````````````````" -- "The rules for the [link text] are the same as with" -- "[inline links]. Thus:" +- "The rules for the [link text] are the same as with\n[" +- "inline links]. Thus:" - "The link text may contain balanced brackets, but not unbalanced ones," - "unless they are escaped:" - "````````````````````````````````" @@ -4559,8 +4590,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    [foo bar baz]ref

    " - "````````````````````````````````" -- "(In the examples above, we have two [shortcut reference links]" -- "instead of one [full reference link].)" +- "(In the examples above, we have two [shortcut reference links]\ninstead of one" +- "[full reference link].)" - "The following cases illustrate the precedence of link text grouping over\nemphasis grouping:" - "````````````````````````````````" - example @@ -4636,9 +4667,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "link text and the link label, then in the following we will have" - "a single reference link, not two shortcut reference links, as\nintended:" - "``` markdown\n[foo]\n[bar]\n\n[foo]: /url1" -- "[bar]: /url2\n```" -- "(Note that [shortcut reference links] were introduced by Gruber" -- "himself in a beta version of `Markdown.pl`" +- "[bar]: /url2\n```\n\n(Note that [shortcut reference links]" +- " were introduced by Gruber\nhimself in a beta version of" +- "`Markdown.pl`" - ", but never included\nin the official syntax description. Without shortcut reference" - "links, it is harmless to allow space between the link text and" - "link label; but once shortcut references are introduced, it is" @@ -4651,8 +4682,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[bar][foo]\n." - "

    bar

    " - "````````````````````````````````" -- "Note that matching is performed on normalized strings, not parsed\ninline content." -- "So the following does not match, even though the" +- "Note that matching is performed on normalized strings, not parsed" +- "inline content. So the following does not match, even though the" - "labels define equivalent inline content:" - "````````````````````````````````" - example @@ -4839,7 +4870,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foobaz

    " - "````````````````````````````````" -- "Here `[foo]` is not parsed as a shortcut reference, because it" +- "Here `[foo]`" +- "is not parsed as a shortcut reference, because it" - "is followed by a link label (even though `[bar]` is not defined" - "):" - "````````````````````````````````" @@ -4849,8 +4881,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    [foo]bar

    " - "````````````````````````````````" - "## Images" -- "Syntax for images is like the syntax for links, with one\ndifference." -- "Instead of [link text], we have an\n[image description](@)" +- "Syntax for images is like the syntax for links, with one" +- "difference. Instead of [link text], we have an" +- "[image description](@)" - ". The rules for this are the\nsame as for [link text]" - ", except that (a) an\nimage description starts with `![`" - "rather than `[`" @@ -4992,8 +5025,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    \"Foo\" - "````````````````````````````````" -- "If you just want a literal `!" -- "` followed by bracketed text, you can\nbackslash-escape the opening" +- "If you just want a literal `!`" +- " followed by bracketed text, you can\nbackslash-escape the opening" - "`[`:" - "````````````````````````````````" - example @@ -5012,18 +5045,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`<` and `>`" - ". They are parsed as links, with the URL or email address" - as the link label. -- "A [URI autolink](@) consists of `<`, followed by an" -- "[absolute URI] followed by `>`" +- "A [URI autolink](@) consists of `<`" +- ", followed by an\n[absolute URI] followed by `>`" - ". It is parsed as" - "a link to the URI, with the URI as the link's label." -- "An [absolute URI](@)," -- "for these purposes, consists of a [scheme] followed by a colon (`:`" +- "An [absolute URI](@)" +- ",\nfor these purposes, consists of a [scheme] followed by a colon (" +- "`:`" - ")\nfollowed by zero or more characters other than [ASCII control\ncharacters][" - "ASCII control character], [space], `<`, and `>`" - ".\nIf the URI includes these characters, they must be percent-encoded" - "(e.g. `%20` for a space)." -- "For purposes of this spec, a [scheme](@) is any sequence" -- of 2--32 characters beginning with an ASCII letter and followed +- "For purposes of this spec, a [scheme](@)" +- " is any sequence\nof 2--" +- 32 characters beginning with an ASCII letter and followed - "by any combination of ASCII letters, digits, or the symbols plus\n(\"+\"" - "), period (\".\"), or hyphen (\"-\")." - "Here are some valid autolinks:" @@ -5095,16 +5130,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    https://example.com/\\[\\

    " - "````````````````````````````````" -- "An [email autolink](@)" -- "consists of `<`, followed by an [email address],\nfollowed by" -- "`>`" +- "An [email autolink](@)\nconsists of `<`" +- ", followed by an [email address],\nfollowed by `>`" - ". The link's label is the email address,\nand the URL is" - "`mailto:` followed by the email address." -- "An [email address](@),\nfor these purposes, is anything that matches" -- the -- "[non-normative regex from the HTML5" -- "spec](https://html.spec.whatwg.org/multipage/forms.html#e" -- "-mail-state-(type=email)):" +- "An [email address](@)" +- ",\nfor these purposes, is anything that matches\nthe" +- "[non-normative regex from the HTML5\nspec" +- "](https://html.spec.whatwg.org/multipage/forms.html#e-mail" +- "-state-(type=email)):" - "/^[a-zA-Z0-9.!#$%&'*+/=?" - "^_`{|}~-]+@[a-zA-Z0-9](?:" - "[a-zA-Z0-9-]{0,61}[a-zA-Z0" @@ -5164,16 +5198,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Tag and attribute names are not limited to current HTML tags," - "so custom tags (and even, say, DocBook tags) may be used" - ".\n\nHere is the grammar for tags:" -- "A [tag name](@) consists of an ASCII letter" +- "A [tag name](@)" +- consists of an ASCII letter - "followed by zero or more ASCII letters, digits, or" - "hyphens (`-`)." - "An [attribute](@) consists of spaces, tabs, and up to one" - "line ending,\nan [attribute name], and an optional\n[attribute value specification" - "]." -- "An [attribute name](@)" -- "consists of an ASCII letter, `_`, or `:`, followed by zero" -- "or more ASCII\nletters, digits, `_`, `.`, `:`, or" -- "`-`" +- "An [attribute name](@)\nconsists of an ASCII letter, `_`" +- ", or `:`, followed by zero or more ASCII\nletters, digits," +- "`_`, `.`, `:`, or `-`" - ". (Note: This is the XML" - specification restricted to ASCII. HTML5 is laxer.) 
- "An [attribute value specification](@)" @@ -5188,32 +5222,31 @@ input_file: tests/inputs/markdown/commonmark_spec.md - is a nonempty string of characters not - "including spaces, tabs, line endings, `\"`, `'`, `=`, `<" - "`, `>`, or `` ` ``." -- "A [single-quoted attribute value](@)" -- "consists of `'`, zero or more\ncharacters not including `'`" -- ", and a final `'`." -- "A [double-quoted attribute value](@)" -- "consists of `\"`, zero or more\ncharacters not including `\"`" -- ", and a final `\"`." +- "A [single-quoted attribute value](@)\nconsists of `'`" +- ", zero or more\ncharacters not including `'`, and a final `'`." +- "A [double-quoted attribute value](@)\nconsists of `\"`" +- ", zero or more\ncharacters not including `\"`, and a final `\"`." - "An [open tag](@) consists of a `<` character, a [" - "tag name],\nzero or more [attributes]" - ", optional spaces, tabs, and up to one line ending,\nan optional" - "`/` character, and a `>` character." -- "A [closing tag](@) consists of the string ``." +- "A [closing tag](@) consists of the string ``." - "An [HTML comment](@) consists of ``, `" - "`, or ``, and `-->` (see the" - "[HTML spec](https://html.spec.whatwg.org/multipage/" - "parsing.html#markup-declaration-open-state))." -- "A [processing instruction](@)\nconsists of the string ``, and the string" +- "A [processing instruction](@)\nconsists of the string ``, and the string" - "`?>`." -- "A [declaration](@) consists of the string ``" +- "A [declaration](@) consists of the string ``" - ", and the character `>`." -- "A [CDATA section](@) consists of\nthe string ``" +- "A [CDATA section](@) consists of\nthe string ``" - ", and the string `]]>`." - "An [HTML tag](@) consists of an [open tag], a [" - "closing tag],\nan [HTML comment], a [processing instruction], a [declaration" @@ -5348,8 +5381,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\nfoo \nbaz\n.\n

    foo
    \nbaz

    " - "````````````````````````````````" -- "For a more visible alternative, a backslash before the" -- "[line ending] may be used instead of two or more spaces:" +- "For a more visible alternative, a backslash before the\n[line ending]" +- "may be used instead of two or more spaces:" - "````````````````````````````````" - "example\nfoo\\\nbaz\n.\n

    foo
    \nbaz

    " - "````````````````````````````````" @@ -5487,8 +5520,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Each line that is processed has an effect on this tree. The line is - "analyzed and, depending on its contents, the document may be altered" - "in one or more of the following ways:" -- "1. One or more open blocks may be closed.\n2." -- One or more new blocks may be created as children of the +- 1. One or more open blocks may be closed. +- 2. One or more new blocks may be created as children of the - last open block. - 3. Text may be added to the last (deepest) open block remaining - on the tree. @@ -5527,34 +5560,36 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "At the outset, our document model is just" - "``` tree\n-> document\n```\n\nThe first line of our text," - "``` markdown\n> Lorem ipsum dolor\n```" -- "causes a `block_quote` block to be created as a child of our" -- "open `document` block, and a `paragraph`" -- " block as a child of\nthe `block_quote`" +- "causes a `block_quote`" +- " block to be created as a child of our\nopen `document`" +- " block, and a `paragraph` block as a child of\nthe" +- "`block_quote`" - ". Then the text is added to the last open\nblock, the" - "`paragraph`:" - "``` tree\n-> document\n -> block_quote\n -> paragraph" - " \"Lorem ipsum dolor\"\n```\n\nThe next line," - "``` markdown\nsit amet.\n```" -- "is a \"lazy continuation\" of the open `paragraph`, so it gets added" -- "to the paragraph's text:" +- "is a \"lazy continuation\" of the open `paragraph`" +- ", so it gets added\nto the paragraph's text:" - "``` tree\n-> document\n -> block_quote\n -> paragraph" - " \"Lorem ipsum dolor\\nsit amet.\"\n```\n\nThe third line," - "``` markdown\n> - Qui *quodsi iracundia*\n```" -- "causes the `paragraph` block to be closed, and a new `list" -- "` block\nopened as a child of the `block_quote`. 
A" -- "`list_item` is also\nadded as a child of the `list`" -- ", and a `paragraph` as a child of\nthe `list_item`" -- ". The text is then added to the new `paragraph`:" +- "causes the `paragraph` block to be closed, and a new" +- "`list` block\nopened as a child of the `block_quote`" +- ". A `list_item` is also\nadded as a child of the" +- "`list`, and a `paragraph` as a child of\nthe" +- "`list_item`. The text is then added to the new `paragraph`:" - "``` tree\n-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - "-> list (type=bullet tight=true bullet_char=-)" - " -> list_item\n -> paragraph" - " \"Qui *quodsi iracundia*\"\n```" - "The fourth line,\n\n``` markdown\n> - aliquando id\n```" -- "causes the `list_item` (and its child the `paragraph`) to" -- "be closed,\nand a new `list_item` opened up as child of the" -- "`list`. A `paragraph`\nis added as a child of the new" -- "`list_item`, to contain the text.\nWe thus obtain the final tree:" +- "causes the `list_item` (and its child the `paragraph`" +- ") to be closed,\nand a new `list_item`" +- "opened up as child of the `list`. A `paragraph`" +- "is added as a child of the new `list_item`" +- ", to contain the text.\nWe thus obtain the final tree:" - "``` tree\n-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - " -> list (type=bullet tight=true bullet_char=-)\n list_item" @@ -5573,10 +5608,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " list (type=bullet tight=true bullet_char=-)\n list_item" - " paragraph\n str \"Qui \"\n emph" - " str \"quodsi iracundia\"\n list_item" -- " paragraph\n str \"aliquando id\"\n```" -- "Notice how the [line ending] in the first paragraph has" -- "been parsed as a `softbreak`, and the asterisks in the first list" -- "item\nhave become an `emph`." 
+- " paragraph\n str \"aliquando id\"\n```\n\nNotice how the" +- "[line ending] in the first paragraph has\nbeen parsed as a" +- "`softbreak`" +- ", and the asterisks in the first list item\nhave become an" +- "`emph`." - "### An algorithm for parsing nested emphasis and links" - "By far the trickiest part of inline parsing is handling emphasis," - "strong emphasis, links, and images. This is done using the following" @@ -5616,29 +5652,34 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "* We run *process emphasis* on these inlines, with the `[`" - "opener\n as `stack_bottom`." - "* We remove the opening delimiter." -- "* If we have a link (and not an image), we also set all" +- "*" +- "If we have a link (and not an image), we also set all" - "`[` delimiters before the opening delimiter to *inactive*" - ". (This\n will prevent us from getting links within links.)" - "#### *process emphasis*" -- "Parameter `stack_bottom` sets a lower bound to how far we" -- "descend in the [delimiter stack]. If it is NULL, we can" +- "Parameter `stack_bottom`" +- " sets a lower bound to how far we\ndescend in the [delimiter stack" +- "]. If it is NULL, we can" - "go all the way to the bottom. Otherwise, we stop before" - "visiting `stack_bottom`." -- "Let `current_position` point to the element on the [delimiter stack]" -- "just above `stack_bottom` (or the first element if `stack_bottom`" -- is NULL). -- "We keep track of the `openers_bottom` for each delimiter" -- "type (`*`, `_`" +- "Let `current_position`" +- " point to the element on the [delimiter stack]\njust above `stack_bottom`" +- " (or the first element if `stack_bottom`\nis NULL)." +- "We keep track of the `openers_bottom` for each delimiter\ntype (" +- "`*`, `_`" - "), indexed to the length of the closing delimiter run" - (modulo 3) and to whether the closing delimiter can also be an - "opener. Initialize this to `stack_bottom`." 
- "Then we repeat the following until we run out of potential\nclosers:" -- "- Move `current_position` forward in the delimiter stack (if needed)" +- "-" +- "Move `current_position`" +- forward in the delimiter stack (if needed) - "until we find the first potential closer with delimiter `*` or `_`" - ".\n (This will be the potential closer closest" - to the beginning of the input -- the first one in parse order.) -- "- Now, look back in the stack (staying above `stack_bottom`" -- "and\n the `openers_bottom`" +- "-" +- "Now, look back in the stack (staying above `stack_bottom`" +- " and\n the `openers_bottom`" - " for this delimiter type) for the\n first matching potential opener (\"matching\"" - " means same delimiter).\n\n- If one is found:" - "+ Figure out whether we have emphasis or strong emphasis:" @@ -5648,18 +5689,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the text node corresponding to the opener. - + Remove any delimiters between the opener and closer from - the delimiter stack. -- + Remove 1 (for regular emph) or 2 (for strong emph -- ) delimiters +- + +- Remove 1 (for regular emph) or 2 (for strong emph) +- delimiters - from the opening and closing text nodes. If they become empty - "as a result, remove them and remove the corresponding element" - "of the delimiter stack. If the closing node is removed, reset" - "`current_position` to the next element in the stack." - "- If none is found:" -- "+ Set `openers_bottom` to the element before `current_position`." +- + +- "Set `openers_bottom` to the element before `current_position`" +- "." - (We know that there are no openers for this kind of closer up to - and - "including this point, so this puts a lower bound on future searches.)" -- "+ If the closer at `current_position` is not a potential opener," +- + +- "If the closer at `current_position`" +- "is not a potential opener," - "remove it from the delimiter stack (since we know it can't" - be a closer either). 
- "+ Advance `current_position` to the next element in the stack." diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap index 3d9ca03..bbc7bae 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap @@ -163,7 +163,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "[id]: https://octodex.github.com/images/dojocat.jpg" - "\"The Dojocat\"\n```" - "Here's our logo (hover to see the title text):" -- "Inline-style:\n![" +- "Inline-style:" +- "![" - "alt text](https://github.com/adam-p/markdown-here/raw/master" - "/src/common/images/icon48.png \"Logo Title Text 1\")" - "Reference-style:\n![alt text][logo]" @@ -185,8 +186,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "Inline footnote^[Text of inline footnote] definition." - "Duplicated footnote reference[^second]." - "[^first]: Footnote **can have markup**\n\n and multiple paragraphs." -- "[^second]: Footnote text.\n```" -- "Footnote 1 link[^first].\n\nFootnote 2 link[^second]." +- "[^second]: Footnote text.\n```\n\nFootnote 1 link" +- "[^first].\n\nFootnote 2 link[^second]." - "Inline footnote^[Text of inline footnote] definition." - "Duplicated footnote reference[^second]." - "[^first]: Footnote **can have markup**\n\n and multiple paragraphs." @@ -306,18 +307,18 @@ input_file: tests/inputs/markdown/github_flavored.md - "> Blockquotes can also be nested..." - ">> ...by using additional greater-than signs right next to each other..." - "> > > ...or with spaces between arrows.\n```" -- "> Blockquotes are very handy in email to emulate reply text." -- "> This line is part of the same quote.\n\nQuote break." 
+- "> Blockquotes are very handy in email to emulate reply text.\n>" +- "This line is part of the same quote.\n\nQuote break." - "> This is a very long line that will still be quoted properly when it wraps" - ". Oh boy let'" - s keep writing to make sure this is long enough to actually wrap for everyone. - "Oh, you can *put* **Markdown** into a blockquote." -- "> Blockquotes can also be nested..." -- ">" -- "> ...by using additional greater-than signs right next to each other..." -- "> > > ...or with spaces between arrows.\n\n------" +- "> Blockquotes can also be nested...\n>" +- "> ...by using additional greater-than signs right next to each other...\n> >" +- "> ...or with spaces between arrows.\n\n------" - "# Inline HTML" -- "```\n
    \n
    Definition list
    " +- "```" +- "
    \n
    Definition list
    " - "
    Is something people use sometimes.
    " - "
    Markdown in HTML
    " - "
    Does *not* work **very** well." @@ -328,11 +329,13 @@ input_file: tests/inputs/markdown/github_flavored.md - "
    Does *not* work **very** well." - "Use HTML tags.
    \n
    \n\n------" - "# Horizontal Rules" -- "```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks" -- "___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens" -- "***\n\nAsterisks\n\n___\n\nUnderscores\n\n------" +- "```" +- "Three or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___" +- "Underscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***" +- "Asterisks\n\n___\n\nUnderscores\n\n------" - "# YouTube Videos" -- "```\n" - "\"IMAGE\n" -- "```\n[![" +- "```" +- "[![" - "IMAGE ALT TEXT HERE](http://img.youtube.com/vi/" - "YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com" - "/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```" -- "[![" -- "IMAGE ALT TEXT HERE](https://upload.wikimedia.org/wikipedia/commons/thumb/e" -- /ef/YouTube_logo_2015.svg/1200px- -- "YouTube_logo_2015.svg.png)](https://www.youtube.com/watch?" +- "[![IMAGE ALT TEXT HERE" +- "](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/" +- YouTube_logo_2015.svg/1200px-YouTube_logo_2015 +- ".svg.png)](https://www.youtube.com/watch?" - v=ciawICBvQoE) From 21e0dd960419f4792c6b24889f88775b33a929d8 Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Mon, 25 Mar 2024 19:57:55 +0100 Subject: [PATCH 6/6] Remove Text level from markdown levels By having inline elements at a higher level than text, it caused for some strange breakpoints. While an inline element can have a text element inside it, this should then be skipped if necessary. It still allows inline elements to be kept together, but also allow for smaller text elements to still get pulled in. This also makes sure that higher semantic levels get preferred if they are shorter in length than a lower level, to avoid the algorithm stopping sooner for a lower level, when a higher level could fit. This also adds an optimization to drop ranges from the caches if we have already moved past them, since these ranges get iterated over quite frequently. 
--- benches/output.txt | 428 +- src/lib.rs | 26 +- src/markdown.rs | 148 +- src/text.rs | 5 + ...ggingface_markdown@commonmark_spec.md.snap | 1648 +++---- ...ingface_markdown@github_flavored.md-2.snap | 23 +- ...ggingface_markdown@github_flavored.md.snap | 145 +- ...face_markdown_trim@commonmark_spec.md.snap | 1362 +++--- ...ce_markdown_trim@github_flavored.md-2.snap | 23 +- ...face_markdown_trim@github_flavored.md.snap | 131 +- ...pshots__markdown@commonmark_spec.md-2.snap | 414 +- ...napshots__markdown@commonmark_spec.md.snap | 4012 +++++++++-------- ...pshots__markdown@github_flavored.md-2.snap | 49 +- ...napshots__markdown@github_flavored.md.snap | 310 +- ...s__markdown_trim@commonmark_spec.md-2.snap | 385 +- ...ots__markdown_trim@commonmark_spec.md.snap | 2931 ++++++------ ...s__markdown_trim@github_flavored.md-2.snap | 48 +- ...ots__markdown_trim@github_flavored.md.snap | 235 +- ...tiktoken_default@room_with_a_view.txt.snap | 20 +- ..._tiktoken_markdown@commonmark_spec.md.snap | 1516 ++++--- ...iktoken_markdown@github_flavored.md-2.snap | 9 +- ..._tiktoken_markdown@github_flavored.md.snap | 126 +- ...oken_markdown_trim@commonmark_spec.md.snap | 1258 +++--- ...en_markdown_trim@github_flavored.md-2.snap | 11 +- ...oken_markdown_trim@github_flavored.md.snap | 107 +- 25 files changed, 8123 insertions(+), 7247 deletions(-) diff --git a/benches/output.txt b/benches/output.txt index 42e76d3..178813a 100644 --- a/benches/output.txt +++ b/benches/output.txt @@ -7,38 +7,38 @@ chunk_size fastest │ slowest │ median ├─ markdown │ │ │ │ │ │ ├─ characters │ │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 324 ms │ 336.2 ms │ 331.4 ms │ 331.4 ms │ 100 │ 100 -│ │ │ 632.6 KB/s │ 609.7 KB/s │ 618.4 KB/s │ 618.6 KB/s │ │ +│ │ │ ╰─ commonmark_spec 297.3 ms │ 322.7 ms │ 300.7 ms │ 301.3 ms │ 100 │ 100 +│ │ │ 689.6 KB/s │ 635.1 KB/s │ 681.7 KB/s │ 680.4 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 0 │ 13479 │ 13479 │ 13344 │ │ -│ │ │ 0 B │ 77.19 MB │ 77.19 MB │ 76.42 MB │ │ +│ │ │ 
14088 │ 0 │ 14088 │ 13947 │ │ +│ │ │ 80.3 MB │ 0 B │ 80.3 MB │ 79.5 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 0 │ 13480 │ 13480 │ 13345 │ │ -│ │ │ 0 B │ 285.8 MB │ 285.8 MB │ 283 MB │ │ +│ │ │ 14089 │ 0 │ 14089 │ 13948 │ │ +│ │ │ 297.7 MB │ 0 B │ 297.7 MB │ 294.7 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 0 │ 45433 │ 45433 │ 44978 │ │ -│ │ │ 0 B │ 208.4 MB │ 208.4 MB │ 206.3 MB │ │ +│ │ │ 47478 │ 0 │ 47478 │ 47003 │ │ +│ │ │ 217.2 MB │ 0 B │ 217.2 MB │ 215 MB │ │ │ │ │ shrink: │ │ │ │ │ -│ │ │ 0 │ 13 │ 13 │ 12.87 │ │ -│ │ │ 0 B │ 94 B │ 94 B │ 93.06 B │ │ +│ │ │ 13 │ 0 │ 13 │ 12.87 │ │ +│ │ │ 94 B │ 0 B │ 94 B │ 93.06 B │ │ │ │ ├─ 512 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 47.13 ms │ 47.96 ms │ 47.33 ms │ 47.35 ms │ 100 │ 100 -│ │ │ 4.349 MB/s │ 4.274 MB/s │ 4.331 MB/s │ 4.329 MB/s │ │ +│ │ │ ╰─ commonmark_spec 37.85 ms │ 39.52 ms │ 38.56 ms │ 38.65 ms │ 100 │ 100 +│ │ │ 5.416 MB/s │ 5.187 MB/s │ 5.316 MB/s │ 5.304 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 1617 │ 1617 │ 1617 │ 1617 │ │ -│ │ │ 9.47 MB │ 9.47 MB │ 9.47 MB │ 9.47 MB │ │ +│ │ │ 1614 │ 1614 │ 1614 │ 1614 │ │ +│ │ │ 9.472 MB │ 9.472 MB │ 9.472 MB │ 9.472 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 1618 │ 1618 │ 1618 │ 1618 │ │ -│ │ │ 35.45 MB │ 35.45 MB │ 35.45 MB │ 35.45 MB │ │ +│ │ │ 1615 │ 1615 │ 1615 │ 1615 │ │ +│ │ │ 35.51 MB │ 35.51 MB │ 35.51 MB │ 35.51 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 5391 │ 5391 │ 5391 │ 5391 │ │ -│ │ │ 25.77 MB │ 25.77 MB │ 25.77 MB │ 25.77 MB │ │ +│ │ │ 5382 │ 5382 │ 5382 │ 5382 │ │ +│ │ │ 25.84 MB │ 25.84 MB │ 25.84 MB │ 25.84 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 4096 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 8.49 ms │ 8.972 ms │ 8.594 ms │ 8.595 ms │ 100 │ 100 -│ │ │ 24.14 MB/s │ 22.85 MB/s │ 23.85 MB/s │ 23.85 MB/s │ │ +│ │ │ ╰─ commonmark_spec 6.729 ms │ 7.102 ms │ 6.82 ms │ 6.837 ms │ 100 │ 100 +│ │ │ 30.46 MB/s │ 28.86 MB/s │ 30.06 MB/s │ 29.98 MB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 260 │ 260 │ 260 │ 260 │ │ │ │ │ 1.663 MB │ 1.663 
MB │ 1.663 MB │ 1.663 MB │ │ @@ -52,8 +52,8 @@ chunk_size fastest │ slowest │ median │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ╰─ 32768 │ │ │ │ │ -│ │ ╰─ commonmark_spec 2.008 ms │ 2.2 ms │ 2.082 ms │ 2.082 ms │ 100 │ 100 -│ │ 102 MB/s │ 93.16 MB/s │ 98.47 MB/s │ 98.47 MB/s │ │ +│ │ ╰─ commonmark_spec 1.716 ms │ 1.931 ms │ 1.772 ms │ 1.774 ms │ 100 │ 100 +│ │ 119.4 MB/s │ 106.1 MB/s │ 115.6 MB/s │ 115.5 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 65 │ 65 │ 65 │ 65 │ │ │ │ 528.4 KB │ 528.4 KB │ 528.4 KB │ 528.4 KB │ │ @@ -68,122 +68,122 @@ chunk_size fastest │ slowest │ median │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ tiktoken │ │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 866 ms │ 956.2 ms │ 872.7 ms │ 874.2 ms │ 100 │ 100 -│ │ │ 236.7 KB/s │ 214.3 KB/s │ 234.9 KB/s │ 234.5 KB/s │ │ +│ │ │ ╰─ commonmark_spec 838.2 ms │ 863.3 ms │ 844.7 ms │ 847.4 ms │ 100 │ 100 +│ │ │ 244.5 KB/s │ 237.4 KB/s │ 242.6 KB/s │ 241.9 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 8147015 │ 8147015 │ 8147015 │ 8147015 │ │ -│ │ │ 397 MB │ 397 MB │ 397 MB │ 397 MB │ │ +│ │ │ 8080680 │ 8080680 │ 8080680 │ 8080680 │ │ +│ │ │ 394.7 MB │ 394.7 MB │ 394.7 MB │ 394.7 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 8458523 │ 8458523 │ 8458523 │ 8458523 │ │ -│ │ │ 750.9 MB │ 750.9 MB │ 750.9 MB │ 750.9 MB │ │ +│ │ │ 8392188 │ 8392188 │ 8392188 │ 8392188 │ │ +│ │ │ 748.8 MB │ 748.8 MB │ 748.8 MB │ 748.8 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 1478494 │ 1478494 │ 1478494 │ 1478494 │ │ -│ │ │ 335.3 MB │ 335.3 MB │ 335.3 MB │ 335.3 MB │ │ +│ │ │ 1466095 │ 1466095 │ 1466095 │ 1466095 │ │ +│ │ │ 335.6 MB │ 335.6 MB │ 335.6 MB │ 335.6 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 512 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 289.4 ms │ 295 ms │ 290.6 ms │ 291.1 ms │ 100 │ 100 -│ │ │ 708.4 KB/s │ 694.7 KB/s │ 705.2 KB/s │ 704.1 KB/s │ │ +│ │ │ ╰─ commonmark_spec 287.9 ms │ 291 ms │ 288.6 ms │ 288.8 ms │ 100 │ 100 +│ │ │ 711.9 KB/s │ 704.3 KB/s │ 710.1 
KB/s │ 709.8 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 2960718 │ 2960718 │ 2960718 │ 2960718 │ │ -│ │ │ 138.8 MB │ 138.8 MB │ 138.8 MB │ 138.8 MB │ │ +│ │ │ 2956753 │ 2956753 │ 2956753 │ 2956753 │ │ +│ │ │ 138.6 MB │ 138.6 MB │ 138.6 MB │ 138.6 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 3272226 │ 3272226 │ 3272226 │ 3272226 │ │ -│ │ │ 261.6 MB │ 261.6 MB │ 261.6 MB │ 261.6 MB │ │ +│ │ │ 3268261 │ 3268261 │ 3268261 │ 3268261 │ │ +│ │ │ 261.3 MB │ 261.3 MB │ 261.3 MB │ 261.3 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 552129 │ 552129 │ 552129 │ 552129 │ │ -│ │ │ 104.2 MB │ 104.2 MB │ 104.2 MB │ 104.2 MB │ │ +│ │ │ 551196 │ 551196 │ 551196 │ 551196 │ │ +│ │ │ 104.1 MB │ 104.1 MB │ 104.1 MB │ 104.1 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 4096 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 156.7 ms │ 159.7 ms │ 157.4 ms │ 157.4 ms │ 100 │ 100 -│ │ │ 1.308 MB/s │ 1.283 MB/s │ 1.302 MB/s │ 1.302 MB/s │ │ +│ │ │ ╰─ commonmark_spec 157 ms │ 159.7 ms │ 157.5 ms │ 157.5 ms │ 100 │ 100 +│ │ │ 1.305 MB/s │ 1.283 MB/s │ 1.301 MB/s │ 1.301 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 1652427 │ 1652427 │ 1652427 │ 1652427 │ │ -│ │ │ 76.53 MB │ 76.53 MB │ 76.53 MB │ 76.53 MB │ │ +│ │ │ 1651804 │ 1651804 │ 1651804 │ 1651804 │ │ +│ │ │ 76.5 MB │ 76.5 MB │ 76.5 MB │ 76.5 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 1963935 │ 1963935 │ 1963935 │ 1963935 │ │ +│ │ │ 1963312 │ 1963312 │ 1963312 │ 1963312 │ │ │ │ │ 150.6 MB │ 150.6 MB │ 150.6 MB │ 150.6 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 308404 │ 308404 │ 308404 │ 308404 │ │ -│ │ │ 55.63 MB │ 55.63 MB │ 55.63 MB │ 55.63 MB │ │ +│ │ │ 308264 │ 308264 │ 308264 │ 308264 │ │ +│ │ │ 55.62 MB │ 55.62 MB │ 55.62 MB │ 55.62 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ╰─ 32768 │ │ │ │ │ -│ │ ╰─ commonmark_spec 72.69 ms │ 74.22 ms │ 73.04 ms │ 73.14 ms │ 100 │ 100 -│ │ 2.82 MB/s │ 2.762 MB/s │ 2.807 MB/s │ 2.802 MB/s │ │ +│ │ ╰─ commonmark_spec 73.1 ms │ 74.41 ms │ 73.83 
ms │ 73.77 ms │ 100 │ 100 +│ │ 2.804 MB/s │ 2.755 MB/s │ 2.776 MB/s │ 2.779 MB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 750087 │ 750087 │ 750087 │ 750087 │ │ +│ │ 750031 │ 750031 │ 750031 │ 750031 │ │ │ │ 34.96 MB │ 34.96 MB │ 34.96 MB │ 34.96 MB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 1061595 │ 1061595 │ 1061595 │ 1061595 │ │ +│ │ 1061539 │ 1061539 │ 1061539 │ 1061539 │ │ │ │ 78.87 MB │ 78.87 MB │ 78.87 MB │ 78.87 MB │ │ │ │ grow: │ │ │ │ │ -│ │ 141696 │ 141696 │ 141696 │ 141696 │ │ +│ │ 141689 │ 141689 │ 141689 │ 141689 │ │ │ │ 25.39 MB │ 25.39 MB │ 25.39 MB │ 25.39 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ╰─ tokenizers │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ ╰─ commonmark_spec 1.367 s │ 1.415 s │ 1.379 s │ 1.381 s │ 100 │ 100 -│ │ 149.8 KB/s │ 144.8 KB/s │ 148.6 KB/s │ 148.4 KB/s │ │ +│ │ ╰─ commonmark_spec 1.467 s │ 1.537 s │ 1.48 s │ 1.482 s │ 100 │ 100 +│ │ 139.6 KB/s │ 133.3 KB/s │ 138.5 KB/s │ 138.2 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 32934478 │ 32934478 │ 32934478 │ 32934478 │ │ -│ │ 3.172 GB │ 3.172 GB │ 3.172 GB │ 3.172 GB │ │ +│ │ 36199855 │ 36199855 │ 36199855 │ 36199855 │ │ +│ │ 3.467 GB │ 3.467 GB │ 3.467 GB │ 3.467 GB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 32992531 │ 32992531 │ 32992531 │ 32992531 │ │ -│ │ 4.959 GB │ 4.959 GB │ 4.959 GB │ 4.959 GB │ │ +│ │ 36257908 │ 36257908 │ 36257908 │ 36257908 │ │ +│ │ 5.442 GB │ 5.442 GB │ 5.442 GB │ 5.442 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 1210155 │ 1210155 │ 1210155 │ 1210155 │ │ -│ │ 1.782 GB │ 1.782 GB │ 1.782 GB │ 1.782 GB │ │ +│ │ 1259076 │ 1259076 │ 1259076 │ 1259076 │ │ +│ │ 1.969 GB │ 1.969 GB │ 1.969 GB │ 1.969 GB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ 512 │ │ │ │ │ -│ │ ╰─ commonmark_spec 621.5 ms │ 657.1 ms │ 630 ms │ 630.5 ms │ 100 │ 100 -│ │ 329.8 KB/s │ 311.9 KB/s │ 325.3 KB/s │ 325.1 KB/s │ │ +│ │ ╰─ commonmark_spec 615.1 ms │ 702 ms │ 623.1 ms │ 630.2 ms │ 100 │ 100 +│ │ 333.2 KB/s │ 292 KB/s │ 329 KB/s │ 325.3 KB/s │ │ │ │ 
alloc: │ │ │ │ │ -│ │ 16110533 │ 16110533 │ 16110533 │ 16110533 │ │ -│ │ 1.581 GB │ 1.581 GB │ 1.581 GB │ 1.581 GB │ │ +│ │ 16094165 │ 16094165 │ 16094165 │ 16094165 │ │ +│ │ 1.579 GB │ 1.579 GB │ 1.579 GB │ 1.579 GB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 16168586 │ 16168586 │ 16168586 │ 16168586 │ │ -│ │ 2.442 GB │ 2.442 GB │ 2.442 GB │ 2.442 GB │ │ +│ │ 16152218 │ 16152218 │ 16152218 │ 16152218 │ │ +│ │ 2.44 GB │ 2.44 GB │ 2.44 GB │ 2.44 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 401309 │ 401309 │ 401309 │ 401309 │ │ -│ │ 856 MB │ 856 MB │ 856 MB │ 856 MB │ │ +│ │ 398917 │ 398917 │ 398917 │ 398917 │ │ +│ │ 855.5 MB │ 855.5 MB │ 855.5 MB │ 855.5 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ 4096 │ │ │ │ │ -│ │ ╰─ commonmark_spec 324 ms │ 329.6 ms │ 325.8 ms │ 326.3 ms │ 100 │ 100 -│ │ 632.7 KB/s │ 622 KB/s │ 629.1 KB/s │ 628.3 KB/s │ │ +│ │ ╰─ commonmark_spec 320.8 ms │ 329.6 ms │ 322.5 ms │ 323.9 ms │ 100 │ 100 +│ │ 639 KB/s │ 621.9 KB/s │ 635.6 KB/s │ 632.7 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 8494397 │ 8494397 │ 8494397 │ 8494397 │ │ -│ │ 843.6 MB │ 843.6 MB │ 843.6 MB │ 843.6 MB │ │ +│ │ 8490533 │ 8490533 │ 8490533 │ 8490533 │ │ +│ │ 843.2 MB │ 843.2 MB │ 843.2 MB │ 843.2 MB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 8552450 │ 8552450 │ 8552450 │ 8552450 │ │ +│ │ 8548586 │ 8548586 │ 8548586 │ 8548586 │ │ │ │ 1.292 GB │ 1.292 GB │ 1.292 GB │ 1.292 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 168534 │ 168534 │ 168534 │ 168534 │ │ -│ │ 444.3 MB │ 444.3 MB │ 444.3 MB │ 444.3 MB │ │ +│ │ 167879 │ 167879 │ 167879 │ 167879 │ │ +│ │ 444.2 MB │ 444.2 MB │ 444.2 MB │ 444.2 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ╰─ 32768 │ │ │ │ │ -│ ╰─ commonmark_spec 177.4 ms │ 186.2 ms │ 181.3 ms │ 180.9 ms │ 100 │ 100 -│ 1.155 MB/s │ 1.1 MB/s │ 1.13 MB/s │ 1.132 MB/s │ │ +│ ╰─ commonmark_spec 176.3 ms │ 183.5 ms │ 177.6 ms │ 179 ms │ 100 │ 100 +│ 1.162 MB/s │ 1.116 MB/s │ 1.154 MB/s │ 1.145 MB/s │ │ │ alloc: │ │ │ │ │ -│ 4580000 
│ 4580000 │ 4580000 │ 4580000 │ │ -│ 460.3 MB │ 460.3 MB │ 460.3 MB │ 460.3 MB │ │ +│ 4579500 │ 4579500 │ 4579500 │ 4579500 │ │ +│ 460.2 MB │ 460.2 MB │ 460.2 MB │ 460.2 MB │ │ │ dealloc: │ │ │ │ │ -│ 4638053 │ 4638053 │ 4638053 │ 4638053 │ │ -│ 698.6 MB │ 698.6 MB │ 698.6 MB │ 698.6 MB │ │ +│ 4637553 │ 4637553 │ 4637553 │ 4637553 │ │ +│ 698.5 MB │ 698.5 MB │ 698.5 MB │ 698.5 MB │ │ │ grow: │ │ │ │ │ -│ 79598 │ 79598 │ 79598 │ 79598 │ │ +│ 79534 │ 79534 │ 79534 │ 79534 │ │ │ 233.4 MB │ 233.4 MB │ 233.4 MB │ 233.4 MB │ │ │ shrink: │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ @@ -191,22 +191,22 @@ chunk_size fastest │ slowest │ median ╰─ text │ │ │ │ │ ├─ characters │ │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 222.8 ms │ 226 ms │ 223.2 ms │ 223.4 ms │ 100 │ 100 - │ │ │ 734.3 KB/s │ 723.7 KB/s │ 732.7 KB/s │ 732.3 KB/s │ │ + │ │ ├─ romeo_and_juliet 195.7 ms │ 197.7 ms │ 196.2 ms │ 196.3 ms │ 100 │ 100 + │ │ │ 835.9 KB/s │ 827.4 KB/s │ 833.7 KB/s │ 833.4 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 11187 │ 11187 │ 11187 │ 11189 │ │ - │ │ │ 32.32 MB │ 32.32 MB │ 32.32 MB │ 32.32 MB │ │ + │ │ │ 11187 │ 11463 │ 11187 │ 11189 │ │ + │ │ │ 32.32 MB │ 32.34 MB │ 32.32 MB │ 32.32 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 11188 │ 11188 │ 11188 │ 11189 │ │ + │ │ │ 11188 │ 11384 │ 11188 │ 11189 │ │ │ │ │ 121.8 MB │ 121.8 MB │ 121.8 MB │ 121.8 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 33447 │ 33447 │ 33447 │ 33447 │ │ - │ │ │ 89.36 MB │ 89.36 MB │ 89.36 MB │ 89.36 MB │ │ + │ │ │ 33447 │ 33486 │ 33447 │ 33447 │ │ + │ │ │ 89.36 MB │ 89.37 MB │ 89.36 MB │ 89.36 MB │ │ │ │ │ shrink: │ │ │ │ │ - │ │ │ 0 │ 0 │ 0 │ 0.05 │ │ - │ │ │ 0 B │ 0 B │ 0 B │ 23.4 B │ │ - │ │ ╰─ room_with_a_view 173.7 ms │ 176.9 ms │ 174.1 ms │ 174.2 ms │ 100 │ 100 - │ │ 1.737 MB/s │ 1.706 MB/s │ 1.733 MB/s │ 1.732 MB/s │ │ + │ │ │ 0 │ 5 │ 0 │ 0.05 │ │ + │ │ │ 0 B │ 2.34 KB │ 0 B │ 23.4 B │ │ + │ │ ╰─ room_with_a_view 159.8 ms │ 176.9 ms │ 170.7 ms │ 167.2 ms │ 100 │ 100 + │ │ 1.888 MB/s │ 1.705 MB/s │ 1.767 MB/s │ 1.805 MB/s │ 
│ │ │ alloc: │ │ │ │ │ │ │ 18429 │ 18429 │ 18429 │ 18429 │ │ │ │ 26.32 MB │ 26.32 MB │ 26.32 MB │ 26.32 MB │ │ @@ -217,8 +217,8 @@ chunk_size fastest │ slowest │ median │ │ 48813 │ 48813 │ 48813 │ 48813 │ │ │ │ 66.19 MB │ 66.19 MB │ 66.19 MB │ 66.19 MB │ │ │ ├─ 512 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 24.97 ms │ 27.09 ms │ 25.24 ms │ 25.26 ms │ 100 │ 100 - │ │ │ 6.55 MB/s │ 6.037 MB/s │ 6.481 MB/s │ 6.476 MB/s │ │ + │ │ ├─ romeo_and_juliet 24.92 ms │ 25.57 ms │ 25.03 ms │ 25.04 ms │ 100 │ 100 + │ │ │ 6.564 MB/s │ 6.397 MB/s │ 6.536 MB/s │ 6.532 MB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 1199 │ 1199 │ 1199 │ 1199 │ │ │ │ │ 3.479 MB │ 3.479 MB │ 3.479 MB │ 3.479 MB │ │ @@ -228,8 +228,8 @@ chunk_size fastest │ slowest │ median │ │ │ grow: │ │ │ │ │ │ │ │ 3593 │ 3593 │ 3593 │ 3593 │ │ │ │ │ 9.941 MB │ 9.941 MB │ 9.941 MB │ 9.941 MB │ │ - │ │ ╰─ room_with_a_view 25.95 ms │ 26.93 ms │ 26.14 ms │ 26.16 ms │ 100 │ 100 - │ │ 11.63 MB/s │ 11.2 MB/s │ 11.54 MB/s │ 11.53 MB/s │ │ + │ │ ╰─ room_with_a_view 26.07 ms │ 28.08 ms │ 26.21 ms │ 26.24 ms │ 100 │ 100 + │ │ 11.57 MB/s │ 10.75 MB/s │ 11.51 MB/s │ 11.5 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 2348 │ 2348 │ 2348 │ 2348 │ │ │ │ 3.353 MB │ 3.353 MB │ 3.353 MB │ 3.353 MB │ │ @@ -240,8 +240,8 @@ chunk_size fastest │ slowest │ median │ │ 6217 │ 6217 │ 6217 │ 6217 │ │ │ │ 8.522 MB │ 8.522 MB │ 8.522 MB │ 8.522 MB │ │ │ ├─ 4096 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 3.745 ms │ 4.251 ms │ 3.89 ms │ 3.896 ms │ 100 │ 100 - │ │ │ 43.68 MB/s │ 38.48 MB/s │ 42.05 MB/s │ 41.98 MB/s │ │ + │ │ ├─ romeo_and_juliet 3.763 ms │ 3.954 ms │ 3.803 ms │ 3.809 ms │ 100 │ 100 + │ │ │ 43.48 MB/s │ 41.37 MB/s │ 43.02 MB/s │ 42.94 MB/s │ │ │ │ │ alloc: │ │ │ │ │ │ │ │ 140 │ 140 │ 140 │ 140 │ │ │ │ │ 406.1 KB │ 406.1 KB │ 406.1 KB │ 406.1 KB │ │ @@ -251,8 +251,8 @@ chunk_size fastest │ slowest │ median │ │ │ grow: │ │ │ │ │ │ │ │ 424 │ 424 │ 424 │ 424 │ │ │ │ │ 1.42 MB │ 1.42 MB │ 1.42 MB │ 1.42 MB │ │ - │ │ ╰─ room_with_a_view 5.155 ms │ 5.47 ms │ 5.237 ms │ 5.244 ms 
│ 100 │ 100 - │ │ 58.56 MB/s │ 55.19 MB/s │ 57.64 MB/s │ 57.57 MB/s │ │ + │ │ ╰─ room_with_a_view 5.099 ms │ 5.402 ms │ 5.162 ms │ 5.16 ms │ 100 │ 100 + │ │ 59.2 MB/s │ 55.88 MB/s │ 58.48 MB/s │ 58.5 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 303 │ 303 │ 303 │ 303 │ │ │ │ 430 KB │ 430 KB │ 430 KB │ 430 KB │ │ @@ -263,8 +263,8 @@ chunk_size fastest │ slowest │ median │ │ 810 │ 810 │ 810 │ 810 │ │ │ │ 1.154 MB │ 1.154 MB │ 1.154 MB │ 1.154 MB │ │ │ ╰─ 32768 │ │ │ │ │ - │ ├─ romeo_and_juliet 1.584 ms │ 1.774 ms │ 1.66 ms │ 1.659 ms │ 100 │ 100 - │ │ 103.2 MB/s │ 92.2 MB/s │ 98.51 MB/s │ 98.58 MB/s │ │ + │ ├─ romeo_and_juliet 1.6 ms │ 1.764 ms │ 1.624 ms │ 1.626 ms │ 100 │ 100 + │ │ 102.2 MB/s │ 92.73 MB/s │ 100.7 MB/s │ 100.5 MB/s │ │ │ │ alloc: │ │ │ │ │ │ │ 32 │ 32 │ 32 │ 32 │ │ │ │ 106.9 KB │ 106.9 KB │ 106.9 KB │ 106.9 KB │ │ @@ -274,8 +274,8 @@ chunk_size fastest │ slowest │ median │ │ grow: │ │ │ │ │ │ │ 105 │ 105 │ 105 │ 105 │ │ │ │ 597.5 KB │ 597.5 KB │ 597.5 KB │ 597.5 KB │ │ - │ ╰─ room_with_a_view 1.883 ms │ 2.02 ms │ 1.897 ms │ 1.902 ms │ 100 │ 100 - │ 160.2 MB/s │ 149.4 MB/s │ 159.1 MB/s │ 158.7 MB/s │ │ + │ ╰─ room_with_a_view 1.839 ms │ 2.015 ms │ 1.885 ms │ 1.878 ms │ 100 │ 100 + │ 164.1 MB/s │ 149.8 MB/s │ 160.1 MB/s │ 160.6 MB/s │ │ │ alloc: │ │ │ │ │ │ 41 │ 41 │ 41 │ 41 │ │ │ 55.95 KB │ 55.95 KB │ 55.95 KB │ 55.95 KB │ │ @@ -287,188 +287,188 @@ chunk_size fastest │ slowest │ median │ 206.2 KB │ 206.2 KB │ 206.2 KB │ 206.2 KB │ │ ├─ tiktoken │ │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 798 ms │ 870.9 ms │ 801.1 ms │ 806.8 ms │ 100 │ 100 - │ │ │ 205 KB/s │ 187.8 KB/s │ 204.2 KB/s │ 202.7 KB/s │ │ + │ │ ├─ romeo_and_juliet 798.7 ms │ 813 ms │ 806.1 ms │ 805.7 ms │ 100 │ 100 + │ │ │ 204.8 KB/s │ 201.2 KB/s │ 202.9 KB/s │ 203 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 8687900 │ 8687900 │ 8687900 │ 8687900 │ │ - │ │ │ 413.2 MB │ 413.2 MB │ 413.2 MB │ 413.2 MB │ │ + │ │ │ 8665256 │ 8665256 │ 8665256 │ 8665256 │ │ + │ │ │ 412.1 MB │ 412.1 MB │ 412.1 MB │ 412.1 
MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 8999408 │ 8999408 │ 8999408 │ 8999408 │ │ - │ │ │ 676.9 MB │ 676.9 MB │ 676.9 MB │ 676.9 MB │ │ + │ │ │ 8976764 │ 8976764 │ 8976764 │ 8976764 │ │ + │ │ │ 675.4 MB │ 675.4 MB │ 675.4 MB │ 675.4 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 1801554 │ 1801554 │ 1801554 │ 1801554 │ │ - │ │ │ 245.2 MB │ 245.2 MB │ 245.2 MB │ 245.2 MB │ │ - │ │ ╰─ room_with_a_view 1.057 s │ 1.096 s │ 1.062 s │ 1.063 s │ 100 │ 100 - │ │ 285.6 KB/s │ 275.2 KB/s │ 284 KB/s │ 284 KB/s │ │ + │ │ │ 1797014 │ 1797014 │ 1797014 │ 1797014 │ │ + │ │ │ 244.7 MB │ 244.7 MB │ 244.7 MB │ 244.7 MB │ │ + │ │ ╰─ room_with_a_view 1.056 s │ 1.109 s │ 1.066 s │ 1.068 s │ 100 │ 100 + │ │ 285.8 KB/s │ 272.1 KB/s │ 282.9 KB/s │ 282.4 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 11500302 │ 11500302 │ 11500302 │ 11500302 │ │ - │ │ 551.9 MB │ 551.9 MB │ 551.9 MB │ 551.9 MB │ │ + │ │ 11472402 │ 11472402 │ 11472402 │ 11472402 │ │ + │ │ 550.5 MB │ 550.5 MB │ 550.5 MB │ 550.5 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 11811810 │ 11811810 │ 11811810 │ 11811810 │ │ - │ │ 941.2 MB │ 941.2 MB │ 941.2 MB │ 941.2 MB │ │ + │ │ 11783910 │ 11783910 │ 11783910 │ 11783910 │ │ + │ │ 939 MB │ 939 MB │ 939 MB │ 939 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 2834268 │ 2834268 │ 2834268 │ 2834268 │ │ - │ │ 370.6 MB │ 370.6 MB │ 370.6 MB │ 370.6 MB │ │ + │ │ 2826842 │ 2826842 │ 2826842 │ 2826842 │ │ + │ │ 369.8 MB │ 369.8 MB │ 369.8 MB │ 369.8 MB │ │ │ ├─ 512 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 260.3 ms │ 263.4 ms │ 261.2 ms │ 261.3 ms │ 100 │ 100 - │ │ │ 628.4 KB/s │ 621.1 KB/s │ 626.2 KB/s │ 626 KB/s │ │ + │ │ ├─ romeo_and_juliet 262.9 ms │ 267.2 ms │ 264.8 ms │ 264.8 ms │ 100 │ 100 + │ │ │ 622.1 KB/s │ 612.1 KB/s │ 617.8 KB/s │ 617.7 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 2921739 │ 2921739 │ 2921739 │ 2921739 │ │ + │ │ │ 2919900 │ 2919900 │ 2919900 │ 2919900 │ │ │ │ │ 137.3 MB │ 137.3 MB │ 137.3 MB │ 137.3 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 3233247 │ 3233247 │ 3233247 │ 3233247 │ │ - │ │ │ 234.4 MB │ 234.4 MB │ 234.4 MB 
│ 234.4 MB │ │ + │ │ │ 3231408 │ 3231408 │ 3231408 │ 3231408 │ │ + │ │ │ 234.3 MB │ 234.3 MB │ 234.3 MB │ 234.3 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 606067 │ 606067 │ 606067 │ 606067 │ │ - │ │ │ 78.61 MB │ 78.61 MB │ 78.61 MB │ 78.61 MB │ │ - │ │ ╰─ room_with_a_view 438.9 ms │ 444.8 ms │ 440.6 ms │ 440.7 ms │ 100 │ 100 - │ │ 687.8 KB/s │ 678.7 KB/s │ 685.1 KB/s │ 684.9 KB/s │ │ + │ │ │ 605774 │ 605774 │ 605774 │ 605774 │ │ + │ │ │ 78.58 MB │ 78.58 MB │ 78.58 MB │ 78.58 MB │ │ + │ │ ╰─ room_with_a_view 441.8 ms │ 474.8 ms │ 444.3 ms │ 445 ms │ 100 │ 100 + │ │ 683.2 KB/s │ 635.7 KB/s │ 679.3 KB/s │ 678.3 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 4881127 │ 4881127 │ 4881127 │ 4881127 │ │ - │ │ 232.3 MB │ 232.3 MB │ 232.3 MB │ 232.3 MB │ │ + │ │ 4878812 │ 4878812 │ 4878812 │ 4878812 │ │ + │ │ 232.2 MB │ 232.2 MB │ 232.2 MB │ 232.2 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 5192635 │ 5192635 │ 5192635 │ 5192635 │ │ - │ │ 403.9 MB │ 403.9 MB │ 403.9 MB │ 403.9 MB │ │ + │ │ 5190320 │ 5190320 │ 5190320 │ 5190320 │ │ + │ │ 403.7 MB │ 403.7 MB │ 403.7 MB │ 403.7 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 1197367 │ 1197367 │ 1197367 │ 1197367 │ │ - │ │ 152.9 MB │ 152.9 MB │ 152.9 MB │ 152.9 MB │ │ + │ │ 1196748 │ 1196748 │ 1196748 │ 1196748 │ │ + │ │ 152.8 MB │ 152.8 MB │ 152.8 MB │ 152.8 MB │ │ │ ├─ 4096 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 177.8 ms │ 181 ms │ 178.7 ms │ 178.8 ms │ 100 │ 100 - │ │ │ 919.8 KB/s │ 903.6 KB/s │ 915.3 KB/s │ 914.7 KB/s │ │ + │ │ ├─ romeo_and_juliet 179.6 ms │ 183.5 ms │ 180.5 ms │ 180.7 ms │ 100 │ 100 + │ │ │ 910.6 KB/s │ 891.5 KB/s │ 906.1 KB/s │ 905.2 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 2018345 │ 2018345 │ 2018345 │ 2018345 │ │ - │ │ │ 94.51 MB │ 94.51 MB │ 94.51 MB │ 94.51 MB │ │ + │ │ │ 2017987 │ 2017987 │ 2017987 │ 2017987 │ │ + │ │ │ 94.49 MB │ 94.49 MB │ 94.49 MB │ 94.49 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 2329853 │ 2329853 │ 2329853 │ 2329853 │ │ + │ │ │ 2329495 │ 2329495 │ 2329495 │ 2329495 │ │ │ │ │ 166.3 MB │ 166.3 MB │ 166.3 MB │ 166.3 MB │ │ │ │ │ 
grow: │ │ │ │ │ - │ │ │ 418449 │ 418449 │ 418449 │ 418449 │ │ - │ │ │ 53.36 MB │ 53.36 MB │ 53.36 MB │ 53.36 MB │ │ - │ │ ╰─ room_with_a_view 320 ms │ 324.7 ms │ 321.5 ms │ 321.5 ms │ 100 │ 100 - │ │ 943.3 KB/s │ 929.6 KB/s │ 939 KB/s │ 938.8 KB/s │ │ + │ │ │ 418382 │ 418382 │ 418382 │ 418382 │ │ + │ │ │ 53.35 MB │ 53.35 MB │ 53.35 MB │ 53.35 MB │ │ + │ │ ╰─ room_with_a_view 322 ms │ 326.7 ms │ 323.5 ms │ 323.8 ms │ 100 │ 100 + │ │ 937.5 KB/s │ 924 KB/s │ 932.9 KB/s │ 932.2 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 3573120 │ 3573120 │ 3573120 │ 3573120 │ │ + │ │ 3572528 │ 3572528 │ 3572528 │ 3572528 │ │ │ │ 169.7 MB │ 169.7 MB │ 169.7 MB │ 169.7 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 3884628 │ 3884628 │ 3884628 │ 3884628 │ │ - │ │ 299.2 MB │ 299.2 MB │ 299.2 MB │ 299.2 MB │ │ + │ │ 3884036 │ 3884036 │ 3884036 │ 3884036 │ │ + │ │ 299.1 MB │ 299.1 MB │ 299.1 MB │ 299.1 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 874505 │ 874505 │ 874505 │ 874505 │ │ + │ │ 874353 │ 874353 │ 874353 │ 874353 │ │ │ │ 110.8 MB │ 110.8 MB │ 110.8 MB │ 110.8 MB │ │ │ ╰─ 32768 │ │ │ │ │ - │ ├─ romeo_and_juliet 81.97 ms │ 84.55 ms │ 82.41 ms │ 82.43 ms │ 100 │ 100 - │ │ 1.995 MB/s │ 1.935 MB/s │ 1.985 MB/s │ 1.984 MB/s │ │ + │ ├─ romeo_and_juliet 83.13 ms │ 85.75 ms │ 84.12 ms │ 84.1 ms │ 100 │ 100 + │ │ 1.968 MB/s │ 1.907 MB/s │ 1.944 MB/s │ 1.945 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 914680 │ 914680 │ 914680 │ 914680 │ │ + │ │ 914586 │ 914586 │ 914586 │ 914586 │ │ │ │ 42.85 MB │ 42.85 MB │ 42.85 MB │ 42.85 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 1226188 │ 1226188 │ 1226188 │ 1226188 │ │ - │ │ 85.71 MB │ 85.71 MB │ 85.71 MB │ 85.71 MB │ │ + │ │ 1226094 │ 1226094 │ 1226094 │ 1226094 │ │ + │ │ 85.7 MB │ 85.7 MB │ 85.7 MB │ 85.7 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 187705 │ 187705 │ 187705 │ 187705 │ │ + │ │ 187680 │ 187680 │ 187680 │ 187680 │ │ │ │ 24.37 MB │ 24.37 MB │ 24.37 MB │ 24.37 MB │ │ - │ ╰─ room_with_a_view 112.1 ms │ 115.5 ms │ 112.8 ms │ 112.9 ms │ 100 │ 100 - │ 2.692 MB/s │ 2.611 MB/s │ 2.675 MB/s │ 2.673 MB/s 
│ │ + │ ╰─ room_with_a_view 113.8 ms │ 117 ms │ 114.7 ms │ 114.8 ms │ 100 │ 100 + │ 2.652 MB/s │ 2.578 MB/s │ 2.631 MB/s │ 2.628 MB/s │ │ │ alloc: │ │ │ │ │ - │ 1232442 │ 1232442 │ 1232442 │ 1232442 │ │ + │ 1232390 │ 1232390 │ 1232390 │ 1232390 │ │ │ 58.6 MB │ 58.6 MB │ 58.6 MB │ 58.6 MB │ │ │ dealloc: │ │ │ │ │ - │ 1543950 │ 1543950 │ 1543950 │ 1543950 │ │ + │ 1543898 │ 1543898 │ 1543898 │ 1543898 │ │ │ 115.4 MB │ 115.4 MB │ 115.4 MB │ 115.4 MB │ │ │ grow: │ │ │ │ │ - │ 300737 │ 300737 │ 300737 │ 300737 │ │ + │ 300721 │ 300721 │ 300721 │ 300721 │ │ │ 38.19 MB │ 38.19 MB │ 38.19 MB │ 38.19 MB │ │ ╰─ tokenizers │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ ├─ romeo_and_juliet 1.407 s │ 1.457 s │ 1.411 s │ 1.413 s │ 100 │ 100 - │ │ 116.2 KB/s │ 112.2 KB/s │ 115.9 KB/s │ 115.7 KB/s │ │ + │ ├─ romeo_and_juliet 1.393 s │ 1.525 s │ 1.404 s │ 1.409 s │ 100 │ 100 + │ │ 117.4 KB/s │ 107.2 KB/s │ 116.5 KB/s │ 116 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 29188727 │ 29188727 │ 29188727 │ 29188727 │ │ - │ │ 3.601 GB │ 3.601 GB │ 3.601 GB │ 3.601 GB │ │ + │ │ 29070934 │ 29070934 │ 29070934 │ 29070934 │ │ + │ │ 3.591 GB │ 3.591 GB │ 3.591 GB │ 3.591 GB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 29246780 │ 29246780 │ 29246780 │ 29246780 │ │ - │ │ 5.214 GB │ 5.214 GB │ 5.214 GB │ 5.214 GB │ │ + │ │ 29128987 │ 29128987 │ 29128987 │ 29128987 │ │ + │ │ 5.201 GB │ 5.201 GB │ 5.201 GB │ 5.201 GB │ │ │ │ grow: │ │ │ │ │ - │ │ 463030 │ 463030 │ 463030 │ 463030 │ │ - │ │ 1.608 GB │ 1.608 GB │ 1.608 GB │ 1.608 GB │ │ - │ ╰─ room_with_a_view 1.944 s │ 2.176 s │ 1.952 s │ 1.965 s │ 100 │ 100 - │ 155.2 KB/s │ 138.6 KB/s │ 154.6 KB/s │ 153.5 KB/s │ │ + │ │ 442951 │ 442951 │ 442951 │ 442951 │ │ + │ │ 1.605 GB │ 1.605 GB │ 1.605 GB │ 1.605 GB │ │ + │ ╰─ room_with_a_view 1.94 s │ 2.063 s │ 1.952 s │ 1.953 s │ 100 │ 100 + │ 155.5 KB/s │ 146.3 KB/s │ 154.6 KB/s │ 154.5 KB/s │ │ │ alloc: │ │ │ │ │ - │ 39390415 │ 39390415 │ 39390415 │ 39390415 │ │ - │ 5.158 GB │ 5.158 GB │ 5.158 GB │ 5.158 GB │ │ + │ 39268765 │ 39268765 │ 39268765 
│ 39268765 │ │ + │ 5.144 GB │ 5.144 GB │ 5.144 GB │ 5.144 GB │ │ │ dealloc: │ │ │ │ │ - │ 39448468 │ 39448468 │ 39448468 │ 39448468 │ │ - │ 7.421 GB │ 7.421 GB │ 7.421 GB │ 7.421 GB │ │ + │ 39326818 │ 39326818 │ 39326818 │ 39326818 │ │ + │ 7.402 GB │ 7.402 GB │ 7.402 GB │ 7.402 GB │ │ │ grow: │ │ │ │ │ - │ 681203 │ 681203 │ 681203 │ 681203 │ │ - │ 2.257 GB │ 2.257 GB │ 2.257 GB │ 2.257 GB │ │ + │ 663372 │ 663372 │ 663372 │ 663372 │ │ + │ 2.252 GB │ 2.252 GB │ 2.252 GB │ 2.252 GB │ │ ├─ 512 │ │ │ │ │ - │ ├─ romeo_and_juliet 439.6 ms │ 478.3 ms │ 441.5 ms │ 443.1 ms │ 100 │ 100 - │ │ 372.1 KB/s │ 342 KB/s │ 370.5 KB/s │ 369.2 KB/s │ │ + │ ├─ romeo_and_juliet 437.1 ms │ 456.4 ms │ 440.9 ms │ 440.9 ms │ 100 │ 100 + │ │ 374.2 KB/s │ 358.4 KB/s │ 371 KB/s │ 371 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 9331640 │ 9331640 │ 9331640 │ 9331640 │ │ - │ │ 1.159 GB │ 1.159 GB │ 1.159 GB │ 1.159 GB │ │ + │ │ 9319799 │ 9319799 │ 9319799 │ 9319799 │ │ + │ │ 1.158 GB │ 1.158 GB │ 1.158 GB │ 1.158 GB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 9389693 │ 9389693 │ 9389693 │ 9389693 │ │ - │ │ 1.678 GB │ 1.678 GB │ 1.678 GB │ 1.678 GB │ │ + │ │ 9377852 │ 9377852 │ 9377852 │ 9377852 │ │ + │ │ 1.676 GB │ 1.676 GB │ 1.676 GB │ 1.676 GB │ │ │ │ grow: │ │ │ │ │ - │ │ 100057 │ 100057 │ 100057 │ 100057 │ │ - │ │ 513.7 MB │ 513.7 MB │ 513.7 MB │ 513.7 MB │ │ - │ ╰─ room_with_a_view 800.8 ms │ 812.7 ms │ 803.8 ms │ 804 ms │ 100 │ 100 - │ 377 KB/s │ 371.4 KB/s │ 375.5 KB/s │ 375.5 KB/s │ │ + │ │ 98200 │ 98200 │ 98200 │ 98200 │ │ + │ │ 513.5 MB │ 513.5 MB │ 513.5 MB │ 513.5 MB │ │ + │ ╰─ room_with_a_view 798.8 ms │ 808.1 ms │ 802.7 ms │ 802.8 ms │ 100 │ 100 + │ 377.9 KB/s │ 373.5 KB/s │ 376.1 KB/s │ 376 KB/s │ │ │ alloc: │ │ │ │ │ - │ 16335239 │ 16335239 │ 16335239 │ 16335239 │ │ - │ 2.154 GB │ 2.154 GB │ 2.154 GB │ 2.154 GB │ │ + │ 16325652 │ 16325652 │ 16325652 │ 16325652 │ │ + │ 2.153 GB │ 2.153 GB │ 2.153 GB │ 2.153 GB │ │ │ dealloc: │ │ │ │ │ - │ 16393292 │ 16393292 │ 16393292 │ 16393292 │ │ - │ 3.105 GB │ 
3.105 GB │ 3.105 GB │ 3.105 GB │ │ + │ 16383705 │ 16383705 │ 16383705 │ 16383705 │ │ + │ 3.104 GB │ 3.104 GB │ 3.104 GB │ 3.104 GB │ │ │ grow: │ │ │ │ │ - │ 165453 │ 165453 │ 165453 │ 165453 │ │ - │ 945.8 MB │ 945.8 MB │ 945.8 MB │ 945.8 MB │ │ + │ 163919 │ 163919 │ 163919 │ 163919 │ │ + │ 945.4 MB │ 945.4 MB │ 945.4 MB │ 945.4 MB │ │ ├─ 4096 │ │ │ │ │ - │ ├─ romeo_and_juliet 299.2 ms │ 305.3 ms │ 300.4 ms │ 300.6 ms │ 100 │ 100 - │ │ 546.7 KB/s │ 535.8 KB/s │ 544.5 KB/s │ 544.2 KB/s │ │ + │ ├─ romeo_and_juliet 298.5 ms │ 301.9 ms │ 299.6 ms │ 299.6 ms │ 100 │ 100 + │ │ 548.1 KB/s │ 541.9 KB/s │ 546 KB/s │ 546 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 6432739 │ 6432739 │ 6432739 │ 6432739 │ │ - │ │ 802.5 MB │ 802.5 MB │ 802.5 MB │ 802.5 MB │ │ + │ │ 6430819 │ 6430819 │ 6430819 │ 6430819 │ │ + │ │ 802.3 MB │ 802.3 MB │ 802.3 MB │ 802.3 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 6490792 │ 6490792 │ 6490792 │ 6490792 │ │ + │ │ 6488872 │ 6488872 │ 6488872 │ 6488872 │ │ │ │ 1.164 GB │ 1.164 GB │ 1.164 GB │ 1.164 GB │ │ │ │ grow: │ │ │ │ │ - │ │ 34248 │ 34248 │ 34248 │ 34248 │ │ + │ │ 33980 │ 33980 │ 33980 │ 33980 │ │ │ │ 356.8 MB │ 356.8 MB │ 356.8 MB │ 356.8 MB │ │ - │ ╰─ room_with_a_view 566.8 ms │ 656.5 ms │ 569.7 ms │ 571.6 ms │ 100 │ 100 - │ 532.6 KB/s │ 459.8 KB/s │ 529.8 KB/s │ 528.1 KB/s │ │ + │ ╰─ room_with_a_view 565.2 ms │ 593.8 ms │ 568.9 ms │ 570 ms │ 100 │ 100 + │ 534.1 KB/s │ 508.3 KB/s │ 530.6 KB/s │ 529.6 KB/s │ │ │ alloc: │ │ │ │ │ - │ 11604278 │ 11604278 │ 11604278 │ 11604278 │ │ - │ 1.536 GB │ 1.536 GB │ 1.536 GB │ 1.536 GB │ │ + │ 11601946 │ 11601946 │ 11601946 │ 11601946 │ │ + │ 1.535 GB │ 1.535 GB │ 1.535 GB │ 1.535 GB │ │ │ dealloc: │ │ │ │ │ - │ 11662331 │ 11662331 │ 11662331 │ 11662331 │ │ + │ 11659999 │ 11659999 │ 11659999 │ 11659999 │ │ │ 2.215 GB │ 2.215 GB │ 2.215 GB │ 2.215 GB │ │ │ grow: │ │ │ │ │ - │ 55752 │ 55752 │ 55752 │ 55752 │ │ + │ 55342 │ 55342 │ 55342 │ 55342 │ │ │ 674.3 MB │ 674.3 MB │ 674.3 MB │ 674.3 MB │ │ ╰─ 32768 │ │ │ │ │ - ├─ 
romeo_and_juliet 133 ms │ 135.5 ms │ 133.6 ms │ 133.7 ms │ 100 │ 100 - │ 1.23 MB/s │ 1.207 MB/s │ 1.224 MB/s │ 1.223 MB/s │ │ + ├─ romeo_and_juliet 132.5 ms │ 135.2 ms │ 133 ms │ 133.1 ms │ 100 │ 100 + │ 1.234 MB/s │ 1.209 MB/s │ 1.23 MB/s │ 1.228 MB/s │ │ │ alloc: │ │ │ │ │ - │ 2845251 │ 2845251 │ 2845251 │ 2845251 │ │ + │ 2844814 │ 2844814 │ 2844814 │ 2844814 │ │ │ 353.7 MB │ 353.7 MB │ 353.7 MB │ 353.7 MB │ │ │ dealloc: │ │ │ │ │ - │ 2903304 │ 2903304 │ 2903304 │ 2903304 │ │ - │ 517.5 MB │ 517.5 MB │ 517.5 MB │ 517.5 MB │ │ + │ 2902867 │ 2902867 │ 2902867 │ 2902867 │ │ + │ 517.4 MB │ 517.4 MB │ 517.4 MB │ 517.4 MB │ │ │ grow: │ │ │ │ │ - │ 9648 │ 9648 │ 9648 │ 9648 │ │ + │ 9579 │ 9579 │ 9579 │ 9579 │ │ │ 158.8 MB │ 158.8 MB │ 158.8 MB │ 158.8 MB │ │ - ╰─ room_with_a_view 219.7 ms │ 223.5 ms │ 220.2 ms │ 220.4 ms │ 100 │ 100 - 1.373 MB/s │ 1.35 MB/s │ 1.37 MB/s │ 1.369 MB/s │ │ + ╰─ room_with_a_view 220.5 ms │ 223.2 ms │ 221.3 ms │ 221.4 ms │ 100 │ 100 + 1.369 MB/s │ 1.352 MB/s │ 1.363 MB/s │ 1.363 MB/s │ │ alloc: │ │ │ │ │ - 4490083 │ 4490083 │ 4490083 │ 4490083 │ │ + 4489764 │ 4489764 │ 4489764 │ 4489764 │ │ 594.3 MB │ 594.3 MB │ 594.3 MB │ 594.3 MB │ │ dealloc: │ │ │ │ │ - 4548136 │ 4548136 │ 4548136 │ 4548136 │ │ - 860.9 MB │ 860.9 MB │ 860.9 MB │ 860.9 MB │ │ + 4547817 │ 4547817 │ 4547817 │ 4547817 │ │ + 860.8 MB │ 860.8 MB │ 860.8 MB │ 860.8 MB │ │ grow: │ │ │ │ │ - 14224 │ 14224 │ 14224 │ 14224 │ │ + 14170 │ 14170 │ 14170 │ 14170 │ │ 261.5 MB │ 261.5 MB │ 261.5 MB │ 261.5 MB │ │ diff --git a/src/lib.rs b/src/lib.rs index be2a517..da96d16 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -319,6 +319,9 @@ trait SemanticSplit { let diff = chunk.len() - chunk.trim_start().len(); (offset + diff, chunk.trim()) } + + /// Allows the impl to clear out unnecessary data after the cursor has moved. + fn update_ranges(&mut self, _cursor: usize); } /// Returns chunks of text with their byte offsets as an iterator. 
@@ -377,6 +380,7 @@ where fn next_chunk(&mut self) -> Option<(usize, &'text str)> { // Reset caches so we can reuse the memory allocation self.chunk_sizer.clear_cache(); + self.semantic_split.update_ranges(self.cursor); self.update_next_sections(); let start = self.cursor; @@ -485,12 +489,22 @@ where let remaining_text = self.text.get(self.cursor..).unwrap(); - for (level, str) in levels_in_remaining_text.filter_map(|level| { - self.semantic_split - .semantic_chunks(self.cursor, remaining_text, level) - .next() - .map(|(_, str)| (level, str)) - }) { + let levels_with_chunks = levels_in_remaining_text + .filter_map(|level| { + self.semantic_split + .semantic_chunks(self.cursor, remaining_text, level) + .next() + .map(|(_, str)| (level, str)) + }) + // We assume that larger levels are also longer. We can skip lower levels if going to a higher level would result in a shorter text + .coalesce(|(a_level, a_str), (b_level, b_str)| { + if a_str.len() >= b_str.len() { + Ok((b_level, b_str)) + } else { + Err(((a_level, a_str), (b_level, b_str))) + } + }); + for (level, str) in levels_with_chunks { let chunk_size = self .chunk_sizer .check_capacity(self.trim_chunk(self.cursor, str)); diff --git a/src/markdown.rs b/src/markdown.rs index f4fa376..27061e8 100644 --- a/src/markdown.rs +++ b/src/markdown.rs @@ -195,8 +195,6 @@ enum SemanticLevel { Sentence, /// Single line break, which isn't necessarily a new element in Markdown SoftBreak, - /// A text node within an element - Text, /// An inline element that is within a larger element such as a paragraph, but /// more specific than a sentence. 
InlineElement(SemanticSplitPosition), @@ -222,7 +220,6 @@ impl SemanticLevel { | SemanticLevel::Word | SemanticLevel::Sentence | SemanticLevel::SoftBreak - | SemanticLevel::Text | SemanticLevel::Block | SemanticLevel::MetaContainer | SemanticLevel::Rule @@ -240,7 +237,6 @@ impl SemanticLevel { | SemanticLevel::Word | SemanticLevel::Sentence | SemanticLevel::SoftBreak - | SemanticLevel::Text | SemanticLevel::InlineElement(_) | SemanticLevel::Rule | SemanticLevel::Heading(_) @@ -363,13 +359,13 @@ impl SemanticSplit for Markdown { | Tag::Image { .. } | Tag::TableCell, ) + | Event::Text(_) | Event::HardBreak | Event::Code(_) | Event::InlineHtml(_) => Some(( SemanticLevel::InlineElement(SemanticSplitPosition::Own), range, )), - Event::Text(_) => Some((SemanticLevel::Text, range)), Event::FootnoteReference(_) => Some(( SemanticLevel::InlineElement(SemanticSplitPosition::Prev), range, @@ -442,8 +438,7 @@ impl SemanticSplit for Markdown { SemanticLevel::Sentence => text .split_sentence_bound_indices() .map(move |(i, str)| (offset + i, str)), - SemanticLevel::Text - | SemanticLevel::SoftBreak + SemanticLevel::SoftBreak | SemanticLevel::InlineElement(_) | SemanticLevel::ContainerBlock(_) | SemanticLevel::Block @@ -473,6 +468,11 @@ impl SemanticSplit for Markdown { (offset + diff, chunk.trim()) } } + + /// Clear out ranges we have moved past so future iterations are faster + fn update_ranges(&mut self, cursor: usize) { + self.ranges.retain(|(_, range)| range.start >= cursor); + } } #[cfg(test)] @@ -582,7 +582,7 @@ mod tests { #[test] fn chunk_by_words() { - let text = "The quick (\"brown\") fox can't jump 32.3 feet, right?"; + let text = "The quick brown fox can jump 32.3 feet, right?"; let chunks = TextChunks::<_, _, Markdown>::new(10, &Characters, text, false) .map(|(_, w)| w) @@ -590,11 +590,10 @@ mod tests { assert_eq!( vec![ "The quick ", - "(\"brown\") ", - "fox can't ", - "jump 32.3 ", - "feet, ", - "right?" + "brown fox ", + "can jump ", + "32.3 feet,", + " right?" 
], chunks ); @@ -659,7 +658,10 @@ mod tests { assert_eq!( vec![ &(SemanticLevel::Block, 0..41), - &(SemanticLevel::Text, 0..41) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 0..41 + ) ], markdown.ranges_after_offset(0).collect::>() ); @@ -680,7 +682,10 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Next), 2..5 ), - &(SemanticLevel::Text, 6..21), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 6..21 + ), &( SemanticLevel::ContainerBlock(SemanticSplitPosition::Own), 22..42 @@ -689,7 +694,10 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Next), 24..27 ), - &(SemanticLevel::Text, 28..42), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 28..42 + ), ], markdown.ranges_after_offset(0).collect::>() ); @@ -702,7 +710,10 @@ mod tests { assert_eq!( vec![ &(SemanticLevel::Block, 0..12), - &(SemanticLevel::Text, 0..8), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 0..8 + ), &( SemanticLevel::InlineElement(SemanticSplitPosition::Prev), 8..12 @@ -739,7 +750,10 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 0..10 ), - &(SemanticLevel::Text, 1..9), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 1..9 + ), ], markdown.ranges_after_offset(0).collect::>() ); @@ -756,7 +770,10 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 0..12 ), - &(SemanticLevel::Text, 2..10), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 2..10 + ), ], markdown.ranges_after_offset(0).collect::>() ); @@ -773,7 +790,10 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 0..12 ), - &(SemanticLevel::Text, 2..10), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 2..10 + ), ], markdown.ranges_after_offset(0).collect::>() ); @@ -790,7 +810,10 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 0..11 ), - &(SemanticLevel::Text, 1..5), + &( + 
SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 1..5 + ), ], markdown.ranges_after_offset(0).collect::>() ); @@ -807,7 +830,10 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 0..12 ), - &(SemanticLevel::Text, 2..6), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 2..6 + ), ], markdown.ranges_after_offset(0).collect::>() ); @@ -824,7 +850,10 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 0..6 ), - &(SemanticLevel::Text, 6..15), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 6..15 + ), &( SemanticLevel::InlineElement(SemanticSplitPosition::Own), 15..22 @@ -864,12 +893,18 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 1..11 ), - &(SemanticLevel::Text, 2..10), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 2..10 + ), &( SemanticLevel::InlineElement(SemanticSplitPosition::Own), 12..22 ), - &(SemanticLevel::Text, 13..21), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 13..21 + ), &( SemanticLevel::ContainerBlock(SemanticSplitPosition::Own), 38..57 @@ -878,12 +913,18 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 39..47 ), - &(SemanticLevel::Text, 40..46), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 40..46 + ), &( SemanticLevel::InlineElement(SemanticSplitPosition::Own), 48..56 ), - &(SemanticLevel::Text, 49..55) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 49..55 + ) ], markdown.ranges_after_offset(0).collect::>() ); @@ -896,9 +937,15 @@ mod tests { assert_eq!( vec![ &(SemanticLevel::Block, 0..26), - &(SemanticLevel::Text, 0..9), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 0..9 + ), &(SemanticLevel::SoftBreak, 9..10), - &(SemanticLevel::Text, 10..26) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 10..26 + ) ], markdown.ranges_after_offset(0).collect::>() ); @@ -911,12 +958,18 @@ mod tests { 
assert_eq!( vec![ &(SemanticLevel::Block, 0..27), - &(SemanticLevel::Text, 0..9), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 0..9 + ), &( SemanticLevel::InlineElement(SemanticSplitPosition::Own), 9..11 ), - &(SemanticLevel::Text, 11..27) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 11..27 + ) ], markdown.ranges_after_offset(0).collect::>() ); @@ -930,7 +983,10 @@ mod tests { vec![ &(SemanticLevel::Block, 0..18), &(SemanticLevel::Block, 10..18), - &(SemanticLevel::Text, 10..18) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 10..18 + ) ], markdown.ranges_after_offset(0).collect::>() ); @@ -941,7 +997,13 @@ mod tests { let markdown = Markdown::new("```\ncode\n```"); assert_eq!( - vec![&(SemanticLevel::Block, 0..12), &(SemanticLevel::Text, 4..9)], + vec![ + &(SemanticLevel::Block, 0..12), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 4..9 + ) + ], markdown.ranges_after_offset(0).collect::>() ); } @@ -957,7 +1019,10 @@ mod tests { 0..7 ), &(SemanticLevel::Block, 2..7), - &(SemanticLevel::Text, 2..7) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 2..7 + ) ], markdown.ranges_after_offset(0).collect::>() ); @@ -970,10 +1035,16 @@ mod tests { assert_eq!( vec![ &(SemanticLevel::Block, 0..10), - &(SemanticLevel::Text, 0..9), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 0..9 + ), &(SemanticLevel::Rule, 11..15), &(SemanticLevel::Block, 16..27), - &(SemanticLevel::Text, 16..27) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 16..27 + ) ], markdown.ranges_after_offset(0).collect::>() ); @@ -997,7 +1068,10 @@ mod tests { assert_eq!( vec![ &(SemanticLevel::Heading(level), 0..9 + index), - &(SemanticLevel::Text, 2 + index..9 + index) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 2 + index..9 + index + ) ], markdown.ranges_after_offset(0).collect::>() ); diff --git a/src/text.rs b/src/text.rs index 14cc1e5..19bca47 
100644 --- a/src/text.rs +++ b/src/text.rs @@ -295,6 +295,11 @@ impl SemanticSplit for LineBreaks { .map(move |(i, str)| (offset + i, str)), } } + + /// Clear out ranges we have moved past so future iterations are faster + fn update_ranges(&mut self, cursor: usize) { + self.line_breaks.retain(|(_, range)| range.start >= cursor); + } } #[cfg(test)] diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md.snap index 3597de6..e595aa0 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md.snap @@ -3,8 +3,9 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/commonmark_spec.md --- -- "---\ntitle: CommonMark Spec\n" -- "author: John MacFarlane\nversion: '0.31.2'\n" +- "---\n" +- "title: CommonMark Spec\nauthor: John MacFarlane\n" +- "version: '0.31.2'\n" - "date: '2024-01-28'\n" - "license: '[CC-BY-SA 4.0](https" - "://creativecommons.org/licenses/by-sa/" @@ -17,8 +18,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "help from Aaron Swartz) and released in 2004 in the form of a\n" - "[syntax description](https://daringfireball.net/projects" - "/markdown/syntax)\nand a Perl script (" -- "`Markdown.pl`" -- ") for converting Markdown to\n" +- "`Markdown.pl`) for converting Markdown to\n" - "HTML. In the next decade, dozens of implementations were\n" - "developed in many languages. 
Some extended the original\n" - "Markdown syntax with conventions for footnotes, tables, and\n" @@ -30,9 +30,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "articles, slide shows, letters, and lecture notes.\n\n" - "What distinguishes Markdown from many other lightweight markup\n" - "syntaxes, which are often easier to write, is its readability.\n" -- "As Gruber writes:" -- "\n\n> The overriding design goal for Markdown's formatting syntax is" -- "\n> to make it as readable as possible. The idea is that a\n> " +- "As Gruber writes:\n\n" +- "> " +- "The overriding design goal for Markdown's formatting syntax is\n> " +- "to make it as readable as possible. The idea is that a\n> " - "Markdown-formatted document should be publishable as-is, as\n> " - "plain text, without looking like it's been marked up with tags\n> " - "or formatting instructions.\n> (" @@ -43,7 +44,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "asciidoc.org/) with\n" - "an equivalent sample of Markdown. Here is a sample of\n" - "AsciiDoc from the AsciiDoc manual:\n\n" -- "```\n1. List item one.\n+\n" +- "```\n" +- "1. List item one.\n+\n" - "List item one continued with a second paragraph followed by an\nIndented block.\n" - "+\n" - "................" @@ -57,7 +59,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This paragraph is part of the preceding list item.\n\nb. List item b.\n\n" - "This paragraph belongs to item two of the outer list.\n--\n```\n\n" - "And here is the equivalent in Markdown:\n" -- "```\n1. List item one.\n\n" +- "```\n" +- "1. List item one.\n\n" - " List item one continued with a second paragraph followed by an\n Indented block.\n\n" - " $ ls *.sh\n" - " $ mv *.sh ~/tmp\n\n" @@ -78,23 +81,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "](https://daringfireball.net/projects/markdown" - "/syntax)\n" - "does not specify the syntax unambiguously. Here are some examples of\n" -- "questions it does not answer:" -- "\n\n1. 
" +- "questions it does not answer:\n\n" +- "1. " - "How much indentation is needed for a sublist? " - "The spec says that\n " - "continuation paragraphs need to be indented four spaces, but is\n " - "not fully explicit about sublists. It is natural to think that\n " -- "they, too, must be indented four spaces, but `" -- "Markdown.pl` does\n not require that. This is hardly a \"" -- "corner case,\" and divergences\n " +- "they, too, must be indented four spaces, but " +- "`Markdown.pl` does\n not require that. This is hardly a " +- "\"corner case,\" and divergences\n " - "between implementations on this issue often lead to surprises for\n " - "users in real documents. (See " - "[this comment by John\n Gruber" - "](https://web.archive.org/web/" - "20170611172104/http://" - article.gmane.org/ -- gmane.text.markdown.general/1997).) -- "\n\n2. Is a blank line needed before a block quote or heading?\n " +- "gmane.text.markdown.general/1997).)\n\n" +- "2. " +- "Is a blank line needed before a block quote or heading?\n " - "Most implementations do not require the blank line. However,\n " - "this can lead to unexpected results in hard-wrapped text, and\n " - "also to ambiguities in parsing (note that some implementations\n " @@ -104,17 +108,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "](https://web.archive.org/web/" - "20170611172104/http://" - article.gmane.org/ -- gmane.text.markdown.general/2146).) -- "\n\n3. " +- "gmane.text.markdown.general/2146).)\n\n" +- "3. " - "Is a blank line needed before an indented code block?\n (" -- "`Markdown.pl` requires it, but this is not mentioned in " -- "the\n documentation, and some implementations do not require it.)\n\n " +- "`Markdown.pl`" +- " requires it, but this is not mentioned in the\n " +- "documentation, and some implementations do not require it.)\n\n " - "``` markdown\n paragraph\n code?\n ```\n\n" - "4. " - "What is the exact rule for determining when list items get\n wrapped in " -- "`

    `" -- " tags? Can a list be partially \"loose\" and partially\n \"tight\"" -- "? What should we do with a list like this?\n\n " +- "`

    ` tags? Can a list be partially \"loose\"" +- " and partially\n \"tight\"? What should we do with a list like this?\n\n " - "``` markdown\n 1. one\n\n 2. two\n 3. three\n" - " ```\n\n Or this?\n" - "\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n" @@ -124,52 +128,57 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "20170611172104/http://" - article.gmane.org/ - "gmane.text.markdown.general/2554).)\n\n" -- "5. Can list markers be indented? " +- "5. " +- "Can list markers be indented? " - "Can ordered list markers be right-aligned?\n\n " -- "``` markdown\n 8. item 1\n 9. item 2\n" -- " 10. item 2a\n ```\n\n" -- "6. Is this one list with a thematic break in its second item,\n " +- "``` markdown\n 8. item 1\n 9. item 2\n " +- "10. item 2a\n ```\n\n" +- "6. " +- "Is this one list with a thematic break in its second item,\n " - "or two lists separated by a thematic break?\n\n " - "``` markdown\n * a\n * * * * *\n * b\n" - " ```\n\n" -- "7. When list markers change from numbers to bullets, do we have\n " +- "7. " +- "When list markers change from numbers to bullets, do we have\n " - "two lists or one? (The Markdown syntax description suggests two,\n " - "but the perl scripts and many other implementations produce one.)\n\n " -- "``` markdown\n 1. fee\n 2. fie\n - foe\n" -- " - fum\n ```\n\n" -- "8. What are the precedence rules for the markers of inline structure?\n " +- "``` markdown\n 1. fee\n 2. fie\n - foe\n " +- "- fum\n ```\n\n" +- "8. " +- "What are the precedence rules for the markers of inline structure?\n " - "For example, is the following a valid link, or does the code span\n " - "take precedence ?\n\n " -- "``` markdown\n" -- " [a backtick (`)](/url) and [another " +- "``` markdown\n " +- "[a backtick (`)](/url) and [another " - "backtick (`)](/url).\n ```\n\n" -- "9. What are the precedence rules for markers of emphasis and strong\n " +- "9. " +- "What are the precedence rules for markers of emphasis and strong\n " - "emphasis? 
For example, how should the following be parsed?\n\n " - "``` markdown\n *foo *bar* baz*\n" - " ```\n\n" -- "10. What are the precedence rules between block-level and inline-level\n " +- "10. " +- "What are the precedence rules between block-level and inline-level\n " - "structure? For example, how should the following be parsed?\n\n " -- "``` markdown\n" -- " - `a long code span can contain a hyphen like this\n " +- "``` markdown\n " +- "- `a long code span can contain a hyphen like this\n " - " - and it can screw things up`\n ```\n\n" - "11. " - "Can list items include section headings? (`Markdown.pl`" - " does not\n " - "allow this, but does allow blockquotes to include headings.)\n\n " - "``` markdown\n - # Heading\n ```\n\n" -- "12. Can list items be empty?\n\n ``` markdown\n * a\n *\n" -- " * b\n ```\n\n" -- "13. Can link references be defined inside block quotes or list items?\n\n" -- " ``` markdown\n > Blockquote [foo].\n >\n" -- " > [foo]: /url\n ```\n\n" -- "14. If there are multiple definitions for the same reference, which takes\n precedence?" -- "\n\n ``` markdown\n [foo]: /url1\n" -- " [foo]: /url2\n\n [foo][]\n" +- "12. Can list items be empty?\n" +- "\n ``` markdown\n * a\n *\n * b\n ```\n\n" +- "13. Can link references be defined inside block quotes or list items?\n" +- "\n ``` markdown\n > Blockquote [foo].\n >\n " +- "> [foo]: /url\n ```\n\n" +- "14. If there are multiple definitions for the same reference, which takes\n precedence?\n" +- "\n ``` markdown\n [foo]: /url1\n " +- "[foo]: /url2\n\n [foo][]\n" - " ```\n\n" -- "In the absence of a spec, early implementers consulted `" -- "Markdown.pl`\nto resolve these ambiguities. But " -- "`Markdown.pl`" -- " was quite buggy, and\n" +- "In the absence of a spec, early implementers consulted " +- "`Markdown.pl`\nto resolve these ambiguities. 
But " +- "`Markdown.pl` was quite buggy, and\n" - "gave manifestly bad results in many cases, so it was not a\n" - "satisfactory replacement for a spec.\n\n" - "Because there is no unambiguous spec, implementations have diverged\n" @@ -178,14 +187,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "GitHub wiki)\n" - "renders differently on another (say, converting to docbook using\n" - "pandoc). To make matters worse, because nothing in Markdown counts\n" -- "as a \"syntax error,\" the divergence often isn't discovered right " -- "away.\n\n" +- "as a \"syntax error,\" the divergence often isn'" +- "t discovered right away.\n\n" - "## About this document\n\n" - "This document attempts to specify Markdown syntax unambiguously.\n" - "It contains many examples with side-by-side Markdown and\n" - "HTML. These are intended to double as conformance tests. An\n" -- "accompanying script `spec_tests.py` can be used to run " -- "the tests\nagainst any Markdown program:\n\n " +- "accompanying script `spec_tests.py`" +- " can be used to run the tests\nagainst any Markdown program:\n\n " - "python test/spec_tests.py --spec " - "spec.txt --program PROGRAM\n\n" - "Since this document describes how Markdown is to be parsed into\n" @@ -203,14 +212,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the expectations of the spec examples (percent-encoding\n" - "non-ASCII characters in URLs). " - "But a conforming implementation\ncan use a different renderer and may choose not to\n" -- percent-encode non-ASCII characters in URLs. 
-- "\n\nThis document is generated from a text file, " -- "`spec.txt`" -- ", written\n" +- "percent-encode non-ASCII characters in URLs.\n\n" +- "This document is generated from a text file, " +- "`spec.txt`, written\n" - "in Markdown with a small extension for the side-by-side tests.\n" - "The script `tools/makespec.py` can be used to convert " -- "`spec.txt`" -- " into\n" +- "`spec.txt` into\n" - HTML or CommonMark (which can then be converted into other formats - ").\n\n" - "In the examples, the `→` character is used to represent tabs.\n\n" @@ -227,24 +234,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "to a certain encoding.\n\n" - "A [line](@) is a sequence of zero or more [characters" - "]\nother than line feed (`U+000A`" -- ") or carriage return (`U+000D`" -- "),\nfollowed by a [line ending] or by the end of file." -- "\n\nA [line ending](@) is a line feed (" +- ") or carriage return (`U+000D`),\nfollowed by a " +- "[line ending] or by the end of file.\n\n" +- "A [line ending](@) is a line feed (" - "`U+000A`), a carriage return\n(" -- "`U+000D`) not followed by a line feed, or a " -- "carriage return and a\nfollowing line feed." 
-- "\n\nA line containing no characters, or a line containing only spaces\n(" -- "`U+0020`) or tabs (`U+0009" -- "`), is called a [blank line](@).\n\n" +- "`U+000D`" +- ") not followed by a line feed, or a carriage return and a\n" +- "following line feed.\n\n" +- "A line containing no characters, or a line containing only spaces\n(" +- "`U+0020`) or tabs (" +- "`U+0009`), is called a " +- "[blank line](@).\n\n" - "The following definitions of character classes will be used in this spec:\n" - "\nA [Unicode whitespace character](@)" -- " is a character in the Unicode `Zs`" -- " general\ncategory, or a tab (`U+0009`" +- " is a character in the Unicode `Zs` general\n" +- "category, or a tab (`U+0009`" - "), line feed (`U+000A`), form feed (" - "`U+000C`), or\ncarriage return (" - "`U+000D`).\n\n" -- "[Unicode whitespace](@) is a sequence of one or " -- "more\n[Unicode whitespace characters].\n\n" +- "[Unicode whitespace](@)" +- " is a sequence of one or more\n[Unicode whitespace characters].\n\n" - "A [tab](@) is `U+0009`.\n" - "\nA [space](@) is `U+0020`.\n" - "\nAn [ASCII control character](@) is a character between " @@ -261,11 +270,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " (U+003A–0040),\n`[`, " - "`\\`, `]`, `^`, `_`, " - "`` ` `` (U+005B–0060), \n" -- "`{`, `|`, `}`, or `~` " -- "(U+007B–007E).\n\n" -- "A [Unicode punctuation character](@) is a " -- "character in the Unicode `P`\n(puncuation) or " -- "`S` (symbol) general categories.\n\n" +- "`{`, `|`, `}`, or `~`" +- " (U+007B–007E).\n\n" +- "A [Unicode punctuation character](@)" +- " is a character in the Unicode `P`\n" +- "(puncuation) or `S` (symbol) general categories.\n\n" - "## Tabs\n\n" - "Tabs in lines are not expanded to [spaces]. 
However,\n" - "in contexts where spaces help to define block structure,\n" @@ -318,15 +327,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n" -- "Normally the `>`" -- " that begins a block quote may be followed\n" +- "Normally the `>` that begins a block quote may be followed\n" - "optionally by a space, which is not considered part of the\n" -- "content. In the following case `>`" -- " is followed by a tab,\n" +- "content. In the following case `>` is followed by a tab,\n" - "which is treated as if it were expanded into three spaces.\n" - "Since one of these spaces is considered part of the\ndelimiter, " -- "`foo`" -- " is considered to be indented six spaces\n" +- "`foo` is considered to be indented six spaces\n" - "inside the block quote context, so we get an indented\n" - "code block starting with two spaces.\n\n" - "````````````````" @@ -374,8 +380,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## Insecure characters\n\n" -- "For security reasons, the Unicode character `U+0000` must " -- "be replaced\nwith the REPLACEMENT CHARACTER (" +- "For security reasons, the Unicode character `U+0000`" +- " must be replaced\n" +- with the REPLACEMENT CHARACTER ( - "`U+FFFD`).\n\n\n" - "## Backslash escapes\n\n" - "Any ASCII punctuation character may be backslash-escaped:\n" @@ -518,9 +525,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Valid HTML entity references and numeric character references\n" - "can be used in place of the corresponding Unicode character,\n" - "with the following exceptions:\n\n" -- "- Entity and character references are not recognized in code\n " -- blocks and code spans. 
-- "\n\n- Entity and character references cannot stand in place of\n " +- "- " +- "Entity and character references are not recognized in code\n blocks and code spans.\n\n" +- "- " +- "Entity and character references cannot stand in place of\n " - "special characters that define structural elements in\n CommonMark. For example, although " - "`*` can be used\n in place of a literal " - "`*` character, `*` cannot replace\n `*`" @@ -528,8 +536,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Conforming CommonMark parsers need not store information about\n" - "whether a particular character was represented in the source\n" - "using a Unicode character or an entity reference.\n\n" -- "[Entity references](@) consist of `&` + any " -- "of the valid\nHTML5 entity names + `;`. The\ndocument " +- "[Entity references](@) consist of `&`" +- " + any of the valid\nHTML5 entity names + `;`" +- ". The\ndocument " - "\nis used as an authoritative source for the valid entity\n" - "references and their corresponding code points.\n\n" @@ -543,25 +552,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

      & © Æ Ď\n¾ ℋ ⅆ\n" - "∲ ≧̸

    \n" - "````````````````" -- "````````````````\n\n" -- "\n[Decimal numeric character\nreferences](@)\nconsist of " -- "`&#` + a string of 1--7 arabic digits " -- "+ `;`" -- ". A\nnumeric character reference is parsed as the corresponding\n" +- "````````````````\n\n\n" +- "[Decimal numeric character\nreferences](@)\nconsist of " +- "`&#` + a string of 1--" +- "7 arabic digits + `;`. A\n" +- "numeric character reference is parsed as the corresponding\n" - "Unicode character. " - "Invalid Unicode code points will be replaced by\n" -- "the REPLACEMENT CHARACTER (`U+" -- "FFFD`). For security reasons,\nthe code point " -- "`U+0000` will also be replaced by `U+" -- "FFFD`.\n\n" +- the REPLACEMENT CHARACTER ( +- "`U+FFFD`). For security reasons,\nthe code point " +- "`U+0000` will also be replaced by " +- "`U+FFFD`.\n\n" - "````````````````" - "```````````````` " - "example\n" - "# Ӓ Ϡ &#" - "0;\n.\n

    # Ӓ Ϡ �

    \n" - "````````````````" -- "````````````````\n\n" -- "\n[Hexadecimal numeric character\nreferences](@) consist of " +- "````````````````\n\n\n" +- "[Hexadecimal numeric character\nreferences](@) consist of " - "`&#` +\neither `X` or `x`" - " + a string of 1-6 hexadecimal digits + `;`" - ".\nThey too are parsed as the corresponding Unicode character (this\n" @@ -592,8 +601,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "Although HTML5 does accept some entity references\n" -- "without a trailing semicolon (such as `©`), these " -- "are not\nrecognized here, because it makes the grammar too ambiguous:\n\n" +- "without a trailing semicolon (such as `©`" +- "), these are not\nrecognized here, because it makes the grammar too ambiguous:\n\n" - "````````````````" - "```````````````` " - "example\n©\n.\n

    &copy

    \n" @@ -716,8 +725,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "quotations, lists, headings, rules, and code blocks. " - "Some blocks (like\n" - "block quotes and list items) contain other blocks; others (like\n" -- "headings and paragraphs) contain [inline](@) content-" -- "--text,\n" +- "headings and paragraphs) contain [inline](@) content" +- "---text,\n" - "links, emphasized text, images, code spans, and so on.\n\n" - "## Precedence\n\n" - "Indicators of block structure always take precedence over indicators\n" @@ -741,8 +750,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "one block element does not affect the inline parsing of any other.\n\n" - "## Container blocks and leaf blocks\n\n" - "We can divide blocks into two types:\n" -- "[container blocks](#container-blocks)" -- ",\nwhich can contain other blocks, and " +- "[container blocks](#container-blocks),\n" +- "which can contain other blocks, and " - "[leaf blocks](#leaf-blocks),\nwhich cannot.\n\n" - "# Leaf blocks\n\n" - "This section describes the different kinds of leaf block that make up a\n" @@ -750,8 +759,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Thematic breaks\n\n" - "A line consisting of optionally up to three spaces of indentation, followed " - "by a\nsequence of three or more matching `-`, `_`" -- ", or `*`" -- " characters, each followed\n" +- ", or `*` characters, each followed\n" - "optionally by any number of spaces or tabs, forms a\n" - "[thematic break](@).\n\n" - "````````````````" @@ -920,10 +928,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## ATX headings\n\n" -- "An [ATX heading](@)" -- "\nconsists of a string of characters, parsed as inline content, between an\n" -- "opening sequence of 1--6 unescaped `#` characters and an " -- "optional\nclosing sequence of any number of unescaped `#`" +- "An [ATX heading](@)\n" +- "consists of a string of characters, parsed as 
inline content, between an\n" +- "opening sequence of 1--6 unescaped `#`" +- " characters and an optional\nclosing sequence of any number of unescaped `#`" - " characters.\nThe opening sequence of `#`" - " characters must be followed by spaces or tabs, or\n" - "by the end of line. The optional closing sequence of `#`" @@ -959,8 +967,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    ####### foo

    \n" - "````````````````" - "````````````````\n\n\n" -- "At least one space or tab is required between the `#` characters and " -- "the\nheading's contents, unless the heading is empty. Note that many\n" +- "At least one space or tab is required between the `#`" +- " characters and the\nheading'" +- "s contents, unless the heading is empty. Note that many\n" - "implementations currently do not require the space. However, the\n" - "space was required by the\n" - "[original ATX implementation](http://" @@ -1058,8 +1067,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n" - "````````````````" - "````````````````\n\n\n" -- "A sequence of `#`" -- " characters with anything but spaces or tabs following it\n" +- "A sequence of `#` characters with anything but spaces or tabs following it\n" - "is not a closing sequence, but counts as part of the contents of the\n" - "heading:\n\n" - "````````````````" @@ -1078,8 +1086,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo#

    \n" - "````````````````" - "````````````````\n\n\n" -- "Backslash-escaped `#`" -- " characters do not count as part\nof the closing sequence:\n\n" +- "Backslash-escaped `#` characters do not count as part\n" +- "of the closing sequence:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -1121,8 +1129,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## Setext headings\n\n" -- "A [setext heading](@)" -- " consists of one or more\n" +- "A [setext heading](@) consists of one or more\n" - "lines of text, not interrupted by a blank line, of which the first line " - "does not\nhave more than 3 spaces of indentation, followed by\na [" - "setext heading underline]. The lines of text must be such\n" @@ -1133,13 +1140,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "thematic breaks],\n[list item][list items], or [" - "HTML block][HTML blocks].\n\n" - "A [setext heading underline](@) is a sequence of\n" -- "`=` characters or a sequence of `-` characters, with no more " -- "than 3\n" -- spaces of indentation and any number of trailing spaces or tabs. -- "\n\nThe heading is a level 1 heading if `=`" -- " characters are used in\nthe [setext heading underline]" -- ", and a level 2 heading if `-`" -- "\ncharacters are used. The contents of the heading are the result\n" +- "`=` characters or a sequence of `-`" +- " characters, with no more than 3\n" +- "spaces of indentation and any number of trailing spaces or tabs.\n\n" +- "The heading is a level 1 heading if `=` characters are used in\nthe " +- "[setext heading underline], and a level 2 heading if " +- "`-`\ncharacters are used. The contents of the heading are the result\n" - "of parsing the preceding lines of text as CommonMark inline\ncontent.\n\n" - "In general, a setext heading need not be preceded or followed by a\n" - "blank line. 
However, it cannot interrupt a paragraph, so when a\n" @@ -1360,8 +1366,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n" - "````````````````" - "````````````````\n\n\n" -- "If you want a heading with `> foo` as its literal text, " -- "you can\nuse backslash escapes:\n\n" +- "If you want a heading with `> foo`" +- " as its literal text, you can\nuse backslash escapes:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -1369,8 +1375,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    > foo

    \n" - "````````````````" - "````````````````\n\n\n" -- "**Compatibility note:**" -- " Most existing Markdown implementations\n" +- "**Compatibility note:** Most existing Markdown implementations\n" - "do not allow the text of setext headings to span multiple lines.\n" - "But there is no consensus about how to interpret\n\n" - "``` markdown\nFoo\nbar\n---\nbaz\n```" @@ -1423,10 +1428,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## Indented code blocks\n\n" -- "An [indented code block](@) is composed of one or " -- "more\n[indented chunks] separated by blank lines.\nAn " -- "[indented chunk](@) is a sequence of non-blank " -- "lines,\n" +- "An [indented code block](@)" +- " is composed of one or more\n[indented chunks]" +- " separated by blank lines.\nAn [indented chunk](@)" +- " is a sequence of non-blank lines,\n" - "each preceded by four or more spaces of indentation. " - "The contents of the code\nblock are the literal contents of the lines, including trailing\n" - "[line endings], minus four spaces of indentation.\n" @@ -1549,14 +1554,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "## Fenced code blocks\n\n" -- "A [code fence](@)" -- " is a sequence\nof at least three consecutive backtick characters (" -- "`` ` ``) or\ntildes (`~`" +- "A [code fence](@) is a sequence\n" +- "of at least three consecutive backtick characters (`` ` ``) or\n" +- "tildes (`~`" - "). 
(Tildes and backticks cannot be mixed.)\nA " -- "[fenced code block](@)" -- "\n" +- "[fenced code block](@)\n" - "begins with a code fence, preceded by up to three spaces of indentation" -- ".\n\nThe line with the opening code fence may optionally contain some text\n" +- ".\n\n" +- "The line with the opening code fence may optionally contain some text\n" - "following the code fence; this is trimmed of leading and trailing\n" - "spaces or tabs and called the [info string](@)" - ". If the [info string] comes\n" @@ -1588,8 +1593,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "much less efficient, and there seems to be no real downside to the\n" - "behavior described here.)\n\n" - "A fenced code block may interrupt a paragraph, and does not require\n" -- a blank line either before or after. -- "\n\nThe content of a code fence is treated as literal text, not parsed\n" +- "a blank line either before or after.\n\n" +- "The content of a code fence is treated as literal text, not parsed\n" - "as inlines. The first word of the [info string]" - " is typically used to\nspecify the language of the code sample, and rendered in the " - "`class`\nattribute of the `code`" @@ -1848,8 +1853,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo

    \n" - "````````````````" - "````````````````\n\n\n" -- "[Info strings] for tilde code blocks can contain backticks and " -- "tildes:\n\n" +- "[Info strings]" +- " for tilde code blocks can contain backticks and tildes:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -1870,40 +1875,44 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "## HTML blocks\n\n" -- "An [HTML block](@) is a group of lines that " -- "is treated\n" +- "An [HTML block](@)" +- " is a group of lines that is treated\n" - as raw HTML (and will not be escaped in HTML output -- ").\n\nThere are seven kinds of [HTML block]" +- ").\n\n" +- "There are seven kinds of [HTML block]" - ", which can be defined by their\n" - "start and end conditions. The block begins with a line that meets a\n" -- "[start condition](@) (after up to three optional spaces of " -- "indentation).\nIt ends with the first subsequent line that meets a matching\n" -- "[end condition](@), or the last line of the document, " -- "or the last line of\nthe [container block](#container-blocks)" -- " containing the current HTML\nblock, if no line is encountered that meets the " -- "[end condition]. If\nthe first line meets both the [start condition]" -- " and the [end\ncondition], the block will contain just that line." -- "\n\n1. " +- "[start condition](@)" +- " (after up to three optional spaces of indentation).\n" +- "It ends with the first subsequent line that meets a matching\n" +- "[end condition](@)" +- ", or the last line of the document, or the last line of\nthe " +- "[container block](#container-blocks) containing the current HTML\n" +- "block, if no line is encountered that meets the [end condition]. If\n" +- "the first line meets both the [start condition] and the [end\ncondition]" +- ", the block will contain just that line.\n\n" +- "1. 
" - "**Start condition:** line begins with the string ``" +- "``" - ", or the end of the line.\\\n**End condition:**" - " line contains an end tag\n`
    `, " -- "``, ``, or `" -- "` (case-insensitive; it\n" +- "``, ``, or " +- "`` (case-insensitive; it\n" - "need not match the start tag).\n\n" - "2. " - "**Start condition:** line begins with the string ``.\n\n" - "3. " -- "**Start condition:** line begins with the string ``.\n\n" - "4. " -- "**Start condition:** line begins with the string ``.\n\n" - "5. " - "**Start condition:** line begins with the string\n" @@ -1912,42 +1921,41 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ".\n\n" - "6. " - "**Start condition:** line begins with the string `<` or " -- "``, or\nthe string `/>`.\\\n" - "**End condition:** line is followed by a [blank line]" - ".\n\n" - "7. " -- "**Start condition:**" -- " line begins with a complete [open tag]\n(with any [tag name]" -- " other than `pre`, `script`,\n`style`, or " -- "`textarea`" -- ") or a complete [closing tag],\n" +- "**Start condition:** line begins with a complete [open tag]\n" +- "(with any [tag name] other than `pre`, `script`" +- ",\n`style`, or `textarea`) or a complete [" +- "closing tag],\n" - "followed by zero or more spaces and tabs, followed by the end of the " -- "line.\\\n**End condition:**" -- " line is followed by a [blank line]." -- "\n\nHTML blocks continue until they are closed by their appropriate\n[end condition]" +- "line.\\\n**End condition:** line is followed by a [" +- "blank line].\n\n" +- "HTML blocks continue until they are closed by their appropriate\n[end condition]" - ", or the last line of the document or other " - "[container\nblock](#container-blocks)" - ". This means any HTML " @@ -1956,8 +1964,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "be ignored by the parser and passed through as-is, without changing\n" - "the parser's state.\n\n" - "For instance, `
    ` within an HTML block started by "
    -- "``"
    -- " will not affect\n"
    +- "`
    ` will not affect\n" - "the parser state; as the HTML block was started in by start " - "condition 6, it\nwill end at any blank line. This can be surprising:\n\n" - "````````````````" @@ -1974,8 +1981,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n" - "In this case, the HTML block is terminated by the blank line — " -- "the `**Hello**`" -- "\n" +- "the `**Hello**`\n" - "text remains verbatim — and regular parsing resumes, with a paragraph" - ",\nemphasised `world` and inline and block HTML following.\n\n" - "All types of [HTML blocks] except type 7 may interrupt\n" @@ -2109,8 +2115,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "int x = 33;\n```\n" - "````````````````" - "````````````````\n\n\n" -- "To start an [HTML block] with a tag that is *not" -- "* in the\n" +- "To start an [HTML block] with a tag that is " +- "*not* in the\n" - "list of block-level tags in (6), you must put the tag " - "by\nitself on the first line (and it must be complete):\n\n" - "````````````````" @@ -2148,8 +2154,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`` tag is a nice example. We can surround content with\n" - "``" - " tags in three different ways. In this case, we get a raw\n" -- "HTML block, because the `` tag is on a " -- "line by itself:\n\n" +- "HTML block, because the ``" +- " tag is on a line by itself:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -2158,8 +2164,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "In this case, we get a raw HTML block that just includes\nthe " -- "``" -- " tag (because it ends with the following blank\n" +- "`` tag (because it ends with the following blank\n" - "line). 
So the contents get interpreted as CommonMark:\n\n" - "````````````````" - "```````````````` " @@ -2169,8 +2174,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n" -- "Finally, in this case, the ``" -- " tags are interpreted\nas [raw HTML] *inside*" +- "Finally, in this case, the `` tags are interpreted\nas " +- "[raw HTML] *inside*" - " the CommonMark paragraph. (Because\n" - "the tag is not on a line by itself, we get inline " - "HTML\nrather than an [HTML block].)\n\n" @@ -2183,8 +2188,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "HTML tags designed to contain literal content\n(`pre`, " -- "`script`, `style`, `textarea`), comments" -- ", processing instructions,\nand declarations are treated somewhat differently.\n" +- "`script`, `style`, `textarea`" +- "), comments, processing instructions,\nand declarations are treated somewhat differently.\n" - "Instead of ending at the first blank line, these blocks\n" - "end at the first line containing a corresponding end tag.\n" - "As a result, these blocks can contain blank lines:\n\n" @@ -2361,8 +2366,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n" -- "An HTML block of types 1--6 can interrupt a paragraph, " -- "and need not be\npreceded by a blank line.\n\n" +- An HTML block of types 1-- +- "6 can interrupt a paragraph, and need not be\npreceded by a blank line.\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -2392,23 +2397,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "This rule differs from John Gruber's original Markdown syntax\n" -- "specification, which says:" -- "\n\n> The only restrictions are that block-level HTML elements —\n> " +- "specification, which says:\n\n" +- "> " +- "The only restrictions are that block-level HTML elements —\n> " - "e.g. 
`
    `, `
    `" -- ", `
    `, `

    `, etc. — " -- "must be separated from\n> " +- ", `

    `, `

    `" +- ", etc. — must be separated from\n> " - "surrounding content by blank lines, and the start and end tags of the\n> " - "block should not be indented with spaces or tabs.\n\n" -- "In some ways Gruber's rule is more restrictive than the one " -- "given\nhere:\n\n" -- "- It requires that an HTML block be preceded by a blank line." -- "\n- It does not allow the start tag to be indented.\n" +- "In some ways Gruber'" +- "s rule is more restrictive than the one given\nhere:\n\n" +- "- It requires that an HTML block be preceded by a blank line.\n" +- "- It does not allow the start tag to be indented.\n" - "- It requires a matching end tag, which it also does not allow to\n " - "be indented.\n\n" -- "Most Markdown implementations (including some of Gruber's own) " -- "do not\nrespect all of these restrictions.\n\n" -- "There is one respect, however, in which Gruber's rule is " -- "more liberal\nthan the one given here, since it allows blank lines to occur inside\n" +- "Most Markdown implementations (including some of Gruber'" +- "s own) do not\nrespect all of these restrictions.\n\n" +- "There is one respect, however, in which Gruber'" +- "s rule is more liberal\n" +- "than the one given here, since it allows blank lines to occur inside\n" - "an HTML block. " - "There are two reasons for disallowing them here.\n" - "First, it removes the need to parse balanced tags, which is\n" @@ -2437,14 +2444,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````\n\n\n" - "Some Markdown implementations have adopted a convention of\n" - "interpreting content inside tags as text if the open tag has\nthe attribute " -- "`markdown=1`" -- ". The rule given above seems a simpler and\n" +- "`markdown=1`. The rule given above seems a simpler and\n" - "more elegant way of achieving the same expressive power, which is also\n" -- much simpler to parse. 
-- "\n\nThe main potential drawback is that one can no longer paste HTML\n" +- "much simpler to parse.\n\n" +- "The main potential drawback is that one can no longer paste HTML\n" - "blocks into Markdown documents with 100% reliability. However,\n" -- "*in most cases*" -- " this will work fine, because the blank lines in\n" +- "*in most cases* this will work fine, because the blank lines in\n" - "HTML are usually followed by HTML block tags. For example:\n\n" - "````````````````" - "```````````````` " @@ -2456,8 +2461,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "There are problems, however, if the inner tags are indented\n" -- "*and*" -- " separated by spaces, as then they will be interpreted as\n" +- "*and* separated by spaces, as then they will be interpreted as\n" - "an indented code block:\n\n" - "````````````````" - "```````````````` " @@ -2472,16 +2476,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "Fortunately, blank lines are usually not necessary and can be\n" -- "deleted. The exception is inside `

    `"
    -- " tags, but as described\n[above][HTML blocks]"
    +- "deleted.  The exception is inside `
    ` tags, but as described\n"
    +- "[above][HTML blocks]"
     - ", raw HTML blocks starting with `
    `\n*can*"
     - " contain blank lines.\n\n"
     - "## Link reference definitions\n\n"
    -- "A [link reference definition](@)"
    -- "\nconsists of a [link label]"
    +- "A [link reference definition](@)\nconsists of a [link label]"
     - ", optionally preceded by up to three spaces of\nindentation, followed\n"
    -- "by a colon (`:`), optional spaces or tabs ("
    -- "including up to one\n[line ending]), a [link destination],\n"
    +- "by a colon (`:`"
    +- "), optional spaces or tabs (including up to one\n[line ending]"
    +- "), a [link destination],\n"
     - "optional spaces or tabs (including up to one\n[line ending]"
     - "), and an optional [link\ntitle]"
     - ", which if it is present must be separated\nfrom the [link destination]"
    @@ -2779,8 +2783,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````\n\n\n"
     - "## Paragraphs\n\n"
     - "A sequence of non-blank lines that cannot be interpreted as other\n"
    -- "kinds of blocks forms a [paragraph](@)"
    -- ".\nThe contents of the paragraph are the result of parsing the\nparagraph'"
    +- "kinds of blocks forms a [paragraph](@).\n"
    +- "The contents of the paragraph are the result of parsing the\nparagraph'"
     - "s raw content as inlines.  The paragraph's raw content\n"
     - "is formed by concatenating the lines and removing initial and final\n"
     - "spaces or tabs.\n\nA simple example with two paragraphs:\n"
    @@ -2868,29 +2872,30 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "````````````````\n\n\n\n"
     - "# Container blocks\n\n"
    -- "A [container block](#container-blocks) is a block that has "
    -- "other\nblocks as its contents.  There are two basic kinds of container blocks:\n["
    +- "A [container block](#container-blocks)"
    +- " is a block that has other\n"
    +- "blocks as its contents.  There are two basic kinds of container blocks:\n["
     - "block quotes] and [list items].\n[Lists]"
     - " are meta-containers for [list items].\n\n"
     - "We define the syntax for container blocks recursively.  The general\n"
     - "form of the definition is:\n\n"
    -- "> If X is a sequence of blocks, then the result of\n> "
    +- "> "
    +- "If X is a sequence of blocks, then the result of\n> "
     - transforming X in such-and-such a way is a container of type Y
    -- "\n> with these blocks as its content."
    -- "\n\nSo, we explain what counts as a block quote or list item by explaining\n"
    -- how these can be *generated*
    -- " from their contents. This should suffice\n"
    -- "to define the syntax, although it does not give a recipe for *parsing"
    -- "*\nthese constructions.  (A recipe is provided below in the section entitled\n"
    +- "\n> with these blocks as its content.\n\n"
    +- "So, we explain what counts as a block quote or list item by explaining\n"
    +- "how these can be *generated* from their contents. This should suffice\n"
    +- "to define the syntax, although it does not give a recipe for "
    +- "*parsing*\n"
    +- "these constructions.  (A recipe is provided below in the section entitled\n"
     - "[A parsing strategy](#appendix-a-parsing"
     - "-strategy).)\n\n"
     - "## Block quotes\n\n"
    -- "A [block quote marker](@)"
    -- ",\noptionally preceded by up to three spaces of indentation,\n"
    -- "consists of (a) the character `>`"
    -- " together with a following space of\n"
    -- "indentation, or (b) a single character `>` not followed "
    -- "by a space of\nindentation.\n\n"
    +- "A [block quote marker](@),\n"
    +- "optionally preceded by up to three spaces of indentation,\n"
    +- "consists of (a) the character `>` together with a following space of\n"
    +- "indentation, or (b) a single character `>`"
    +- " not followed by a space of\nindentation.\n\n"
     - "The following rules define [block quotes]:\n\n"
     - "1.  "
     - "**Basic case.**  If a string of lines *Ls*"
    @@ -2901,20 +2906,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "2.  "
     - "**Laziness.**  If a string of lines "
     - "*Ls* constitute a [block\n    quote](#block-quotes)"
    -- " with contents *Bs*"
    -- ", then the result of deleting\n    the initial [block quote marker]"
    -- " from one or\n    "
    +- " with contents *Bs*, then the result of deleting\n    the initial ["
    +- "block quote marker] from one or\n    "
     - "more lines in which the next character other than a space or tab after the\n    "
     - "[block quote marker] is [paragraph continuation\n    text] is a block quote with "
     - "*Bs* as its content.\n    "
    -- "[Paragraph continuation text](@)"
    -- " is text\n    "
    +- "[Paragraph continuation text](@) is text\n    "
     - "that will be parsed as part of the content of a paragraph, but does\n    "
     - "not occur at the beginning of the paragraph.\n\n"
     - "3.  "
    -- "**Consecutiveness.**"
    -- "  A document cannot contain two [block\n    quotes] in a row unless there is a "
    -- "[blank line] between them.\n\n"
    +- "**Consecutiveness.**  A document cannot contain two ["
    +- "block\n    quotes] in a row unless there is a [blank line]"
    +- " between them.\n\n"
     - "Nothing else counts as a [block quote](#block-quotes).\n"
     - "\nHere is a simple example:\n"
     - "\n"
    @@ -2936,8 +2939,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "baz

    \n\n" - "````````````````" - "````````````````\n\n\n" -- "The `>` characters can be preceded by up to three spaces of " -- "indentation:\n\n" +- "The `>`" +- " characters can be preceded by up to three spaces of indentation:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -2957,8 +2960,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n" - "````````````````" - "````````````````\n\n\n" -- "The Laziness clause allows us to omit the `>`" -- " before\n[paragraph continuation text]:\n\n" +- "The Laziness clause allows us to omit the `>` before\n[" +- "paragraph continuation text]:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -3004,8 +3007,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
  • bar
  • \n\n" - "````````````````" - "````````````````\n\n\n" -- "For the same reason, we can't omit the `> ` in " -- "front of\nsubsequent lines of an indented or fenced code block:\n\n" +- "For the same reason, we can't omit the `> `" +- " in front of\nsubsequent lines of an indented or fenced code block:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -3075,8 +3078,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "(Most current Markdown implementations, including John Gruber's\noriginal " -- "`Markdown.pl`, will parse this example as a single " -- "block quote\n" +- "`Markdown.pl`" +- ", will parse this example as a single block quote\n" - "with two paragraphs. But it seems better to allow the author to decide\n" - "whether two block quotes or one are wanted.)\n\n" - "Consecutiveness means that if we put these block quotes together,\n" @@ -3146,8 +3149,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "It is a consequence of the Laziness rule that any number\nof initial " -- "`>`" -- "s may be omitted on a continuation line of a\nnested block quote:\n\n" +- "`>`s may be omitted on a continuation line of a\n" +- "nested block quote:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -3170,8 +3173,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````\n\n\n" - "When including an indented code block in a block quote,\nremember that the " - "[block quote marker] includes\nboth the `>`" -- " and a following space of indentation. So *five spaces*" -- " are needed\nafter the `>`:\n\n" +- " and a following space of indentation. 
So *five spaces* are needed\n" +- "after the `>`:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -3182,26 +3185,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "## List items\n\n" -- "A [list marker](@)" -- " is a\n[bullet list marker] or an [ordered list marker].\n\n" +- "A [list marker](@) is a\n[bullet list marker]" +- " or an [ordered list marker].\n\n" - "A [bullet list marker](@)\nis a `-`, " - "`+`, or `*` character.\n\n" -- "An [ordered list marker](@)" -- "\nis a sequence of 1--9 arabic digits (" -- "`0-9`), followed by either a\n`.`" -- " character or a `)`" +- "An [ordered list marker](@)\nis a sequence of 1--" +- "9 arabic digits (`0-9`" +- "), followed by either a\n`.` character or a `)`" - " character. (The reason for the length\n" - "limit is that with 10 digits we start seeing integer overflows\n" - "in some browsers.)\n\nThe following rules define [list items]:\n\n" - "1. " - "**Basic case.** If a sequence of lines *Ls*" - " constitute a sequence of\n blocks *Bs*" -- " starting with a character other than a space or tab, and *M* " -- "is\n a list marker of width *W* followed by 1 ≤ *N*" +- " starting with a character other than a space or tab, and *M*" +- " is\n a list marker of width *W* followed by 1 ≤ *N*" - " ≤ 4 spaces of indentation,\n then the result of prepending " - "*M* and the following spaces to the first line\n of *Ls*" -- ", and indenting subsequent lines of *Ls* by *W + " -- "N* spaces, is a\n list item with *Bs*" +- ", and indenting subsequent lines of *Ls* by " +- "*W + N* spaces, is a\n list item with *Bs*" - " as its contents. The type of the list item\n " - "(bullet or ordered) is determined by the type of its list marker.\n " - "If the list item is ordered, then it is also assigned a start\n " @@ -3209,8 +3211,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "1. 
When the first list item in a [list] interrupts\n a paragraph" - "---that is, when it starts on a line that would\n " - "otherwise count as [paragraph continuation text]---then (a)\n " -- "the lines *Ls* must not begin with a blank line, and (" -- "b) if\n the list item is ordered, the start number must be 1.\n " +- the lines *Ls* +- " must not begin with a blank line, and (b) if\n " +- "the list item is ordered, the start number must be 1.\n " - "2. If any line is a [thematic break][thematic breaks" - "] then\n that line is not a list item.\n\n" - "For example, let *Ls* be the lines\n" @@ -3226,8 +3229,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n" -- "And let *M* be the marker `1.`, and *N" -- "* = 2. Then rule #1 says\n" +- "And let *M* be the marker `1.`, and " +- "*N* = 2. Then rule #1 says\n" - "that the following is an ordered list item with start number 1,\n" - "and the same contents as *Ls*:\n\n" - "````````````````" @@ -3304,14 +3307,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "Here `two` occurs in the same column as the list marker " -- "`1.`" -- ",\nbut is actually contained in the list item, because there is\n" -- sufficient indentation after the last containing blockquote marker. -- "\n\nThe converse is also possible. In the following example, the word " -- "`two`" -- "\noccurs far to the right of the initial text of the list item, " -- "`one`" -- ", but\n" +- "`1.`,\n" +- "but is actually contained in the list item, because there is\n" +- "sufficient indentation after the last containing blockquote marker.\n\n" +- "The converse is also possible. 
In the following example, the word " +- "`two`\n" +- "occurs far to the right of the initial text of the list item, " +- "`one`, but\n" - "it is not considered part of the list item, because it is not " - "indented\nfar enough past the blockquote marker:\n\n" - "````````````````" @@ -3410,16 +3412,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "-1. not ok\n.\n" - "

    -1. not ok

    \n" - "````````````````" -- "````````````````\n\n" -- "\n\n2. **Item starting with indented code.**" +- "````````````````\n\n\n\n" +- 2. **Item starting with indented code.** - " If a sequence of lines *Ls*\n constitute a sequence of blocks " - "*Bs* starting with an indented code\n block, and " -- "*M* is a list marker of width *W*" -- " followed by\n one space of indentation, then the result of prepending " +- "*M* is a list marker of width *W* followed by\n " +- "one space of indentation, then the result of prepending " - "*M* and the\n following space to the first line of *Ls*" - ", and indenting subsequent lines\n of *Ls* by " -- "*W + 1* spaces, is a list item with *Bs* " -- "as its contents.\n " +- "*W + 1* spaces, is a list item with *Bs*" +- " as its contents.\n " - "If a line is empty, then it need not be indented. " - "The type of the\n " - "list item (bullet or ordered) is determined by the type of its list\n " @@ -3449,9 +3451,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n" - "````````````````" - "````````````````\n\n\n" -- "If the *first* block in the list item is an indented code " -- "block,\nthen by rule #2, the contents must be preceded by " -- "*one* space of indentation\nafter the list marker:\n\n" +- If the *first* +- " block in the list item is an indented code block,\n" +- "then by rule #2, the contents must be preceded by *one*" +- " space of indentation\nafter the list marker:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -3518,16 +3521,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    bar

    \n\n" - "\n" - "````````````````" -- "````````````````\n\n" -- "\n3. **Item starting with a blank line.**" -- " If a sequence of lines *Ls*" -- "\n starting with a single [blank line] constitute a (possibly empty)\n " -- "sequence of blocks *Bs*, and *M* is a list marker " -- "of width *W*,\n then the result of prepending *M*" -- " to the first line of *Ls*, and\n preceding subsequent lines of " -- "*Ls* by *W + 1* spaces of indentation, " -- "is a\n list item with *Bs*" -- " as its contents.\n " +- "````````````````\n\n\n" +- 3. **Item starting with a blank line.** +- " If a sequence of lines *Ls*\n starting with a single [blank line" +- "] constitute a (possibly empty)\n sequence of blocks *Bs*, and " +- "*M* is a list marker of width *W*,\n " +- "then the result of prepending *M* to the first line of " +- "*Ls*, and\n preceding subsequent lines of *Ls* by " +- "*W + 1* spaces of indentation, is a\n list item with " +- "*Bs* as its contents.\n " - "If a line is empty, then it need not be indented. " - "The type of the\n " - "list item (bullet or ordered) is determined by the type of its list\n " @@ -3557,8 +3559,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "A list item can begin with at most one blank line.\n" -- "In the following example, `foo`" -- " is not part of the list\nitem:\n\n" +- "In the following example, `foo` is not part of the list\n" +- "item:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -3616,10 +3618,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo\n*\n\nfoo\n1.\n.\n

    foo\n" - "*

    \n

    foo\n1.

    \n" - "````````````````" -- "````````````````\n\n" -- "\n4. **Indentation.** If a sequence of lines " -- "*Ls*" -- " constitutes a list item\n " +- "````````````````\n\n\n" +- "4. **Indentation.** If a sequence of lines " +- "*Ls* constitutes a list item\n " - "according to rule #1, #2, or #3, then the result " - "of preceding each line\n of *Ls*" - " by up to three spaces of indentation (the same for each line) " @@ -3680,11 +3681,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " with two lines.\n\n indented code\n\n > A block quote.\n" - "
    \n" - "````````````````" -- "````````````````\n\n" -- "\n\n5. **Laziness.** If a string of lines " +- "````````````````\n\n\n\n" +- "5. **Laziness.** If a string of lines " - "*Ls* constitute a [list\n item](#list-items)" -- " with contents *Bs*" -- ", then the result of deleting\n " +- " with contents *Bs*, then the result of deleting\n " - "some or all of the indentation from one or more lines in which the\n " - "next character other than a space or tab after the indentation is\n [" - "paragraph continuation text] is a\n " @@ -3738,8 +3738,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n" - "\n" - "````````````````" -- "````````````````\n\n" -- "\n\n6. **That's all.**" +- "````````````````\n\n\n\n" +- "6. **That's all.**" - " Nothing that is not counted as a list item by rules\n #1--" - "5 counts as a [list item](#list-items).\n\n" - "The rules for sublists follow from the general rules\n[above][List items" @@ -3826,30 +3826,33 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "### Motivation\n\n" -- "John Gruber's Markdown spec says the following about list items" -- ":\n\n" -- "1. \"" -- "List markers typically start at the left margin, but may be indented\n " +- "John Gruber'" +- "s Markdown spec says the following about list items:\n\n" +- "1. " +- "\"List markers typically start at the left margin, but may be indented\n " - "by up to three spaces. List markers must be followed by one or more\n " - "spaces or a tab.\"\n\n" -- "2. \"" -- "To make lists look nice, you can wrap items with hanging indents." -- "...\n But if you don't want to, you don'" +- "2. " +- "\"To make lists look nice, you can wrap items with hanging indents" +- "....\n But if you don't want to, you don'" - "t have to.\"\n\n" -- "3. \"List items may consist of multiple paragraphs. Each subsequent\n " +- "3. " +- "\"List items may consist of multiple paragraphs. 
Each subsequent\n " - "paragraph in a list item must be indented by either 4 spaces or one\n " - "tab.\"\n\n" -- "4. \"" -- "It looks nice if you indent every line of the subsequent paragraphs,\n " -- "but here again, Markdown will allow you to be lazy.\"" -- "\n\n5. \"" +- "4. " +- "\"It looks nice if you indent every line of the subsequent paragraphs,\n " +- "but here again, Markdown will allow you to be lazy.\"\n\n" +- "5. " +- "\"" - "To put a blockquote within a list item, the " -- "blockquote's `>`" -- "\n delimiters need to be indented.\"\n\n" -- "6. \"" +- "blockquote's `>`\n " +- "delimiters need to be indented.\"\n\n" +- "6. " +- "\"" - "To put a code block within a list item, the code block needs to be\n " -- "indented twice — 8 spaces or two tabs.\"" -- "\n\nThese rules specify that a paragraph under a list item must be indented\n" +- "indented twice — 8 spaces or two tabs.\"\n\n" +- "These rules specify that a paragraph under a list item must be indented\n" - "four spaces (presumably, from the left margin, rather than the start of\n" - "the list marker, but this is not said), and that code under a " - "list item\n" @@ -3857,14 +3860,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that a block quote must be indented, but not by how much; " - "however, the\nexample given has four spaces indentation. Although nothing is said\n" - "about other kinds of block-level content, it is certainly reasonable to\n" -- infer that *all* -- " block elements under a list item, including other\n" +- "infer that *all* block elements under a list item, including other\n" - "lists, must be indented four spaces. This principle has been called the\n" - "*four-space rule*.\n\n" - "The four-space rule is clear and principled, and if the reference\nimplementation " -- "`Markdown.pl`" -- " had followed it, it probably would have\nbecome the standard. However, " -- "`Markdown.pl`" +- "`Markdown.pl` had followed it, it probably would have\n" +- "become the standard. 
However, `Markdown.pl`" - " allowed paragraphs and\n" - "sublists to start with only two spaces indentation, at least on the\n" - "outer level. Worse, its behavior was inconsistent: a sublist of an\n" @@ -3873,17 +3874,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "implementations of Markdown have developed very different rules for\n" - "determining what comes under a list item. " - "(Pandoc and python-Markdown,\n" -- "for example, stuck with Gruber's syntax description and the four-" -- "space\n" +- "for example, stuck with Gruber'" +- "s syntax description and the four-space\n" - "rule, while discount, redcarpet, marked, PHP Markdown, " -- "and others\nfollowed `Markdown.pl`" -- "'s behavior more closely.)" -- "\n\nUnfortunately, given the divergences between implementations, there\n" +- "and others\nfollowed `Markdown.pl`'" +- "s behavior more closely.)\n\n" +- "Unfortunately, given the divergences between implementations, there\n" - "is no way to give a spec for list items that will be guaranteed not\n" - "to break any existing documents. However, the spec given here should\n" - "correctly handle lists formatted with either the four-space rule or\n" -- "the more forgiving `Markdown.pl` behavior, provided they " -- "are laid out\nin a way that is natural for a human to read.\n\n" +- "the more forgiving `Markdown.pl`" +- " behavior, provided they are laid out\n" +- "in a way that is natural for a human to read.\n\n" - "The strategy here is to let the width and indentation of the list marker\n" - "determine the indentation necessary for blocks to fall under the list\n" - "item, rather than having a fixed and arbitrary number. The writer can\n" @@ -3898,36 +3900,37 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "unnatural. It is quite unintuitive that\n\n" - "``` markdown\n- foo\n\n bar\n\n - baz\n```" - "\n\nshould be parsed as two lists with an intervening paragraph,\n" -- "\n``` html\n
      \n" -- "
    • foo
    • \n
    \n" -- "

    bar

    \n
      \n" +- "\n``` html\n" +- "
        \n
      • foo
      • \n" +- "
      \n

      bar

      \n
        \n" - "
      • baz
      • \n
      \n" - "```\n\n" - "as the four-space rule demands, rather than a single list,\n" -- "\n``` html\n
        \n
      • \n" -- "

        foo

        \n" +- "\n``` html\n" +- "
          \n
        • \n

          foo

          \n" - "

          bar

          \n
            \n" - "
          • baz
          • \n
          \n" - "
        • \n
        \n```\n\n" - "The choice of four spaces is arbitrary. It can be learned, but it is\n" -- "not likely to be guessed, and it trips up beginners regularly." -- "\n\nWould it help to adopt a two-space rule? The problem is that such\n" +- "not likely to be guessed, and it trips up beginners regularly.\n\n" +- "Would it help to adopt a two-space rule? The problem is that such\n" - "a rule, together with the rule allowing up to three spaces of indentation " - "for\nthe initial list marker, allows text that is indented " -- "*less than*" -- " the\noriginal list marker to be included in the list item. For example,\n" +- "*less than* the\n" +- "original list marker to be included in the list item. For example,\n" - "`Markdown.pl` parses\n\n" - "``` markdown\n - one\n\n two\n```" - "\n\nas a single list item, with `two` a continuation paragraph:\n" -- "\n``` html\n
          \n
        • \n" -- "

          one

          \n

          two

          \n" -- "
        • \n
        \n```\n\nand similarly\n" -- "\n``` markdown\n> - one\n>\n> two\n```\n\nas\n" -- "\n``` html\n
        \n
          \n" -- "
        • \n

          one

          \n" +- "\n``` html\n" +- "
            \n
          • \n

            one

            \n" - "

            two

            \n
          • \n" -- "
          \n
        \n```\n\n" -- "This is extremely unintuitive.\n" +- "
      \n```\n\nand similarly\n" +- "\n``` markdown\n> - one\n>\n> two\n```\n\nas\n" +- "\n``` html\n" +- "
      \n
        \n
      • \n" +- "

        one

        \n

        two

        \n" +- "
      • \n
      \n
      \n" +- "```\n\nThis is extremely unintuitive.\n" - "\nRather than requiring a fixed indent from the margin, we could require\n" - "a fixed indent (say, two spaces, or even one space) from " - "the list marker (which\n" @@ -3948,10 +3951,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n" - "where the code is indented eight spaces. " - "The spec above, by contrast, will\n" -- "parse this text as expected, since the code block's indentation " -- "is measured\nfrom the beginning of `foo`.\n\n" -- The one case that needs special treatment is a list item that *starts* -- "\n" +- "parse this text as expected, since the code block'" +- "s indentation is measured\nfrom the beginning of `foo`.\n\n" +- "The one case that needs special treatment is a list item that *starts*\n" - "with indented code. " - "How much indentation is required in that case, since\nwe don'" - "t have a \"first paragraph\"" @@ -3962,30 +3964,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "four-space rule in cases where the list marker plus its initial indentation\n" - "takes four spaces (a common case), but diverge in other cases.\n\n" - "## Lists\n\n" -- "A [list](@)" -- " is a sequence of one or more\nlist items [of the same type]" -- ". The list items\nmay be separated by any number of blank lines." -- "\n\nTwo list items are [of the same type](@)" -- "\nif they begin with a [list marker] of the same type.\n" +- "A [list](@) is a sequence of one or more\nlist items " +- "[of the same type]. 
The list items\n" +- "may be separated by any number of blank lines.\n\n" +- "Two list items are [of the same type](@)\n" +- "if they begin with a [list marker] of the same type.\n" - "Two list markers are of the\n" - "same type if (a) they are bullet list markers using the same character\n(" -- "`-`, `+`, or `*`) or (b" -- ") they are ordered list numbers with the same\ndelimiter (either " -- "`.` or `)`).\n\n" -- "A list is an [ordered list](@)" -- "\nif its constituent list items begin with\n[ordered list markers], and a\n" -- "[bullet list](@)" -- " if its constituent list\nitems begin with [bullet list markers].\n\n" -- "The [start number](@)" -- "\nof an [ordered list] is determined by the list number of\n" +- "`-`, `+`, or `*`" +- ") or (b) they are ordered list numbers with the same\n" +- "delimiter (either `.` or `)`).\n\n" +- "A list is an [ordered list](@)\n" +- "if its constituent list items begin with\n[ordered list markers], and a\n" +- "[bullet list](@) if its constituent list\nitems begin with [" +- "bullet list markers].\n\n" +- "The [start number](@)\nof an [ordered list]" +- " is determined by the list number of\n" - "its initial list item. The numbers of subsequent list items are\n" - "disregarded.\n\n" -- "A list is [loose](@)" -- " if any of its constituent\n" +- "A list is [loose](@) if any of its constituent\n" - "list items are separated by blank lines, or if any of its constituent\n" - "list items directly contain two block-level elements with a blank line\n" -- "between them. Otherwise a list is [tight](@)" -- ".\n" +- "between them. Otherwise a list is [tight](@).\n" - "(The difference in HTML output is that paragraphs in a loose list " - "are\nwrapped in `

      `" - " tags, while paragraphs in a tight list are not.)\n\n" @@ -4022,33 +4022,38 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    • baz
    • \n
    \n" - "````````````````" - "````````````````\n\n" -- "`Markdown.pl` does not allow this, through fear of " -- "triggering a list\nvia a numeral in a hard-wrapped line:\n\n" -- "``` markdown\nThe number of windows in my house is\n14. " +- "`Markdown.pl`" +- " does not allow this, through fear of triggering a list\n" +- "via a numeral in a hard-wrapped line:\n\n" +- "``` markdown\n" +- "The number of windows in my house is\n14. " - "The number of doors is 6.\n```\n\n" -- "Oddly, though, `Markdown.pl` *does* allow " -- "a blockquote to\ninterrupt a paragraph, even though the same considerations might\n" -- "apply.\n\n" +- "Oddly, though, `Markdown.pl` *does*" +- " allow a blockquote to\n" +- "interrupt a paragraph, even though the same considerations might\napply.\n\n" - "In CommonMark, we do allow lists to interrupt paragraphs, for\n" - "two reasons. First, it is natural and not uncommon for people\n" - "to start lists without blank lines:\n\n" -- "``` markdown\nI need to buy\n- new shoes\n- a coat\n" -- "- a plane ticket\n```\n\nSecond, we are attracted to a\n\n" +- "``` markdown\n" +- "I need to buy\n- new shoes\n- a coat\n- a plane ticket\n" +- "```\n\nSecond, we are attracted to a\n\n" - "> " -- "[principle of uniformity](@)" -- ":\n> if a chunk of text has a certain\n> " +- "[principle of uniformity](@):\n> " +- "if a chunk of text has a certain\n> " - "meaning, it will continue to have the same meaning when put into a\n> " - "container block (such as a list item or blockquote).\n\n" -- "(Indeed, the spec for [list items] and [block quotes] " -- "presupposes\nthis principle.) This principle implies that if\n\n" -- "``` markdown\n * I need to buy\n - new shoes\n - a coat\n" -- " - a plane ticket\n```\n\n" +- "(Indeed, the spec for [list items] and [block quotes]" +- " presupposes\nthis principle.) 
This principle implies that if\n\n" +- "``` markdown\n" +- " * I need to buy\n - new shoes\n - a coat\n - a plane ticket\n" +- "```\n\n" - "is a list item containing a paragraph followed by a nested sublist,\n" - "as all Markdown implementations agree it is (though the paragraph\n" - "may be rendered without `

    ` tags, since the list is \"" - "tight\"),\nthen\n\n" -- "``` markdown\nI need to buy\n- new shoes\n- a coat\n" -- "- a plane ticket\n```\n\n" +- "``` markdown\n" +- "I need to buy\n- new shoes\n- a coat\n- a plane ticket\n" +- "```\n\n" - "by itself should be a paragraph followed by a nested sublist.\n" - "\nSince it is well established Markdown practice to allow lists to\n" - "interrupt paragraphs inside list items, the [principle of\nuniformity]" @@ -4056,10 +4061,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[reStructuredText](https://" - "docutils.sourceforge.net/rst.html)\n" - "takes a different approach, requiring blank lines before lists\n" -- even inside other list items.) -- "\n\nIn order to solve the problem of unwanted lists in paragraphs with\n" -- "hard-wrapped numerals, we allow only lists starting with `1` " -- "to\ninterrupt paragraphs. Thus,\n\n" +- "even inside other list items.)\n\n" +- "In order to solve the problem of unwanted lists in paragraphs with\n" +- "hard-wrapped numerals, we allow only lists starting with `1`" +- " to\ninterrupt paragraphs. Thus,\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4176,9 +4181,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- e\n\n" - "````````````````" - "````````````````\n\n" -- "And here, `3. c` is treated as in indented code " -- "block,\nbecause it is indented four spaces and preceded by a\n" -- "blank line.\n\n" +- "And here, `3. 
c`" +- " is treated as in indented code block,\n" +- "because it is indented four spaces and preceded by a\nblank line.\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4355,19 +4360,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">\n" - "````````````````" - "````````````````\n\n" -- "`hi` is parsed as code, leaving the backtick at the end " -- "as a literal\nbacktick.\n\n\n\n" +- "`hi`" +- " is parsed as code, leaving the backtick at the end as a literal\n" +- "backtick.\n\n\n\n" - "## Code spans\n\n" -- "A [backtick string](@)" -- "\nis a string of one or more backtick characters (`` ` ``" +- "A [backtick string](@)\n" +- "is a string of one or more backtick characters (`` ` ``" - ") that is neither\npreceded nor followed by a backtick.\n\n" -- "A [code span](@) begins with a backtick string and ends " -- "with\na backtick string of equal length. The contents of the code span are\n" +- "A [code span](@)" +- " begins with a backtick string and ends with\n" +- "a backtick string of equal length. The contents of the code span are\n" - "the characters between these two backtick strings, normalized in the\nfollowing ways:\n\n" - "- First, [line endings] are converted to [spaces].\n" -- "- If the resulting string both begins *and*" -- " ends with a [space]\n character, but does not consist entirely of [space" -- "]\n characters, a single [space] character is removed from the\n " +- "- If the resulting string both begins *and* ends with a [space]\n " +- "character, but does not consist entirely of [space]\n characters, a single [" +- "space] character is removed from the\n " - "front and back. This allows you to include code that begins\n " - "or ends with backtick characters, which must be separated by\n " - "whitespace from the opening or closing backtick strings.\n\n" @@ -4417,8 +4424,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    a

    \n" - "````````````````" - "````````````````\n\n" -- "Only [spaces], and not [unicode whitespace] in general" -- ", are\nstripped in this way:\n\n" +- "Only [spaces], and not [unicode whitespace]" +- " in general, are\nstripped in this way:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4476,8 +4483,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "Backslash escapes are never needed, because one can always choose a\nstring of " -- "*n*" -- " backtick characters as delimiters, where the code does\n" +- "*n* backtick characters as delimiters, where the code does\n" - "not contain any strings of exactly *n* backtick characters.\n\n" - "````````````````" - "```````````````` " @@ -4498,8 +4504,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Code span backticks have higher precedence than any other inline\n" - "constructs except HTML tags and autolinks. " - "Thus, for example, this is\n" -- "not parsed as emphasized text, since the second `*` is part of " -- "a code\nspan:\n\n" +- "not parsed as emphasized text, since the second `*`" +- " is part of a code\nspan:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4592,20 +4598,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[Markdown syntax\ndescription" - "](https://daringfireball.net/projects/markdown" - "/syntax#em) says:\n\n" -- "> Markdown treats asterisks (`*`) and " -- "underscores (`_`" -- ") as indicators of\n> emphasis. Text wrapped with one `*` or " -- "`_` will be wrapped with an HTML\n> " -- "`` tag; double `*`'s or `_" -- "`'s will be wrapped with an HTML ``" -- "\n> tag." -- "\n\nThis is enough for most users, but these rules leave much undecided,\n" +- "> " +- "Markdown treats asterisks (`*`" +- ") and underscores (`_`) as indicators of\n> " +- "emphasis. 
Text wrapped with one `*` or `_`" +- " will be wrapped with an HTML\n> ``" +- " tag; double `*`'s or `_`'" +- "s will be wrapped with an HTML ``\n> " +- "tag.\n\n" +- "This is enough for most users, but these rules leave much undecided,\n" - "especially when it comes to nested emphasis. The original\n" -- "`Markdown.pl` test suite makes it clear that triple `*" -- "**` and\n`___`" +- "`Markdown.pl` test suite makes it clear that triple " +- "`***` and\n`___`" - " delimiters can be used for strong emphasis, and most\n" - "implementations have also allowed the following patterns:\n\n" -- "``` markdown\n***strong emph***\n" +- "``` markdown\n" +- "***strong emph***\n" - "***strong** in emph*\n" - "***emph* in strong**\n" - "**in strong *emph***\n" @@ -4613,53 +4621,57 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The following patterns are less widely supported, but the intent\n" - "is clear and they are useful (especially in contexts like bibliography\n" - "entries):\n\n" -- "``` markdown\n*emph *with emph* in it*\n" +- "``` markdown\n" +- "*emph *with emph* in it*\n" - "**strong **with strong** in it**\n```\n\n" - "Many implementations have also restricted intraword emphasis to\nthe `*`" - " forms, to avoid unwanted emphasis in words containing\n" - "internal underscores. (It is best practice to put these in code\n" - "spans, but users often do not.)\n\n" -- "``` markdown\ninternal emphasis: foo*bar*baz\n" +- "``` markdown\n" +- "internal emphasis: foo*bar*baz\n" - "no emphasis: foo_bar_baz\n```\n\n" - "The rules given below capture all of these patterns, while allowing\n" -- for efficient parsing strategies that do not backtrack. -- "\n\nFirst, some definitions. A [delimiter run](@)" +- "for efficient parsing strategies that do not backtrack.\n\n" +- "First, some definitions. 
A [delimiter run](@)" - " is either\na sequence of one or more `*`" - " characters that is not preceded or\nfollowed by a non-backslash-escaped " - "`*` character, or a sequence\nof one or more `_`" - " characters that is not preceded or followed by\na non-backslash-escaped " - "`_` character.\n\n" -- "A [left-flanking delimiter run](@)" -- " is\na [delimiter run] that is (1) not followed by " -- "[Unicode whitespace],\n" -- "and either (2a) not followed by a [Unicode " -- "punctuation character], or\n(2b) followed by a [" -- "Unicode punctuation character] and\npreceded by [" -- "Unicode whitespace] or a [Unicode punctuation " -- "character].\nFor purposes of this definition, the beginning and the end of\n" +- "A [left-flanking delimiter run](@) is\na " +- "[delimiter run] that is (1) not followed by [" +- "Unicode whitespace],\n" +- "and either (2a) not followed by a [" +- "Unicode punctuation character], or\n" +- "(2b) followed by a [Unicode punctuation character" +- "] and\npreceded by [Unicode whitespace] or a [" +- "Unicode punctuation character].\n" +- "For purposes of this definition, the beginning and the end of\n" - "the line count as Unicode whitespace.\n\n" -- "A [right-flanking delimiter run](@)" -- " is\na [delimiter run] that is (1) not preceded by " -- "[Unicode whitespace],\n" -- "and either (2a) not preceded by a [Unicode " -- "punctuation character], or\n(2b) preceded by a [" -- "Unicode punctuation character] and\nfollowed by [" -- "Unicode whitespace] or a [Unicode punctuation " -- "character].\nFor purposes of this definition, the beginning and the end of\n" +- "A [right-flanking delimiter run](@) is\na " +- "[delimiter run] that is (1) not preceded by [" +- "Unicode whitespace],\n" +- "and either (2a) not preceded by a [" +- "Unicode punctuation character], or\n" +- "(2b) preceded by a [Unicode punctuation character" +- "] and\nfollowed by [Unicode whitespace] or a [" +- "Unicode punctuation character].\n" +- "For purposes of this definition, 
the beginning and the end of\n" - "the line count as Unicode whitespace.\n\n" - "Here are some examples of delimiter runs.\n\n" -- " - left-flanking but not right-flanking:\n\n ```\n" -- " ***abc\n _abc\n " +- " - left-flanking but not right-flanking:\n" +- "\n ```\n ***abc\n _abc\n " - "**\"abc\"\n _\"abc\"\n ```\n\n" -- " - right-flanking but not left-flanking:\n\n ```\n" -- " abc***\n abc_\n " +- " - right-flanking but not left-flanking:\n" +- "\n ```\n abc***\n abc_\n " - "\"abc\"**\n \"abc\"_\n ```\n\n" -- " - Both left and right-flanking:\n\n ```\n" -- " abc***def\n \"abc\"_\"def\"\n" -- " ```\n\n" -- " - Neither left nor right-flanking:\n\n ```\n" -- " abc *** def\n a _ b\n ```\n\n" -- "(The idea of distinguishing left-flanking and right-flanking\n" +- " - Both left and right-flanking:\n" +- "\n ```\n abc***def\n " +- "\"abc\"_\"def\"\n ```\n\n" +- " - Neither left nor right-flanking:\n" +- "\n ```\n abc *** def\n a _ b\n ```" +- "\n\n(The idea of distinguishing left-flanking and right-flanking\n" - "delimiter runs based on the character before and the character\n" - "after comes from Roopesh Chander's\n" - "[vfmd](https://web.archive.org" @@ -4672,64 +4684,62 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "are a bit more complex than the ones given here.)\n\n" - "The following rules define emphasis and strong emphasis:\n\n" - "1. " -- "A single `*` character [can open emphasis](@)" -- "\n iff (if and only if) it is part of a [" +- "A single `*` character [can open emphasis](@)\n " +- "iff (if and only if) it is part of a [" - "left-flanking delimiter run].\n\n" - "2. 
" -- "A single `_`" -- " character [can open emphasis] iff\n it is part of a [" -- "left-flanking delimiter run]\n " -- "and either (a) not part of a [right-flanking " -- "delimiter run]\n or (b) part of a [" -- "right-flanking delimiter run]\n preceded by a [" +- "A single `_` character [can open emphasis] iff\n " +- "it is part of a [left-flanking delimiter run]\n " +- "and either (a) not part of a [" +- "right-flanking delimiter run]\n or (b) part of a " +- "[right-flanking delimiter run]\n preceded by a [" - "Unicode punctuation character].\n\n" - "3. " -- "A single `*` character [can close emphasis](@)" -- "\n iff it is part of a [right-flanking delimiter run" +- "A single `*` character [can close emphasis](@)\n " +- "iff it is part of a [right-flanking delimiter run" - "].\n\n" - "4. " -- "A single `_`" -- " character [can close emphasis] iff\n it is part of a [" -- "right-flanking delimiter run]\n " -- "and either (a) not part of a [left-flanking " -- "delimiter run]\n or (b) part of a [" -- "left-flanking delimiter run]\n followed by a [" +- "A single `_` character [can close emphasis] iff\n " +- "it is part of a [right-flanking delimiter run]\n " +- "and either (a) not part of a [" +- "left-flanking delimiter run]\n or (b) part of a " +- "[left-flanking delimiter run]\n followed by a [" - "Unicode punctuation character].\n\n" - "5. " -- "A double `**` [can open strong emphasis](@)" -- "\n iff it is part of a [left-flanking delimiter run" +- "A double `**` [can open strong emphasis](@)\n " +- "iff it is part of a [left-flanking delimiter run" - "].\n\n" - "6. 
" -- "A double `__`" -- " [can open strong emphasis] iff\n it is part of a [" -- "left-flanking delimiter run]\n " -- "and either (a) not part of a [right-flanking " -- "delimiter run]\n or (b) part of a [" -- "right-flanking delimiter run]\n preceded by a [" +- "A double `__` [can open strong emphasis] iff\n " +- "it is part of a [left-flanking delimiter run]\n " +- "and either (a) not part of a [" +- "right-flanking delimiter run]\n or (b) part of a " +- "[right-flanking delimiter run]\n preceded by a [" - "Unicode punctuation character].\n\n" - "7. " -- "A double `**` [can close strong emphasis](@)" -- "\n iff it is part of a [right-flanking delimiter run" +- "A double `**` [can close strong emphasis](@)\n " +- "iff it is part of a [right-flanking delimiter run" - "].\n\n" - "8. " -- "A double `__`" -- " [can close strong emphasis] iff\n it is part of a [" -- "right-flanking delimiter run]\n " -- "and either (a) not part of a [left-flanking " -- "delimiter run]\n or (b) part of a [" -- "left-flanking delimiter run]\n followed by a [" -- "Unicode punctuation character]." -- "\n\n9. Emphasis begins with a delimiter that [can open emphasis" -- "] and ends\n with a delimiter that [can close emphasis]" +- "A double `__` [can close strong emphasis] iff\n " +- "it is part of a [right-flanking delimiter run]\n " +- "and either (a) not part of a [" +- "left-flanking delimiter run]\n or (b) part of a " +- "[left-flanking delimiter run]\n followed by a [" +- "Unicode punctuation character].\n\n" +- "9. " +- "Emphasis begins with a delimiter that [can open emphasis]" +- " and ends\n with a delimiter that [can close emphasis]" - ", and that uses the same\n character (`_` or `*`" - ") as the opening delimiter. The\n " - "opening and closing delimiters must belong to separate\n [delimiter runs" - "]. 
If one of the delimiters can both\n " - "open and close emphasis, then the sum of the lengths of the\n " - "delimiter runs containing the opening and closing delimiters\n " -- "must not be a multiple of 3 unless both lengths are\n multiples of 3." -- "\n\n10. Strong emphasis begins with a delimiter that\n [can open strong emphasis" -- "] and ends with a delimiter that\n [can close strong emphasis]" +- "must not be a multiple of 3 unless both lengths are\n multiples of 3.\n\n" +- "10. " +- "Strong emphasis begins with a delimiter that\n [can open strong emphasis]" +- " and ends with a delimiter that\n [can close strong emphasis]" - ", and that uses the same character\n (`_` or `*`" - ") as the opening delimiter. The\n " - "opening and closing delimiters must belong to separate\n [delimiter runs" @@ -4740,41 +4750,46 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "are multiples of 3.\n\n" - "11. " - "A literal `*` character cannot occur at the beginning or end of\n " -- "`*`-delimited emphasis or `**`-" -- "delimited strong emphasis, unless it\n is backslash-escaped." -- "\n\n12. " +- "`*`-delimited emphasis or `**`" +- "-delimited strong emphasis, unless it\n is backslash-escaped.\n\n" +- "12. " - "A literal `_` character cannot occur at the beginning or end of\n " -- "`_`-delimited emphasis or `__`-" -- "delimited strong emphasis, unless it\n is backslash-escaped.\n\n" +- "`_`-delimited emphasis or `__`" +- "-delimited strong emphasis, unless it\n is backslash-escaped.\n\n" - "Where rules 1--12 above are compatible with multiple parsings,\n" -- "the following principles resolve ambiguity:" -- "\n\n13. " +- "the following principles resolve ambiguity:\n\n" +- "13. " - "The number of nestings should be minimized. Thus, for example,\n " -- "an interpretation `...` is always " -- "preferred to\n " +- "an interpretation `...`" +- " is always preferred to\n " - "`...`.\n\n" - "14. " -- "An interpretation `..." 
-- "` is always\n preferred to " +- "An interpretation " +- "`...` is always\n preferred to " - "`...`." -- "\n\n15. When two potential emphasis or strong emphasis spans overlap,\n " +- "strong>`.\n\n" +- "15. " +- "When two potential emphasis or strong emphasis spans overlap,\n " - "so that the second begins before the first ends and ends after\n " - "the first ends, the first takes precedence. Thus, for example,\n " -- "`*foo _bar* baz_` is parsed as `" -- "foo _bar baz_` rather\n " -- "than `*foo bar* baz" -- "`." -- "\n\n16. When there are two potential emphasis or strong emphasis spans\n " +- "`*foo _bar* baz_` is parsed as " +- "`foo _bar baz_`" +- " rather\n than " +- "`*foo bar* baz`" +- ".\n\n" +- "16. " +- "When there are two potential emphasis or strong emphasis spans\n " - "with the same closing delimiter, the shorter one (the one that\n " - "opens later) takes precedence. Thus, for example,\n " -- "`**foo **bar baz**` is parsed " -- "as `**foo bar baz" -- "`\n rather than " +- "`**foo **bar baz**`" +- " is parsed as " +- "`**foo bar baz`\n " +- "rather than " - "`foo **bar baz`" -- "." -- "\n\n17. " +- ".\n\n" +- "17. " - "Inline code spans, links, images, and HTML tags group more " - "tightly\n than emphasis. So, when there is a choice between an interpretation\n " - "that contains one of these elements and one that does not, the\n " @@ -4793,8 +4808,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">\n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the opening `*`" -- " is followed by\nwhitespace, and hence not part of a [" +- "This is not emphasis, because the opening `*` is followed by\n" +- "whitespace, and hence not part of a [" - "left-flanking delimiter run]:\n\n" - "````````````````" - "```````````````` " @@ -4803,8 +4818,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    a * foo bar*

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the opening `*`" -- " is preceded\n" +- "This is not emphasis, because the opening `*` is preceded\n" - "by an alphanumeric and followed by punctuation, and hence\n" - "not part of a [left-flanking delimiter run]:\n\n" - "````````````````" @@ -4862,8 +4876,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">\n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the opening `_`" -- " is followed by\nwhitespace:\n\n" +- "This is not emphasis, because the opening `_` is followed by\n" +- "whitespace:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4871,8 +4885,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    _ foo bar_

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the opening `_`" -- " is preceded\nby an alphanumeric and followed by punctuation:\n\n" +- "This is not emphasis, because the opening `_` is preceded\n" +- "by an alphanumeric and followed by punctuation:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4907,8 +4921,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "тся_

    \n" - "````````````````" - "````````````````\n\n\n" -- "Here `_`" -- " does not generate emphasis, because the first delimiter run\n" +- "Here `_` does not generate emphasis, because the first delimiter run\n" - "is right-flanking and the second left-flanking:\n\n" - "````````````````" - "```````````````` " @@ -4938,8 +4951,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "_foo*\n.\n

    _foo*

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the closing `*`" -- " is preceded by\nwhitespace:\n\n" +- "This is not emphasis, because the closing `*` is preceded by\n" +- "whitespace:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4956,10 +4969,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the second `*`" -- " is\npreceded by punctuation and followed by an alphanumeric\n" -- "(hence it is not part of a [right-flanking delimiter " -- "run]:\n\n" +- "This is not emphasis, because the second `*` is\n" +- "preceded by punctuation and followed by an alphanumeric\n" +- "(hence it is not part of a [" +- "right-flanking delimiter run]:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4988,8 +5001,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "Rule 4:\n" -- "\nThis is not emphasis, because the closing `_`" -- " is preceded by\nwhitespace:\n\n" +- "\nThis is not emphasis, because the closing `_` is preceded by\n" +- "whitespace:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4997,8 +5010,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    _foo bar _

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not emphasis, because the second `_`" -- " is\npreceded by punctuation and followed by an alphanumeric:\n\n" +- "This is not emphasis, because the second `_` is\n" +- "preceded by punctuation and followed by an alphanumeric:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5073,8 +5086,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    ** foo bar**

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not strong emphasis, because the opening `**`" -- " is preceded\n" +- "This is not strong emphasis, because the opening `**` is preceded\n" - "by an alphanumeric and followed by punctuation, and hence\n" - "not part of a [left-flanking delimiter run]:\n\n" - "````````````````" @@ -5122,8 +5134,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo bar__

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not strong emphasis, because the opening `__`" -- " is preceded\nby an alphanumeric and followed by punctuation:\n\n" +- "This is not strong emphasis, because the opening `__` is preceded\n" +- "by an alphanumeric and followed by punctuation:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5189,8 +5201,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````\n\n\n" - "(Nor can it be interpreted as an emphasized `*foo bar *`" - ", because of\nRule 11.)\n\n" -- "This is not strong emphasis, because the second `**`" -- " is\npreceded by punctuation and followed by an alphanumeric:\n\n" +- "This is not strong emphasis, because the second `**` is\n" +- "preceded by punctuation and followed by an alphanumeric:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5250,8 +5262,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    __foo bar __

    \n" - "````````````````" - "````````````````\n\n\n" -- "This is not strong emphasis, because the second `__`" -- " is\npreceded by punctuation and followed by an alphanumeric:\n\n" +- "This is not strong emphasis, because the second `__` is\n" +- "preceded by punctuation and followed by an alphanumeric:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5380,7 +5392,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n``` markdown\n" - "

    foobar" - "baz\n```\n\n\nis precluded by the condition that a delimiter that\n" +- ">\n```\n\n\n" +- "is precluded by the condition that a delimiter that\n" - "can both open and close (like the `*` after `foo`" - ")\ncannot form emphasis if the sum of the lengths of\n" - "the delimiter runs containing the opening and\n" @@ -5423,8 +5436,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "When the lengths of the interior closing and opening\ndelimiter runs are " -- "*both*" -- " multiples of 3, though,\nthey can match to create emphasis:\n\n" +- "*both* multiples of 3, though,\n" +- "they can match to create emphasis:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5654,8 +5667,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "Note that when delimiters do not match evenly, Rule 11 determines\n" -- "that the excess literal `*`" -- " characters will appear outside of the\nemphasis, rather than inside it:\n\n" +- "that the excess literal `*` characters will appear outside of the\n" +- "emphasis, rather than inside it:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5761,8 +5774,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "Note that when delimiters do not match evenly, Rule 12 determines\n" -- "that the excess literal `_`" -- " characters will appear outside of the\nemphasis, rather than inside it:\n\n" +- "that the excess literal `_` characters will appear outside of the\n" +- "emphasis, rather than inside it:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -6000,76 +6013,81 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "## Links\n\n" -- "A link contains [link text] (the visible text), a [link " -- "destination]\n" +- "A link contains [link text] (the visible text), a [" +- 
"link destination]\n" - "(the URI that is the link destination), and optionally a [" - "link title].\nThere are two basic kinds of links in Markdown. In " - "[inline links] the\n" - "destination and title are given immediately after the link text. In\n[reference links]" - " the destination and title are defined elsewhere in\nthe document.\n\n" -- "A [link text](@)" -- " consists of a sequence of zero or more\ninline elements enclosed by square brackets (" -- "`[` and `]`). The\nfollowing rules apply:\n\n" -- "- Links may not contain other links, at any level of nesting. If\n " +- "A [link text](@) consists of a sequence of zero or more\n" +- "inline elements enclosed by square brackets (`[` and `]`" +- "). The\nfollowing rules apply:\n\n" +- "- " +- "Links may not contain other links, at any level of nesting. If\n " - "multiple otherwise valid link definitions appear nested inside each\n " -- "other, the inner-most definition is used." -- "\n\n- Brackets are allowed in the [link text]" +- "other, the inner-most definition is used.\n\n" +- "- " +- "Brackets are allowed in the [link text]" - " only if (a) they\n " - "are backslash-escaped or (b) they appear as a matched pair of " - "brackets,\n with an open bracket `[`" - ", a sequence of zero or more inlines, and\n a close bracket " - "`]`.\n\n" -- "- Backtick [code spans], [autolinks], and raw " -- "[HTML tags] bind more tightly\n " +- "- " +- "Backtick [code spans], [autolinks], and raw [" +- "HTML tags] bind more tightly\n " - "than the brackets in link text. Thus, for example,\n " -- "`` [foo`]` `` could not be a link text" -- ", since the second `]`\n is part of a code span.\n\n" -- "- The brackets in link text bind more tightly than markers for\n [" -- "emphasis and strong emphasis]. 
Thus, for example, " +- "`` [foo`]` ``" +- " could not be a link text, since the second `]`\n " +- "is part of a code span.\n\n" +- "- " +- "The brackets in link text bind more tightly than markers for\n [emphasis and strong emphasis" +- "]. Thus, for example, " - "`*[foo*](url)` is a link.\n\n" - "A [link destination](@) consists of either\n\n" - "- " - "a sequence of zero or more characters between an opening `<` and a\n closing " - "`>` that contains no line endings or unescaped\n `<` or " - "`>` characters, or\n\n" -- "- a nonempty sequence of characters that does not start with `<`" -- ",\n does not include [ASCII control characters][" -- "ASCII control character]\n or [space]" +- "- " +- "a nonempty sequence of characters that does not start with `<`,\n " +- "does not include [ASCII control characters][ASCII control character" +- "]\n or [space]" - " character, and includes parentheses only if (a) they are\n " - "backslash-escaped or (b) they are part of a balanced pair of\n " - "unescaped parentheses.\n " - "(Implementations may impose limits on parentheses nesting to\n " -- "avoid performance issues, but at least three levels of nesting\n should be supported.)" -- "\n\nA [link title](@) consists of either\n\n" +- "avoid performance issues, but at least three levels of nesting\n should be supported.)\n\n" +- "A [link title](@) consists of either\n\n" - "- " - "a sequence of zero or more characters between straight double-quote\n characters (" -- "`\"`), including a `\"`" -- " character only if it is\n backslash-escaped, or" -- "\n\n- " +- "`\"`), including a `\"` character only if it is\n " +- "backslash-escaped, or\n\n" +- "- " - "a sequence of zero or more characters between straight single-quote\n characters (" -- "`'`), including a `'`" -- " character only if it is\n backslash-escaped, or" -- "\n\n- " +- "`'`), including a `'` character only if it is\n " +- "backslash-escaped, or\n\n" +- "- " - "a sequence of zero or more characters between matching 
parentheses\n (" -- "`(...)`), including a `(` or `" -- ")` character only if it is\n backslash-escaped.\n\n" +- "`(...)`), including a `(` or " +- "`)` character only if it is\n backslash-escaped.\n\n" - "Although [link titles] may span multiple lines, they may not contain\na [" - "blank line].\n\n" -- "An [inline link](@) consists of a [link text] " -- "followed immediately\nby a left parenthesis `(`" -- ", an optional [link destination], an optional\n[link title]" -- ", and a right parenthesis `)`" -- ".\n" +- "An [inline link](@) consists of a [link text]" +- " followed immediately\nby a left parenthesis `(`, an optional [link destination" +- "], an optional\n[link title], and a right parenthesis " +- "`)`.\n" - "These four components may be separated by spaces, tabs, and up to one " - "line\nending.\nIf both [link destination] and [link title]" -- " are present, they *must*" -- " be\nseparated by spaces, tabs, and up to one line ending." -- "\n\nThe link's text consists of the inlines contained\nin the [link text" +- " are present, they *must* be\n" +- "separated by spaces, tabs, and up to one line ending.\n\n" +- "The link's text consists of the inlines contained\nin the [link text" - "] (excluding the enclosing square brackets).\nThe link'" - "s URI consists of the link destination, excluding enclosing\n" -- "`<...>` if present, with backslash-escapes in " -- "effect as described\nabove. The link'" -- "s title consists of the link title, excluding its\n" +- "`<...>`" +- " if present, with backslash-escapes in effect as described\nabove. The link" +- "'s title consists of the link title, excluding its\n" - "enclosing delimiters, with backslash-escapes in effect " - "as described\nabove.\n\nHere is a simple inline link:\n" - "\n" @@ -6364,20 +6382,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "link

    \n" - "````````````````" - "````````````````\n\n\n" -- "(Note: `Markdown.pl` did allow double quotes inside a " -- "double-quoted\ntitle, and its test suite included a test demonstrating this.\n" +- "(Note: `Markdown.pl`" +- " did allow double quotes inside a double-quoted\n" +- "title, and its test suite included a test demonstrating this.\n" - "But it is hard to see a good rationale for the extra complexity this\n" - "brings, since there are already many ways---backslash escaping,\n" - "entity and numeric character references, or using a different\n" - "quote type for the enclosing title---to write titles containing\n" -- "double quotes. `Markdown.pl`" -- "'s handling of titles has a number\n" +- "double quotes. `Markdown.pl`'" +- "s handling of titles has a number\n" - "of other strange features. For example, it allows single-quoted\n" - "titles in inline links, but not reference links. And, in\n" - "reference links but not inline links, it allows a title to begin\nwith " - "`\"` and end with `)`. " -- "`Markdown.pl`" -- " 1.0.1 even allows\n" +- "`Markdown.pl` 1.0.1 even allows\n" - "titles with no closing quotation mark, though 1.0.2b8 " - "does not.\nIt seems preferable to adopt a simple, rational rule that works\n" - "the same way in inline links and link reference definitions.)\n\n" @@ -6509,8 +6527,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo *bar

    \n" - "````````````````" - "````````````````\n\n\n" -- "Note that brackets that *aren't*" -- " part of links do not take\nprecedence:\n\n" +- "Note that brackets that *aren't* part of links do not take\n" +- "precedence:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -6551,25 +6569,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "There are three kinds of [reference link](@)s:\n" -- "[full](#full-reference-link), [collapsed](" -- "#collapsed-reference-link),\nand " +- "[full](#full-reference-link), " +- "[collapsed](#collapsed-reference-link),\nand " - "[shortcut](#shortcut-reference-link).\n\n" -- "A [full reference link](@)" -- "\nconsists of a [link text] immediately followed by a [link label]\nthat " -- "[matches] a [link reference definition] elsewhere in the document.\n\n" -- "A [link label](@) begins with a left bracket (`[" -- "`) and ends\nwith the first right bracket (`]`" +- "A [full reference link](@)\nconsists of a [link text]" +- " immediately followed by a [link label]\nthat [matches] a [" +- "link reference definition] elsewhere in the document.\n\n" +- "A [link label](@) begins with a left bracket (" +- "`[`) and ends\nwith the first right bracket (`]`" - ") that is not backslash-escaped.\n" - "Between these brackets there must be at least one character that is not a space,\n" - "tab, or line ending.\n" - "Unescaped square bracket characters are not allowed inside the\n" - "opening and closing square brackets of [link labels]. A link\n" - "label can have at most 999 characters inside the square\nbrackets.\n\n" -- "One label [matches](@)" -- "\nanother just in case their normalized forms are equal. To normalize a\n" +- "One label [matches](@)\n" +- "another just in case their normalized forms are equal. 
To normalize a\n" - "label, strip off the opening and closing brackets,\nperform the " -- "*Unicode case fold*" -- ", strip leading and trailing\n" +- "*Unicode case fold*, strip leading and trailing\n" - "spaces, tabs, and line endings, and collapse consecutive internal\n" - "spaces, tabs, and line endings to a single space. " - "If there are multiple\nmatching reference link definitions, the one that comes first in the\n" @@ -6774,12 +6791,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "]. If whitespace is allowed between the\n" - "link text and the link label, then in the following we will have\n" - "a single reference link, not two shortcut reference links, as\nintended:\n\n" -- "``` markdown\n[foo]\n[bar]\n\n" -- "[foo]: /url1\n" -- "[bar]: /url2\n```\n\n(Note that [" -- "shortcut reference links] were introduced by Gruber\n" -- "himself in a beta version of `Markdown.pl`, but never " -- "included\nin the official syntax description. Without shortcut reference\n" +- "``` markdown\n" +- "[foo]\n[bar]\n\n[foo]: /url1\n" +- "[bar]: /url2\n```\n\n" +- "(Note that [shortcut reference links] were introduced by Gruber\n" +- "himself in a beta version of `Markdown.pl`" +- ", but never included\nin the official syntax description. Without shortcut reference\n" - "links, it is harmless to allow space between the link text and\n" - "link label; but once shortcut references are introduced, it is\n" - "too dangerous to allow this, as it frequently leads to\n" @@ -6858,8 +6875,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\\

    \n" - "````````````````" - "````````````````\n\n\n" -- "A [link label] must contain at least one character that is not a space" -- ", tab, or\nline ending:\n\n" +- "A [link label]" +- " must contain at least one character that is not a space, tab, or\n" +- "line ending:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -6876,14 +6894,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "]: /uri

    \n" - "````````````````" - "````````````````\n\n\n" -- "A [collapsed reference link](@)" -- "\nconsists of a [link label] that [matches] a\n[link reference definition" -- "] elsewhere in the\ndocument, followed by the string `[]`" -- ".\nThe contents of the link label are parsed as inlines,\n" +- "A [collapsed reference link](@)\nconsists of a [link label]" +- " that [matches] a\n[link reference definition] elsewhere in the\n" +- "document, followed by the string `[]`.\n" +- "The contents of the link label are parsed as inlines,\n" - "which are used as the link's text. The link'" - "s URI and title are\nprovided by the matching reference link definition. Thus,\n" -- "`[foo][]` is equivalent to `[foo]" -- "[foo]`.\n\n" +- "`[foo][]` is equivalent to " +- "`[foo][foo]`.\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -6925,15 +6943,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\"title\">foo\n[]

    \n" - "````````````````" - "````````````````\n\n\n" -- "A [shortcut reference link](@)" -- "\nconsists of a [link label] that [matches] a\n[link reference definition" -- "] elsewhere in the\ndocument and is not followed by `[]`" -- " or a link label.\n" +- "A [shortcut reference link](@)\nconsists of a [link label" +- "] that [matches] a\n[link reference definition] elsewhere in the\n" +- "document and is not followed by `[]` or a link label.\n" - "The contents of the link label are parsed as inlines,\n" - "which are used as the link's text. The link'" - "s URI and title\nare provided by the matching link reference definition.\nThus, " -- "`[foo]` is equivalent to `[foo][]" -- "`.\n\n" +- "`[foo]` is equivalent to " +- "`[foo][]`.\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -7047,8 +7064,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo(not a link)

    \n" - "````````````````" - "````````````````\n\n" -- "In the following case `[bar][baz]` is parsed " -- "as a reference,\n`[foo]` as normal text:\n\n" +- "In the following case `[bar][baz]`" +- " is parsed as a reference,\n`[foo]`" +- " as normal text:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -7058,8 +7076,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "url\">bar

    \n" - "````````````````" - "````````````````\n\n\n" -- "Here, though, `[foo][bar]` is parsed " -- "as a reference, since\n`[bar]` is defined:\n\n" +- "Here, though, `[foo][bar]`" +- " is parsed as a reference, since\n`[bar]` is defined:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -7071,9 +7089,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "url1\">baz

    \n" - "````````````````" - "````````````````\n\n\n" -- "Here `[foo]` is not parsed as a shortcut reference" -- ", because it\nis followed by a link label (even though " -- "`[bar]` is not defined):\n\n" +- "Here `[foo]`" +- " is not parsed as a shortcut reference, because it\n" +- "is followed by a link label (even though `[bar]`" +- " is not defined):\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -7087,11 +7106,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Images\n\n" - "Syntax for images is like the syntax for links, with one\n" - "difference. Instead of [link text], we have an\n" -- "[image description](@)" -- ". The rules for this are the\nsame as for [link text]" -- ", except that (a) an\nimage description starts with `![`" -- " rather than `[`" -- ", and\n(b) an image description may contain links.\n" +- "[image description](@). The rules for this are the\n" +- "same as for [link text], except that (a) an\n" +- "image description starts with `![` rather than `[`, and\n" +- "(b) an image description may contain links.\n" - "An image description has inline elements\n" - "as its contents. When an image is rendered to HTML,\n" - "this is standardly used as the image's `alt` attribute.\n\n" @@ -7136,8 +7154,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Though this spec is concerned with parsing, not rendering, it is\n" - "recommended that in rendering to HTML, only the plain string content\nof the " - "[image description] be used. Note that in\n" -- "the above example, the alt attribute's value is `foo bar" -- "`, not `foo\n[bar](/url)` or " +- "the above example, the alt attribute's value is " +- "`foo bar`, not " +- "`foo\n[bar](/url)` or " - "`foo bar" - "`. 
Only the plain string\n" - "content is rendered, without formatting.\n\n" @@ -7328,28 +7347,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## Autolinks\n\n" -- "[Autolink](@)s are absolute URIs and email addresses " -- "inside\n`<` and `>`" +- "[Autolink](@)" +- "s are absolute URIs and email addresses inside\n`<` and " +- "`>`" - ". They are parsed as links, with the URL or email address\n" - "as the link label.\n\n" -- "A [URI autolink](@) consists of `<`, " -- "followed by an\n[absolute URI] followed by `>`" +- "A [URI autolink](@) consists of `<`" +- ", followed by an\n[absolute URI] followed by `>`" - ". It is parsed as\n" -- "a link to the URI, with the URI as the link's " -- "label.\n\n" -- "An [absolute URI](@)" -- ",\nfor these purposes, consists of a [scheme]" -- " followed by a colon (`:`" -- ")\nfollowed by zero or more characters other than [ASCII control\ncharacters]" -- "[ASCII control character], [space], `<`, " -- "and `>`" -- ".\nIf the URI includes these characters, they must be percent-encoded\n" +- "a link to the URI, with the URI as the link'" +- "s label.\n\n" +- "An [absolute URI](@),\n" +- "for these purposes, consists of a [scheme] followed by a colon (" +- "`:`)\nfollowed by zero or more characters other than [" +- "ASCII control\ncharacters][ASCII control character], [space" +- "], `<`, and `>`.\n" +- "If the URI includes these characters, they must be percent-encoded\n" - "(e.g. 
`%20` for a space).\n\n" -- "For purposes of this spec, a [scheme](@) is any " -- "sequence\nof 2--32 characters beginning with an ASCII letter and followed\n" +- "For purposes of this spec, a [scheme](@)" +- " is any sequence\nof 2--" +- "32 characters beginning with an ASCII letter and followed\n" - "by any combination of ASCII letters, digits, or the symbols plus\n(" -- "\"+\"), period (\".\"), or " -- "hyphen (\"-\").\n\n" +- "\"+\"), period (\".\"" +- "), or hyphen (\"-\").\n\n" - "Here are some valid autolinks:\n" - "\n" - "````````````````" @@ -7463,8 +7483,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ", followed by an [email address],\nfollowed by `>`" - ". The link's label is the email address,\nand the URL is " - "`mailto:` followed by the email address.\n\n" -- "An [email address](@)" -- ",\nfor these purposes, is anything that matches\nthe " +- "An [email address](@),\n" +- "for these purposes, is anything that matches\nthe " - "[non-normative regex from the HTML5\nspec" - "](https://html.spec.whatwg.org" - "/multipage/forms.html#e-mail-state-(type" @@ -7558,61 +7578,61 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## Raw HTML\n\n" -- "Text between `<` and `>` that looks like an HTML " -- "tag is parsed as a\n" +- "Text between `<` and `>`" +- " that looks like an HTML tag is parsed as a\n" - "raw HTML tag and will be rendered in HTML without escaping.\n" - "Tag and attribute names are not limited to current HTML tags,\n" - "so custom tags (and even, say, DocBook tags) may be " - "used.\n\nHere is the grammar for tags:\n" -- "\nA [tag name](@)" -- " consists of an ASCII letter\n" +- "\nA [tag name](@) consists of an ASCII letter\n" - "followed by zero or more ASCII letters, digits, or\n" - "hyphens (`-`).\n\n" -- "An [attribute](@) consists of spaces, tabs, and up " -- "to one line ending,\nan [attribute name], and an optional\n[" -- "attribute value specification].\n\n" +- "An 
[attribute](@)" +- " consists of spaces, tabs, and up to one line ending,\nan [" +- "attribute name], and an optional\n[attribute value specification].\n\n" - "An [attribute name](@)\nconsists of an ASCII letter, " -- "`_`, or `:`, followed by zero or more " -- "ASCII\nletters, digits, `_`, `.`, " -- "`:`, or `-`" +- "`_`, or `:`" +- ", followed by zero or more ASCII\nletters, digits, `_`" +- ", `.`, `:`, or `-`" - ". (Note: This is the XML\n" - "specification restricted to ASCII. " - "HTML5 is laxer.)\n\n" -- "An [attribute value specification](@)" -- "\nconsists of optional spaces, tabs, and up to one line ending,\na " -- "`=` character, optional spaces, tabs, and up to one line " -- "ending,\nand an [attribute value].\n\n" -- "An [attribute value](@)" -- "\nconsists of an [unquoted attribute value],\na [" -- "single-quoted attribute value], or a [double-quoted attribute value]" -- ".\n\n" -- "An [unquoted attribute value](@)" -- "\nis a nonempty string of characters not\n" +- "An [attribute value specification](@)\n" +- "consists of optional spaces, tabs, and up to one line ending,\na " +- "`=`" +- " character, optional spaces, tabs, and up to one line ending,\n" +- "and an [attribute value].\n\n" +- "An [attribute value](@)\nconsists of an [" +- "unquoted attribute value],\na [single-quoted attribute value]" +- ", or a [double-quoted attribute value].\n\n" +- "An [unquoted attribute value](@)\n" +- "is a nonempty string of characters not\n" - "including spaces, tabs, line endings, `\"`, `'`" -- ", `=`, `<`, `>`, or `` " -- "` ``.\n\n" +- ", `=`, `<`, `>`, or " +- "`` ` ``.\n\n" - "A [single-quoted attribute value](@)\nconsists of `'`" - ", zero or more\ncharacters not including `'`, and a final " - "`'`.\n\n" - "A [double-quoted attribute value](@)\nconsists of `\"`" - ", zero or more\ncharacters not including `\"`, and a final " - "`\"`.\n\n" -- "An [open tag](@) consists of a `<` character, " -- "a [tag name],\nzero or more [attributes]" +- "An [open tag](@) 
consists of a `<`" +- " character, a [tag name],\nzero or more [attributes]" - ", optional spaces, tabs, and up to one line ending,\nan optional " - "`/` character, and a `>` character.\n\n" - "A [closing tag](@) consists of the string ``.\n\n" -- "An [HTML comment](@) consists of ``, ``, or `<" -- "!--`, a string of\ncharacters not including the string " -- "`-->`, and `-->` (see the\n" +- "An [HTML comment](@) consists of " +- "``, ``" +- ", or ``, and " +- "`-->` (see the\n" - "[HTML spec](https://" - html.spec.whatwg.org/multipage/ -- "parsing.html#markup-declaration-open-state))." -- "\n\nA [processing instruction](@)\nconsists of the string ``" - ", and the string\n`?>`.\n\n" - "A [declaration](@) consists of the string ``" -- " tag):\n\n" +- "is parsed as a [hard line break](@) (rendered\n" +- "in HTML as a `
    ` tag):\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -8021,7 +8040,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "In this appendix we describe some features of the parsing strategy\n" - "used in the CommonMark reference implementations.\n\n" - "## Overview\n\nParsing has two phases:\n\n" -- "1. In the first phase, lines of input are consumed and the block\n" +- "1. " +- "In the first phase, lines of input are consumed and the block\n" - "structure of the document---its division into paragraphs, block quotes,\n" - "list items, and so on---" - "is constructed. Text is assigned to these\n" @@ -8031,21 +8051,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "In the second phase, the raw text contents of paragraphs and headings\n" - "are parsed into sequences of Markdown inline elements (strings,\n" - "code spans, links, emphasis, and so on), using the map of " -- "link\nreferences constructed in phase 1." -- "\n\nAt each point in processing, the document is represented as a tree of\n" +- "link\nreferences constructed in phase 1.\n\n" +- "At each point in processing, the document is represented as a tree of\n" - "**blocks**. The root of the tree is a `document`" - " block. The `document`\nmay have any number of other blocks as " -- "**children**" -- ". These children\n" +- "**children**. These children\n" - "may, in turn, have other blocks as children. " - "The last child of a block\nis normally considered **open**" - ", meaning that subsequent lines of input\n" - "can alter its contents. 
(Blocks that are not open are " -- "**closed**" -- ".)\n" +- "**closed**.)\n" - "Here, for example, is a possible document tree, with the open blocks\n" - "marked by arrows:\n\n" -- "``` tree\n-> document\n -> block_quote\n paragraph\n" +- "``` tree\n" +- "-> document\n -> block_quote\n paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n" - " -> list (type=bullet tight=true bullet_char=-" - ")\n list_item\n paragraph\n" @@ -8056,33 +8075,35 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Each line that is processed has an effect on this tree. The line is\n" - "analyzed and, depending on its contents, the document may be altered\n" - "in one or more of the following ways:\n\n" -- 1. One or more open blocks may be closed. -- "\n2. One or more new blocks may be created as children of the\n " +- "1. One or more open blocks may be closed.\n" +- "2. One or more new blocks may be created as children of the\n " - "last open block.\n" - "3. Text may be added to the last (deepest) open block remaining\n " - "on the tree.\n\n" - "Once a line has been incorporated into the tree in this way,\n" - "it can be discarded, so input can be read in a stream.\n\n" - "For each line, we follow this procedure:\n\n" -- "1. First we iterate through the open blocks, starting with the\n" +- "1. " +- "First we iterate through the open blocks, starting with the\n" - "root document, and descending through last children down to the last\n" - "open block. Each block imposes a condition that the line must satisfy\n" - "if the block is to remain open. For example, a block quote requires a\n" -- "`>`" -- " character. A paragraph requires a non-blank line.\n" +- "`>` character. A paragraph requires a non-blank line.\n" - "In this phase we may match all or just some of the open\n" - "blocks. " - "But we cannot close unmatched blocks yet, because we may have a\n[" -- "lazy continuation line]." -- "\n\n2. 
Next, after consuming the continuation markers for existing\n" -- "blocks, we look for new block starts (e.g. `>` " -- "for a block quote).\n" +- "lazy continuation line].\n\n" +- "2. " +- "Next, after consuming the continuation markers for existing\n" +- "blocks, we look for new block starts (e.g. `>`" +- " for a block quote).\n" - "If we encounter a new block start, we close any blocks unmatched\n" - "in step 1 before creating the new block as a child of the last\n" -- matched container block. -- "\n\n3. Finally, we look at the remainder of the line (after block\n" -- "markers like `>`, list markers, and indentation have been consumed" -- ").\nThis is text that can be incorporated into the last open\n" +- "matched container block.\n\n" +- "3. " +- "Finally, we look at the remainder of the line (after block\nmarkers like " +- "`>`, list markers, and indentation have been consumed).\n" +- "This is text that can be incorporated into the last open\n" - "block (a paragraph, code block, heading, or raw HTML)" - ".\n\n" - "Setext headings are formed when we see a line of a paragraph\n" @@ -8092,37 +8113,38 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "one or more reference link definitions. Any remainder becomes a\nnormal paragraph.\n\n" - "We can see how this works by considering how the tree above is\n" - "generated by four lines of Markdown:\n\n" -- "``` markdown\n> Lorem ipsum dolor\n" -- "sit amet.\n" +- "``` markdown\n" +- "> Lorem ipsum dolor\nsit amet.\n" - "> - Qui *quodsi iracundia*\n" - "> - aliquando id\n```\n\n" - "At the outset, our document model is just\n" - "\n``` tree\n-> document\n```" - "\n\nThe first line of our text,\n" - "\n``` markdown\n> Lorem ipsum dolor\n```" -- "\n\ncauses a `block_quote`" -- " block to be created as a child of our\nopen `document`" -- " block, and a `paragraph` block as a child of\nthe " -- "`block_quote`" -- ". 
Then the text is added to the last open\nblock, the `paragraph`" -- ":\n\n" -- "``` tree\n-> document\n -> block_quote\n -> paragraph\n" +- "\n\ncauses a `block_quote` block to be created as a child of our\n" +- "open `document` block, and a `paragraph` block as a child of\n" +- "the `block_quote`. Then the text is added to the last open\n" +- "block, the `paragraph`:\n\n" +- "``` tree\n" +- "-> document\n -> block_quote\n -> paragraph\n" - " \"Lorem ipsum dolor\"\n```\n\nThe next line,\n" - "\n``` markdown\nsit amet.\n```" - "\n\nis a \"lazy continuation\" of the open `paragraph`" - ", so it gets added\nto the paragraph's text:\n\n" -- "``` tree\n-> document\n -> block_quote\n -> paragraph\n" +- "``` tree\n" +- "-> document\n -> block_quote\n -> paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n" - "```\n\nThe third line,\n" - "\n``` markdown\n" - "> - Qui *quodsi iracundia*\n" - "```\n\n" -- "causes the `paragraph` block to be closed, and a new `list` " -- "block\nopened as a child of the `block_quote`. A " +- "causes the `paragraph` block to be closed, and a new `list`" +- " block\nopened as a child of the `block_quote`. A " - "`list_item` is also\nadded as a child of the `list`" - ", and a `paragraph` as a child of\nthe `list_item`" - ". The text is then added to the new `paragraph`:\n\n" -- "``` tree\n-> document\n -> block_quote\n paragraph\n" +- "``` tree\n" +- "-> document\n -> block_quote\n paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n" - " -> list (type=bullet tight=true bullet_char=-" - ")\n -> list_item\n -> paragraph\n" @@ -8131,10 +8153,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n``` markdown\n> - aliquando id\n```" - "\n\ncauses the `list_item` (and its child the `paragraph`" - ") to be closed,\nand a new `list_item`" -- " opened up as child of the `list`. A `paragraph`" -- "\nis added as a child of the new `list_item`" +- " opened up as child of the `list`. 
A `paragraph`\n" +- "is added as a child of the new `list_item`" - ", to contain the text.\nWe thus obtain the final tree:\n\n" -- "``` tree\n-> document\n -> block_quote\n paragraph\n" +- "``` tree\n" +- "-> document\n -> block_quote\n paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n" - " -> list (type=bullet tight=true bullet_char=-" - ")\n list_item\n paragraph\n" @@ -8147,15 +8170,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "string contents of paragraphs and headings as inlines. At this\n" - "point we have seen all the link reference definitions, so we can\n" - "resolve reference links as we go.\n\n" -- "``` tree\ndocument\n block_quote\n paragraph\n" -- " str \"Lorem ipsum dolor\"\n softbreak\n" -- " str \"sit amet.\"\n" +- "``` tree\n" +- "document\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"\n" +- " softbreak\n str \"sit amet.\"\n" - " list (type=bullet tight=true bullet_char=-)\n" - " list_item\n paragraph\n str \"Qui \"\n emph\n" - " str \"quodsi iracundia\"\n list_item\n paragraph\n" -- " str \"aliquando id\"\n```\n\nNotice how the " -- "[line ending] in the first paragraph has\nbeen parsed as a " -- "`softbreak`" +- " str \"aliquando id\"\n```\n\n" +- "Notice how the [line ending] in the first paragraph has\n" +- "been parsed as a `softbreak`" - ", and the asterisks in the first list item\nhave become an " - "`emph`.\n\n" - "### An algorithm for parsing nested emphasis and links\n\n" @@ -8165,100 +8188,109 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- a run of `*` or `_` characters, or\n" - "- a `[` or `![`\n\n" - "we insert a text node with these symbols as its literal content, and we\n" -- "add a pointer to this text node to the [delimiter stack]" -- "(@).\n\n" +- "add a pointer to this text node to the " +- "[delimiter stack](@).\n\n" - "The [delimiter stack] is a doubly linked list. 
Each\n" -- "element contains a pointer to a text node, plus information about" -- "\n\n- the type of delimiter (`[`, `![" -- "`, `*`, `_`)\n" +- "element contains a pointer to a text node, plus information about\n\n" +- "- the type of delimiter (`[`, `![`" +- ", `*`, `_`)\n" - "- the number of delimiters,\n" -- "- whether the delimiter is \"active\" (all are active to start" -- "), and\n" +- "- whether the delimiter is \"active\"" +- " (all are active to start), and\n" - "- whether the delimiter is a potential opener, a potential closer,\n " - "or both (which depends on what sort of characters precede\n " - "and follow the delimiters).\n\n" -- "When we hit a `]` character, we call the *look for link " -- "or image*\nprocedure (see below)." -- "\n\nWhen we hit the end of the input, we call the *process emphasis*" -- "\nprocedure (see below), with `stack_bottom`" +- "When we hit a `]` character, we call the " +- "*look for link or image*\nprocedure (see below).\n\n" +- "When we hit the end of the input, we call the *process emphasis*\n" +- "procedure (see below), with `stack_bottom`" - " = NULL.\n\n" - "#### *look for link or image*\n\n" - "Starting at the top of the delimiter stack, we look backwards\n" - "through the stack for an opening `[` or `![`" -- " delimiter." -- "\n\n" +- " delimiter.\n\n" - "- If we don't find one, we return a literal text node `" - "]`.\n\n" -- "- If we do find one, but it's not *active*, " -- "we remove the inactive\n " +- "- " +- "If we do find one, but it's not *active*" +- ", we remove the inactive\n " - "delimiter from the stack, and return a literal text node `]`" - ".\n\n" -- "- If we find one and it's active, then we parse ahead " -- "to see if\n " +- "- " +- "If we find one and it'" +- "s active, then we parse ahead to see if\n " - "we have an inline link/image, reference link/image, collapsed reference\n " -- "link/image, or shortcut reference link/image." 
-- "\n\n + If we don'" -- "t, then we remove the opening delimiter from the\n " +- "link/image, or shortcut reference link/image.\n\n " +- "+ " +- "If we don't, then we remove the opening delimiter from the\n " - "delimiter stack and return a literal text node `]`.\n\n " -- "+ If we do, then\n" -- "\n * We return a link or image node whose children are the inlines\n " +- "+ If we do, then\n\n " +- "* " +- "We return a link or image node whose children are the inlines\n " - "after the text node pointed to by the opening delimiter.\n\n " -- "* We run *process emphasis* on these inlines, with the `[" -- "` opener\n as `stack_bottom`.\n\n " +- "* " +- "We run *process emphasis* on these inlines, with the `[`" +- " opener\n as `stack_bottom`.\n\n " - "* We remove the opening delimiter.\n\n" -- " * If we have a link (and not an image), we also set " -- "all\n `[` delimiters before the opening delimiter to " +- " * " +- "If we have a link (and not an image), we also set all\n " +- "`[` delimiters before the opening delimiter to " - "*inactive*. (This\n will prevent us from getting links within links.)\n\n" - "#### *process emphasis*\n\n" -- "Parameter `stack_bottom`" -- " sets a lower bound to how far we\ndescend in the [delimiter stack" -- "]. If it is NULL, we can\n" +- "Parameter `stack_bottom` sets a lower bound to how far we\n" +- "descend in the [delimiter stack]" +- ". If it is NULL, we can\n" - "go all the way to the bottom. 
Otherwise, we stop before\nvisiting " - "`stack_bottom`.\n\n" -- "Let `current_position` point to the element on the [delimiter " -- "stack]\njust above `stack_bottom` (or the first element if " -- "`stack_bottom`\nis NULL).\n\n" -- "We keep track of the `openers_bottom`" -- " for each delimiter\ntype (`*`, `_`" +- "Let `current_position` point to the element on the [" +- "delimiter stack]\njust above `stack_bottom`" +- " (or the first element if `stack_bottom`\n" +- "is NULL).\n\n" +- "We keep track of the `openers_bottom` for each delimiter\n" +- "type (`*`, `_`" - "), indexed to the length of the closing delimiter run\n" - "(modulo 3) and to whether the closing delimiter can also " - "be an\nopener. Initialize this to `stack_bottom`.\n\n" - "Then we repeat the following until we run out of potential\nclosers:\n\n" -- "- Move `current_position` forward in the delimiter stack (if " -- "needed)\n until we find the first potential closer with delimiter `*`" -- " or `_`" -- ".\n (This will be the potential closer closest\n to the beginning of the input " -- "-- the first one in parse order.)" -- "\n\n- " -- "Now, look back in the stack (staying above `stack_bottom`" -- " and\n the `openers_bottom`" -- " for this delimiter type) for the\n first matching potential opener (\"matching" -- "\" means same delimiter).\n\n- If one is found:\n\n " -- "+ Figure out whether we have emphasis or strong emphasis:\n " +- "- " +- "Move `current_position`" +- " forward in the delimiter stack (if needed)\n " +- "until we find the first potential closer with delimiter `*` or " +- "`_`.\n (This will be the potential closer closest\n " +- to the beginning of the input -- +- " the first one in parse order.)\n\n" +- "- " +- "Now, look back in the stack (staying above `stack_bottom` and\n " +- "the `openers_bottom` for this delimiter type) for the\n " +- "first matching potential opener (\"matching\" means same delimiter).\n\n" +- "- If one is found:\n\n " +- "+ " +- "Figure out whether we 
have emphasis or strong emphasis:\n " - "if both closer and opener spans have length >= 2, we have\n " - "strong, otherwise regular.\n\n " -- "+ Insert an emph or strong emph node accordingly, after\n " +- "+ " +- "Insert an emph or strong emph node accordingly, after\n " - "the text node corresponding to the opener.\n\n " -- "+ Remove any delimiters between the opener and closer from\n " -- the delimiter stack. -- "\n\n + " +- "+ " +- "Remove any delimiters between the opener and closer from\n " +- "the delimiter stack.\n\n " +- "+ " - "Remove 1 (for regular emph) or 2 (for strong " - "emph) delimiters\n " - "from the opening and closing text nodes. If they become empty\n " - "as a result, remove them and remove the corresponding element\n " - "of the delimiter stack. If the closing node is removed, reset\n " - "`current_position` to the next element in the stack.\n\n" -- "- If none is found:\n" -- "\n + " +- "- If none is found:\n\n " +- "+ " - "Set `openers_bottom` to the element before `current_position`" - ".\n " - "(We know that there are no openers for this kind of closer up to " -- "and\n including this point, so this puts a lower bound on future searches.)" -- "\n\n + " -- "If the closer at `current_position`" -- " is not a potential opener,\n " +- "and\n including this point, so this puts a lower bound on future searches.)\n\n " +- "+ " +- "If the closer at `current_position` is not a potential opener,\n " - "remove it from the delimiter stack (since we know it can't\n " - "be a closer either).\n\n " - "+ Advance `current_position` to the next element in the stack.\n\n" -- "After we're done, we remove all delimiters above `" -- "stack_bottom` from the\ndelimiter stack.\n" +- "After we're done, we remove all delimiters above " +- "`stack_bottom` from the\ndelimiter stack.\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-2.snap 
b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-2.snap index 6e20ee4..32193e7 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-2.snap @@ -6,30 +6,35 @@ input_file: tests/inputs/markdown/github_flavored.md - "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n\n" - "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n\n" - "# Lists\n\n" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. 
Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n" +- "```\n" +- "1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n" - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. 
Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n" - "⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n" - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n\n" - "# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------\n\n" - "# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to 
\\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n\n" - "# Links\n\n" -- "```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n" -- "[link text itself]: http://www.reddit.com\n```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n" +- "```\n" +- "[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs 
in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n" +- "```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n" - "\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------\n\n" - "# Images\n\n" -- "```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n" +- "```\n" +- "Here's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png 
\"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n" - "[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n" - "\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n\n# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n\n" - "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has 
`back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```" - "\n\n```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```" - "\n\n```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```" -- "\n\n```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? 
$uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n" +- "\n\n```php\n" +- "require_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n" - " $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n\n" - "# Tables\n\n" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n" -- "| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n" -- "| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n" -- "\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n\n" +- "```\n" +- "Colons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n" +- "| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n" +- "\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n" +- "\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n\n" - "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n" - "\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n\n# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n```\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n------\n\n" - "# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap index c724574..9c604aa 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap @@ -4,9 +4,9 @@ expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- - "# Headers\n\n" -- "```\n# h1 Heading 8-)\n" -- "## h2 Heading\n### h3 Heading\n" -- "#### h4 Heading\n" +- "```\n" +- "# h1 Heading 8-)\n## h2 Heading\n" +- "### h3 Heading\n#### h4 Heading\n" - "##### h5 Heading\n" - "###### h6 Heading\n\n" - "Alternatively, for H1 and H2, an underline-ish style" @@ -38,15 +38,16 @@ input_file: tests/inputs/markdown/github_flavored.md - "_underscores_.\n\n" - "Strong emphasis, aka bold, with **asterisks** or " - "__underscores__.\n\n" -- Combined emphasis with **asterisks and _underscores_* -- "*.\n\n" -- Strikethrough uses two tildes. ~~Scratch this. -- "~~\n\n**This is bold text**\n" -- "\n__This is bold text__\n\n*This is italic text*\n" -- "\n_This is italic text_\n\n~~Strikethrough~~\n\n" -- "------\n\n" +- "Combined emphasis with " +- "**asterisks and _underscores_**.\n\n" +- "Strikethrough uses two tildes. " +- "~~Scratch this.~~\n\n" +- "**This is bold text**\n\n__This is bold text__\n" +- "\n*This is italic text*\n\n_This is italic text_\n" +- "\n~~Strikethrough~~\n\n------\n\n" - "# Lists\n\n" -- "```\n1. First ordered list item\n2. Another item\n" +- "```\n" +- "1. First ordered list item\n2. Another item\n" - "⋅⋅* Unordered sub-list.\n1. " - "Actual numbers don't matter, just that it's a number\n" - "⋅⋅1. 
Ordered sub-list\n4. And another item.\n\n" @@ -75,16 +76,16 @@ input_file: tests/inputs/markdown/github_flavored.md - " + Facilisis in pretium nisl aliquet\n" - " - Nulla volutpat aliquam velit\n" - "+ Very easy!\n```\n\n" -- 1. First ordered list item -- "\n2. Another item\n⋅⋅* Unordered sub-list.\n" +- "1. First ordered list item\n" +- "2. Another item\n⋅⋅* Unordered sub-list.\n" - "1. Actual numbers don't matter, just that it'" - "s a number\n⋅⋅1. Ordered sub-list\n" - "4. And another item.\n\n" - ⋅⋅⋅You can have properly indented paragraphs within list items - ". " - "Notice the blank line above, and the leading spaces (at least one, " -- "but we'll use three here to also align the raw Markdown)" -- ".\n\n" +- "but we'" +- "ll use three here to also align the raw Markdown).\n\n" - "⋅⋅⋅To have a line break without a paragraph, you will need " - "to use two trailing spaces.⋅⋅\n" - "⋅⋅⋅Note that this line is separate, but within the same paragraph" @@ -93,21 +94,24 @@ input_file: tests/inputs/markdown/github_flavored.md - "where trailing spaces are not required.)\n\n" - "* Unordered list can use asterisks\n- Or minuses\n" - "+ Or pluses\n\n" -- "1. Make my changes\n 1. Fix bug\n 2. " -- "Improve formatting\n - Make the headings bigger\n" +- 1. Make my changes +- "\n 1. Fix bug\n" +- " 2. Improve formatting\n - Make the headings bigger\n" - "2. Push my commits to GitHub\n" -- "3. Open a pull request\n * Describe my changes\n" +- "3. 
Open a pull request\n " +- " * Describe my changes\n" - " * Mention all the members of my team\n * Ask for feedback\n\n" - "+ Create a list by starting a line with `+`, `-" - "`, or `*`\n" -- "+ Sub-lists are made by indenting 2 spaces:\n" -- " - Marker character change forces new list start:\n" -- " * Ac tristique libero volutpat at\n " +- "+ Sub-lists are made by indenting 2 spaces:\n " +- "- Marker character change forces new list start:" +- "\n * Ac tristique libero volutpat at\n " - "+ Facilisis in pretium nisl aliquet\n " - "- Nulla volutpat aliquam velit\n" - "+ Very easy!\n\n------\n\n" - "# Task lists\n\n" -- "```\n- [x] Finish my changes\n" +- "```\n" +- "- [x] Finish my changes\n" - "- [ ] Push my commits to GitHub\n" - "- [ ] Open a pull request\n" - "- [x] @mentions, #refs, [links]()" @@ -119,12 +123,14 @@ input_file: tests/inputs/markdown/github_flavored.md - "- [x] Finish my changes\n" - "- [ ] Push my commits to GitHub\n" - "- [ ] Open a pull request\n" -- "- [x] @mentions, #refs, [links]()" -- ", **formatting**, and tags supported\n" -- "- [x] list syntax required (any unordered or ordered list supported" -- ")\n- [ ] this is a complete item\n" -- "- [ ] this is an incomplete item\n\n------\n\n" +- "- " +- "[x] @mentions, #refs, [links](), " +- "**formatting**, and tags" +- " supported\n" +- "- " +- "[x] list syntax required (any unordered or ordered list supported)\n" +- "- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n" +- "------\n\n" - "# Ignoring Markdown formatting\n\n" - "You can tell GitHub to ignore (or escape) Markdown " - "formatting by using \\ before the Markdown character.\n\n" @@ -171,8 +177,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "\n" - URLs and URLs in angle brackets will automatically get turned into links - ".\nhttp://www.example.com or " -- "" -- " and sometimes\n" +- " and sometimes\n" - "example.com (but not on Github, for example).\n\n" - "Some text to show that the reference links can 
follow later.\n" - "\n" @@ -208,23 +213,25 @@ input_file: tests/inputs/markdown/github_flavored.md - "Here's our logo (hover to see the title text):\n" - "\nInline-style:\n" - "![" -- "alt text](https://github.com/" -- adam-p/markdown-here/raw/master/src -- "/common/images/icon48.png \"Logo Title Text 1" -- "\")\n\nReference-style:\n![alt text][logo]\n" +- alt text +- "](https://github.com/adam-p" +- /markdown-here/raw/master/src/common/images +- "/icon48.png \"Logo Title Text 1\")\n\n" +- "Reference-style:\n![alt text][logo]\n" - "\n" - "[logo]: https://github.com/adam" - "-p/markdown-here/raw/master/src/common" - "/images/icon48.png \"Logo Title Text 2\"\n\n" - "![" -- "Minion](https://" -- octodex.github.com/images/ -- "minion.png)\n" +- Minion +- "](https://octodex.github.com" +- "/images/minion.png)\n" - "![" -- "Stormtroopocat](https://" -- octodex.github.com/images/ -- "stormtroopocat.jpg \"The Stormtroopocat" -- "\")\n\nLike links, Images also have a footnote style syntax\n" +- Stormtroopocat +- "](https://octodex.github.com" +- "/images/stormtroopocat.jpg \"The " +- "Stormtroopocat\")\n\n" +- "Like links, Images also have a footnote style syntax\n" - "\n![Alt text][id]\n" - "\nWith a reference later in the document defining the URL location:\n" - "\n" @@ -232,9 +239,11 @@ input_file: tests/inputs/markdown/github_flavored.md - octodex.github.com/images/ - "dojocat.jpg \"The Dojocat\"\n\n" - "------\n\n" -- "# [Footnotes](https://github.com/" +- "# " +- "[Footnotes](https://github.com/" - "markdown-it/markdown-it-footnote)\n\n" -- "```\nFootnote 1 link[^first].\n\n" +- "```\n" +- "Footnote 1 link[^first].\n\n" - "Footnote 2 link[^second].\n\n" - "Inline footnote^[Text of inline footnote] definition.\n\n" - "Duplicated footnote reference[^second].\n\n" @@ -252,7 +261,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "Inline `code` has `back-ticks around` it.\n" - "```\n\n" - "Inline `code` has `back-ticks around` it.\n" -- "\n```c#\nusing 
System.IO.Compression;\n\n" +- "\n```c#\n" +- "using System.IO.Compression;\n\n" - "#pragma warning disable 414, 3021\n\n" - "namespace MyApplication\n{\n" - " [Obsolete(\"...\")]\n" @@ -263,7 +273,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "\");\n" - " return new List(new int[] { 1, " - "2, 3 })\n }\n }\n}\n```\n\n" -- "```css\n@font-face {\n" +- "```css\n" +- "@font-face {\n" - " font-family: Chunkfive; src: url('" - "Chunkfive.otf');\n}\n\n" - "body, .usertext {\n" @@ -314,7 +325,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "__halt_compiler () ; datahere\ndatahere\ndatahere */\n" - "datahere\n```\n\n------\n\n" - "# Tables\n\n" -- "```\nColons can be used to align columns.\n\n" +- "```\n" +- "Colons can be used to align columns.\n\n" - "| Tables | Are | Cool |\n" - "| ------------- |:" - "-------------:| -" @@ -346,7 +358,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "| Name | Character |\n| --- | --- |\n" - "| Backtick | ` |\n| Pipe | \\| |\n```\n\n" - "Colons can be used to align columns.\n\n" -- "| Tables | Are | Cool |\n" +- "| Tables | Are | Cool " +- "|\n" - "| ------------- |:" - "-------------:| -" - "----:|\n" @@ -354,15 +367,16 @@ input_file: tests/inputs/markdown/github_flavored.md - "| col 2 is | centered | $12 |\n" - "| zebra stripes | are neat | $1 |\n" - "\nThere must be at least 3 dashes separating each header cell.\n" -- "The outer pipes (|) are optional, and you don't need to " -- "make the\n" +- "The outer pipes (|) are optional, and you don'" +- "t need to make the\n" - "raw Markdown line up prettily. 
" - "You can also use inline Markdown.\n\n" -- "Markdown | Less | Pretty\n" -- "--- | --- | ---\n" +- Markdown | Less | Pretty +- "\n--- | --- | ---\n" - "*Still* | `renders` | **nicely**\n" - "1 | 2 | 3\n\n" -- "| First Header | Second Header |\n" +- "| First Header | Second Header " +- "|\n" - "| ------------- | -" - "------------ |\n" - "| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n" @@ -371,9 +385,10 @@ input_file: tests/inputs/markdown/github_flavored.md - "| git diff | Show file differences that haven't been staged |\n\n" - "| Command | Description |\n| --- | --- |\n" - "| `git status` | List all *new or modified* files |\n" -- "| `git diff` | Show file differences that **" -- "haven't been** staged |\n\n" -- "| Left-aligned | Center-aligned | Right-aligned |\n" +- "| `git diff` |" +- " Show file differences that **haven't been** staged |\n\n" +- "| Left-aligned | Center-aligned | Right-aligned " +- "|\n" - "| :--- | :---: | ---: " - "|\n| git status | git status | git status |\n" - "| git diff | git diff | git diff |\n\n" @@ -394,15 +409,18 @@ input_file: tests/inputs/markdown/github_flavored.md - ">> ...by using additional greater-than signs right next to each " - "other...\n> > > ...or with spaces between arrows.\n" - "```\n\n" -- "> Blockquotes are very handy in email to emulate reply text." -- "\n> This line is part of the same quote.\n\nQuote break.\n\n" -- "> This is a very long line that will still be quoted properly when it wraps" -- ". Oh boy let'" +- "> " +- "Blockquotes are very handy in email to emulate reply text.\n> " +- "This line is part of the same quote.\n\nQuote break.\n\n" +- "> " +- "This is a very long line that will still be quoted properly when it wraps. " +- "Oh boy let'" - "s keep writing to make sure this is long enough to actually wrap for everyone. 
" -- "Oh, you can *put* **Markdown** into a " -- "blockquote.\n\n" +- "Oh, you can *put* **Markdown**" +- " into a blockquote.\n\n" - "> Blockquotes can also be nested...\n>" -- "> ...by using additional greater-than signs right next to each other" +- "> " +- "...by using additional greater-than signs right next to each other" - "...\n> > > ...or with spaces between arrows.\n\n" - "------\n\n" - "# Inline HTML\n\n" @@ -460,7 +478,8 @@ input_file: tests/inputs/markdown/github_flavored.md - www.youtube.com/watch? - "v=YOUTUBE_VIDEO_ID_HERE)\n" - "```\n\n" -- "[![IMAGE ALT TEXT HERE" +- "[![" +- IMAGE ALT TEXT HERE - "](https://upload.wikimedia.org/" - wikipedia/commons/thumb/e/ef/ - YouTube_logo_2015.svg/1200px- diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md.snap index 5f7f84f..03ebc32 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md.snap @@ -3,8 +3,9 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/commonmark_spec.md --- -- "---\ntitle: CommonMark Spec" -- "author: John MacFarlane\nversion: '0.31.2'" +- "---" +- "title: CommonMark Spec\nauthor: John MacFarlane" +- "version: '0.31.2'" - "date: '2024-01-28'" - "license: '[CC-BY-SA 4.0](https" - "://creativecommons.org/licenses/by-sa/" @@ -17,8 +18,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - help from Aaron Swartz) and released in 2004 in the form of a - "[syntax description](https://daringfireball.net/projects" - "/markdown/syntax)\nand a Perl script (" -- "`Markdown.pl`" -- ) for converting Markdown to +- "`Markdown.pl`) for converting Markdown to" - "HTML. In the next decade, dozens of implementations were" - developed in many languages. 
Some extended the original - "Markdown syntax with conventions for footnotes, tables, and" @@ -31,8 +31,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - What distinguishes Markdown from many other lightweight markup - "syntaxes, which are often easier to write, is its readability." - "As Gruber writes:" -- "> The overriding design goal for Markdown's formatting syntax is" -- "> to make it as readable as possible. The idea is that a\n>" +- ">" +- "The overriding design goal for Markdown's formatting syntax is\n>" +- "to make it as readable as possible. The idea is that a\n>" - "Markdown-formatted document should be publishable as-is, as\n>" - "plain text, without looking like it's been marked up with tags\n>" - "or formatting instructions.\n> (" @@ -43,7 +44,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - asciidoc.org/) with - an equivalent sample of Markdown. Here is a sample of - "AsciiDoc from the AsciiDoc manual:" -- "```\n1. List item one.\n+" +- "```" +- "1. List item one.\n+" - "List item one continued with a second paragraph followed by an\nIndented block." - + - "................" @@ -57,7 +59,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This paragraph is part of the preceding list item.\n\nb. List item b." - "This paragraph belongs to item two of the outer list.\n--\n```" - "And here is the equivalent in Markdown:" -- "```\n1. List item one." +- "```" +- 1. List item one. - " List item one continued with a second paragraph followed by an\n Indented block." - $ ls *.sh - $ mv *.sh ~/tmp @@ -84,9 +87,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - The spec says that - "continuation paragraphs need to be indented four spaces, but is" - not fully explicit about sublists. It is natural to think that -- "they, too, must be indented four spaces, but `" -- "Markdown.pl` does\n not require that. 
This is hardly a \"" -- "corner case,\" and divergences" +- "they, too, must be indented four spaces, but" +- "`Markdown.pl` does\n not require that. This is hardly a" +- "\"corner case,\" and divergences" - between implementations on this issue often lead to surprises for - users in real documents. (See - "[this comment by John\n Gruber" @@ -94,7 +97,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "20170611172104/http://" - article.gmane.org/ - gmane.text.markdown.general/1997).) -- 2. Is a blank line needed before a block quote or heading? +- "2." +- Is a blank line needed before a block quote or heading? - "Most implementations do not require the blank line. However," - "this can lead to unexpected results in hard-wrapped text, and" - also to ambiguities in parsing (note that some implementations @@ -107,14 +111,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - gmane.text.markdown.general/2146).) - "3." - "Is a blank line needed before an indented code block?\n (" -- "`Markdown.pl` requires it, but this is not mentioned in" -- "the\n documentation, and some implementations do not require it.)" +- "`Markdown.pl`" +- "requires it, but this is not mentioned in the" +- "documentation, and some implementations do not require it.)" - "``` markdown\n paragraph\n code?\n ```" - "4." - "What is the exact rule for determining when list items get\n wrapped in" -- "`

    `" -- " tags? Can a list be partially \"loose\" and partially\n \"tight\"" -- "? What should we do with a list like this?" +- "`

    ` tags? Can a list be partially \"loose\"" +- " and partially\n \"tight\"? What should we do with a list like this?" - "``` markdown\n 1. one\n\n 2. two\n 3. three" - " ```\n\n Or this?" - " ``` markdown\n 1. one\n - a\n\n - b\n 2. two" @@ -124,30 +128,36 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "20170611172104/http://" - article.gmane.org/ - gmane.text.markdown.general/2554).) -- 5. Can list markers be indented? +- "5." +- Can list markers be indented? - Can ordered list markers be right-aligned? - "``` markdown\n 8. item 1\n 9. item 2" -- " 10. item 2a\n ```" -- "6. Is this one list with a thematic break in its second item," +- "10. item 2a\n ```" +- "6." +- "Is this one list with a thematic break in its second item," - or two lists separated by a thematic break? - "``` markdown\n * a\n * * * * *\n * b" - "```" -- "7. When list markers change from numbers to bullets, do we have" +- "7." +- "When list markers change from numbers to bullets, do we have" - "two lists or one? (The Markdown syntax description suggests two," - but the perl scripts and many other implementations produce one.) - "``` markdown\n 1. fee\n 2. fie\n - foe" -- " - fum\n ```" -- 8. What are the precedence rules for the markers of inline structure? +- "- fum\n ```" +- "8." +- What are the precedence rules for the markers of inline structure? - "For example, is the following a valid link, or does the code span" - take precedence ? - "``` markdown" - "[a backtick (`)](/url) and [another" - "backtick (`)](/url).\n ```" -- 9. What are the precedence rules for markers of emphasis and strong +- "9." +- What are the precedence rules for markers of emphasis and strong - "emphasis? For example, how should the following be parsed?" - "``` markdown\n *foo *bar* baz*" - "```" -- 10. What are the precedence rules between block-level and inline-level +- "10." +- What are the precedence rules between block-level and inline-level - "structure? 
For example, how should the following be parsed?" - "``` markdown" - "- `a long code span can contain a hyphen like this" @@ -157,19 +167,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - does not - "allow this, but does allow blockquotes to include headings.)" - "``` markdown\n - # Heading\n ```" -- "12. Can list items be empty?\n\n ``` markdown\n * a\n *" -- " * b\n ```" +- 12. Can list items be empty? +- " ``` markdown\n * a\n *\n * b\n ```" - 13. Can link references be defined inside block quotes or list items? - " ``` markdown\n > Blockquote [foo].\n >" -- " > [foo]: /url\n ```" +- "> [foo]: /url\n ```" - "14. If there are multiple definitions for the same reference, which takes\n precedence?" - " ``` markdown\n [foo]: /url1" -- " [foo]: /url2\n\n [foo][]" +- "[foo]: /url2\n\n [foo][]" - "```" -- "In the absence of a spec, early implementers consulted `" -- "Markdown.pl`\nto resolve these ambiguities. But" -- "`Markdown.pl`" -- "was quite buggy, and" +- "In the absence of a spec, early implementers consulted" +- "`Markdown.pl`\nto resolve these ambiguities. But" +- "`Markdown.pl` was quite buggy, and" - "gave manifestly bad results in many cases, so it was not a" - satisfactory replacement for a spec. - "Because there is no unambiguous spec, implementations have diverged" @@ -178,14 +187,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - GitHub wiki) - "renders differently on another (say, converting to docbook using" - "pandoc). To make matters worse, because nothing in Markdown counts" -- "as a \"syntax error,\" the divergence often isn't discovered right" -- away. +- "as a \"syntax error,\" the divergence often isn'" +- t discovered right away. - "## About this document" - This document attempts to specify Markdown syntax unambiguously. - It contains many examples with side-by-side Markdown and - HTML. These are intended to double as conformance tests. 
An -- "accompanying script `spec_tests.py` can be used to run" -- "the tests\nagainst any Markdown program:" +- "accompanying script `spec_tests.py`" +- " can be used to run the tests\nagainst any Markdown program:" - python test/spec_tests.py --spec - spec.txt --program PROGRAM - Since this document describes how Markdown is to be parsed into @@ -205,12 +214,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "But a conforming implementation\ncan use a different renderer and may choose not to" - percent-encode non-ASCII characters in URLs. - "This document is generated from a text file," -- "`spec.txt`" -- ", written" +- "`spec.txt`, written" - in Markdown with a small extension for the side-by-side tests. - "The script `tools/makespec.py` can be used to convert" -- "`spec.txt`" -- into +- "`spec.txt` into" - HTML or CommonMark (which can then be converted into other formats - ). - "In the examples, the `→` character is used to represent tabs." @@ -227,24 +234,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - to a certain encoding. - "A [line](@) is a sequence of zero or more [characters" - "]\nother than line feed (`U+000A`" -- ") or carriage return (`U+000D`" -- "),\nfollowed by a [line ending] or by the end of file." +- ") or carriage return (`U+000D`),\nfollowed by a" +- "[line ending] or by the end of file." - "A [line ending](@) is a line feed (" - "`U+000A`), a carriage return\n(" -- "`U+000D`) not followed by a line feed, or a" -- "carriage return and a\nfollowing line feed." +- "`U+000D`" +- ") not followed by a line feed, or a carriage return and a" +- following line feed. - "A line containing no characters, or a line containing only spaces\n(" -- "`U+0020`) or tabs (`U+0009" -- "`), is called a [blank line](@)." +- "`U+0020`) or tabs (" +- "`U+0009`), is called a" +- "[blank line](@)." 
- "The following definitions of character classes will be used in this spec:" - "A [Unicode whitespace character](@)" -- "is a character in the Unicode `Zs`" -- " general\ncategory, or a tab (`U+0009`" +- "is a character in the Unicode `Zs` general" +- "category, or a tab (`U+0009`" - "), line feed (`U+000A`), form feed (" - "`U+000C`), or\ncarriage return (" - "`U+000D`)." -- "[Unicode whitespace](@) is a sequence of one or" -- "more\n[Unicode whitespace characters]." +- "[Unicode whitespace](@)" +- " is a sequence of one or more\n[Unicode whitespace characters]." - "A [tab](@) is `U+0009`." - "A [space](@) is `U+0020`." - "An [ASCII control character](@) is a character between" @@ -263,9 +272,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`` ` `` (U+005B–0060)," - "`{`, `|`, `}`, or `~`" - (U+007B–007E). -- "A [Unicode punctuation character](@) is a" -- "character in the Unicode `P`\n(puncuation) or" -- "`S` (symbol) general categories." +- "A [Unicode punctuation character](@)" +- "is a character in the Unicode `P`" +- "(puncuation) or `S` (symbol) general categories." - "## Tabs" - "Tabs in lines are not expanded to [spaces]. However," - "in contexts where spaces help to define block structure," @@ -318,15 +327,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "Normally the `>`" -- that begins a block quote may be followed +- "Normally the `>` that begins a block quote may be followed" - "optionally by a space, which is not considered part of the" -- "content. In the following case `>`" -- "is followed by a tab," +- "content. In the following case `>` is followed by a tab," - which is treated as if it were expanded into three spaces. 
- "Since one of these spaces is considered part of the\ndelimiter," -- "`foo`" -- is considered to be indented six spaces +- "`foo` is considered to be indented six spaces" - "inside the block quote context, so we get an indented" - code block starting with two spaces. - "````````````````" @@ -374,8 +380,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Insecure characters" -- "For security reasons, the Unicode character `U+0000` must" -- "be replaced\nwith the REPLACEMENT CHARACTER (" +- "For security reasons, the Unicode character `U+0000`" +- must be replaced +- with the REPLACEMENT CHARACTER ( - "`U+FFFD`)." - "## Backslash escapes" - "Any ASCII punctuation character may be backslash-escaped:" @@ -515,9 +522,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Valid HTML entity references and numeric character references - "can be used in place of the corresponding Unicode character," - "with the following exceptions:" -- "- Entity and character references are not recognized in code" -- blocks and code spans. -- "- Entity and character references cannot stand in place of" +- "-" +- "Entity and character references are not recognized in code\n blocks and code spans." +- "-" +- Entity and character references cannot stand in place of - "special characters that define structural elements in\n CommonMark. For example, although" - "`*` can be used\n in place of a literal" - "`*` character, `*` cannot replace\n `*`" @@ -525,8 +533,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Conforming CommonMark parsers need not store information about - whether a particular character was represented in the source - using a Unicode character or an entity reference. -- "[Entity references](@) consist of `&` + any" -- "of the valid\nHTML5 entity names + `;`. The\ndocument" +- "[Entity references](@) consist of `&`" +- " + any of the valid\nHTML5 entity names + `;`" +- ". 
The\ndocument" - "\nis used as an authoritative source for the valid entity" - references and their corresponding code points. @@ -542,15 +551,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "[Decimal numeric character\nreferences](@)\nconsist of" -- "`&#` + a string of 1--7 arabic digits" -- "+ `;`" -- ". A\nnumeric character reference is parsed as the corresponding" +- "`&#` + a string of 1--" +- "7 arabic digits + `;`. A" +- numeric character reference is parsed as the corresponding - Unicode character. - Invalid Unicode code points will be replaced by -- "the REPLACEMENT CHARACTER (`U+" -- "FFFD`). For security reasons,\nthe code point" -- "`U+0000` will also be replaced by `U+" -- "FFFD`." +- the REPLACEMENT CHARACTER ( +- "`U+FFFD`). For security reasons,\nthe code point" +- "`U+0000` will also be replaced by" +- "`U+FFFD`." - "````````````````" - "````````````````" - example @@ -588,8 +597,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - Although HTML5 does accept some entity references -- "without a trailing semicolon (such as `©`), these" -- "are not\nrecognized here, because it makes the grammar too ambiguous:" +- "without a trailing semicolon (such as `©`" +- "), these are not\nrecognized here, because it makes the grammar too ambiguous:" - "````````````````" - "````````````````" - "example\n©\n.\n

    &copy

    " @@ -712,8 +721,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "quotations, lists, headings, rules, and code blocks." - Some blocks (like - block quotes and list items) contain other blocks; others (like -- "headings and paragraphs) contain [inline](@) content-" -- "--text," +- "headings and paragraphs) contain [inline](@) content" +- "---text," - "links, emphasized text, images, code spans, and so on." - "## Precedence" - Indicators of block structure always take precedence over indicators @@ -737,8 +746,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - one block element does not affect the inline parsing of any other. - "## Container blocks and leaf blocks" - "We can divide blocks into two types:" -- "[container blocks](#container-blocks)" -- ",\nwhich can contain other blocks, and" +- "[container blocks](#container-blocks)," +- "which can contain other blocks, and" - "[leaf blocks](#leaf-blocks),\nwhich cannot." - "# Leaf blocks" - This section describes the different kinds of leaf block that make up a @@ -746,8 +755,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Thematic breaks" - "A line consisting of optionally up to three spaces of indentation, followed" - "by a\nsequence of three or more matching `-`, `_`" -- ", or `*`" -- "characters, each followed" +- ", or `*` characters, each followed" - "optionally by any number of spaces or tabs, forms a" - "[thematic break](@)." 
- "````````````````" @@ -908,8 +916,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## ATX headings" - "An [ATX heading](@)" - "consists of a string of characters, parsed as inline content, between an" -- "opening sequence of 1--6 unescaped `#` characters and an" -- "optional\nclosing sequence of any number of unescaped `#`" +- "opening sequence of 1--6 unescaped `#`" +- " characters and an optional\nclosing sequence of any number of unescaped `#`" - " characters.\nThe opening sequence of `#`" - "characters must be followed by spaces or tabs, or" - "by the end of line. The optional closing sequence of `#`" @@ -943,8 +951,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    ####### foo

    " - "````````````````" - "````````````````" -- "At least one space or tab is required between the `#` characters and" -- "the\nheading's contents, unless the heading is empty. Note that many" +- "At least one space or tab is required between the `#`" +- " characters and the\nheading'" +- "s contents, unless the heading is empty. Note that many" - "implementations currently do not require the space. However, the" - space was required by the - "[original ATX implementation](http://" @@ -1034,8 +1043,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    " - "````````````````" - "````````````````" -- "A sequence of `#`" -- characters with anything but spaces or tabs following it +- "A sequence of `#` characters with anything but spaces or tabs following it" - "is not a closing sequence, but counts as part of the contents of the" - "heading:" - "````````````````" @@ -1053,8 +1061,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo#

    " - "````````````````" - "````````````````" -- "Backslash-escaped `#`" -- " characters do not count as part\nof the closing sequence:" +- "Backslash-escaped `#` characters do not count as part" +- "of the closing sequence:" - "````````````````" - "````````````````" - example @@ -1095,8 +1103,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Setext headings" -- "A [setext heading](@)" -- consists of one or more +- "A [setext heading](@) consists of one or more" - "lines of text, not interrupted by a blank line, of which the first line" - "does not\nhave more than 3 spaces of indentation, followed by\na [" - "setext heading underline]. The lines of text must be such" @@ -1107,13 +1114,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "thematic breaks],\n[list item][list items], or [" - "HTML block][HTML blocks]." - "A [setext heading underline](@) is a sequence of" -- "`=` characters or a sequence of `-` characters, with no more" -- than 3 +- "`=` characters or a sequence of `-`" +- "characters, with no more than 3" - spaces of indentation and any number of trailing spaces or tabs. -- "The heading is a level 1 heading if `=`" -- " characters are used in\nthe [setext heading underline]" -- ", and a level 2 heading if `-`" -- characters are used. The contents of the heading are the result +- "The heading is a level 1 heading if `=` characters are used in\nthe" +- "[setext heading underline], and a level 2 heading if" +- "`-`\ncharacters are used. The contents of the heading are the result" - "of parsing the preceding lines of text as CommonMark inline\ncontent." - "In general, a setext heading need not be preceded or followed by a" - "blank line. However, it cannot interrupt a paragraph, so when a" @@ -1326,8 +1332,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    " - "````````````````" - "````````````````" -- "If you want a heading with `> foo` as its literal text," -- "you can\nuse backslash escapes:" +- "If you want a heading with `> foo`" +- " as its literal text, you can\nuse backslash escapes:" - "````````````````" - "````````````````" - example @@ -1335,8 +1341,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    > foo

    " - "````````````````" - "````````````````" -- "**Compatibility note:**" -- Most existing Markdown implementations +- "**Compatibility note:** Most existing Markdown implementations" - do not allow the text of setext headings to span multiple lines. - But there is no consensus about how to interpret - "``` markdown\nFoo\nbar\n---\nbaz\n```" @@ -1387,10 +1392,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Indented code blocks" -- "An [indented code block](@) is composed of one or" -- "more\n[indented chunks] separated by blank lines.\nAn" -- "[indented chunk](@) is a sequence of non-blank" -- "lines," +- "An [indented code block](@)" +- " is composed of one or more\n[indented chunks]" +- " separated by blank lines.\nAn [indented chunk](@)" +- "is a sequence of non-blank lines," - each preceded by four or more spaces of indentation. - "The contents of the code\nblock are the literal contents of the lines, including trailing" - "[line endings], minus four spaces of indentation." @@ -1509,13 +1514,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Fenced code blocks" -- "A [code fence](@)" -- " is a sequence\nof at least three consecutive backtick characters (" -- "`` ` ``) or\ntildes (`~`" +- "A [code fence](@) is a sequence" +- "of at least three consecutive backtick characters (`` ` ``) or" +- "tildes (`~`" - "). (Tildes and backticks cannot be mixed.)\nA" - "[fenced code block](@)" - "begins with a code fence, preceded by up to three spaces of indentation" -- ".\n\nThe line with the opening code fence may optionally contain some text" +- "." +- The line with the opening code fence may optionally contain some text - following the code fence; this is trimmed of leading and trailing - "spaces or tabs and called the [info string](@)" - ". If the [info string] comes" @@ -1796,8 +1802,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - foo

    - "````````````````" - "````````````````" -- "[Info strings] for tilde code blocks can contain backticks and" -- "tildes:" +- "[Info strings]" +- "for tilde code blocks can contain backticks and tildes:" - "````````````````" - "````````````````" - example @@ -1817,36 +1823,40 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## HTML blocks" -- "An [HTML block](@) is a group of lines that" -- is treated +- "An [HTML block](@)" +- is a group of lines that is treated - as raw HTML (and will not be escaped in HTML output -- ").\n\nThere are seven kinds of [HTML block]" +- ). +- "There are seven kinds of [HTML block]" - ", which can be defined by their" - start and end conditions. The block begins with a line that meets a -- "[start condition](@) (after up to three optional spaces of" -- "indentation).\nIt ends with the first subsequent line that meets a matching" -- "[end condition](@), or the last line of the document," -- "or the last line of\nthe [container block](#container-blocks)" -- " containing the current HTML\nblock, if no line is encountered that meets the" -- "[end condition]. If\nthe first line meets both the [start condition]" -- " and the [end\ncondition], the block will contain just that line." +- "[start condition](@)" +- (after up to three optional spaces of indentation). +- It ends with the first subsequent line that meets a matching +- "[end condition](@)" +- ", or the last line of the document, or the last line of\nthe" +- "[container block](#container-blocks) containing the current HTML" +- "block, if no line is encountered that meets the [end condition]. If" +- "the first line meets both the [start condition] and the [end\ncondition]" +- ", the block will contain just that line." - "1." - "**Start condition:** line begins with the string ``" +- "``" - ", or the end of the line.\\\n**End condition:**" - " line contains an end tag\n`
    `," -- "``, ``, or `" -- "` (case-insensitive; it" +- "``, ``, or" +- "`` (case-insensitive; it" - need not match the start tag). - "2." - "**Start condition:** line begins with the string ``." - "3." -- "**Start condition:** line begins with the string ``." - "4." - "**Start condition:** line begins with the string ``, or\nthe string `/>`.\\" - "**End condition:** line is followed by a [blank line]" - "." - "7." -- "**Start condition:**" -- " line begins with a complete [open tag]\n(with any [tag name]" -- " other than `pre`, `script`,\n`style`, or" -- "`textarea`" -- ") or a complete [closing tag]," +- "**Start condition:** line begins with a complete [open tag]" +- "(with any [tag name] other than `pre`, `script`" +- ",\n`style`, or `textarea`) or a complete [" +- "closing tag]," - "followed by zero or more spaces and tabs, followed by the end of the" -- "line.\\\n**End condition:**" -- "line is followed by a [blank line]." +- "line.\\\n**End condition:** line is followed by a [" +- "blank line]." - "HTML blocks continue until they are closed by their appropriate\n[end condition]" - ", or the last line of the document or other" - "[container\nblock](#container-blocks)" @@ -1903,8 +1912,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "be ignored by the parser and passed through as-is, without changing" - "the parser's state." - "For instance, `
    ` within an HTML block started by"
    -- "`
    `" -- will not affect +- "`
    ` will not affect" - the parser state; as the HTML block was started in by start - "condition 6, it\nwill end at any blank line. This can be surprising:" - "````````````````" @@ -2052,8 +2060,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "int x = 33;\n```" - "````````````````" - "````````````````" -- "To start an [HTML block] with a tag that is *not" -- "* in the" +- "To start an [HTML block] with a tag that is" +- "*not* in the" - "list of block-level tags in (6), you must put the tag" - "by\nitself on the first line (and it must be complete):" - "````````````````" @@ -2090,8 +2098,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`` tag is a nice example. We can surround content with" - "``" - "tags in three different ways. In this case, we get a raw" -- "HTML block, because the `` tag is on a" -- "line by itself:" +- "HTML block, because the ``" +- "tag is on a line by itself:" - "````````````````" - "````````````````" - example @@ -2100,8 +2108,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "In this case, we get a raw HTML block that just includes\nthe" -- "``" -- tag (because it ends with the following blank +- "`` tag (because it ends with the following blank" - "line). So the contents get interpreted as CommonMark:" - "````````````````" - "````````````````" @@ -2111,8 +2118,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "Finally, in this case, the ``" -- " tags are interpreted\nas [raw HTML] *inside*" +- "Finally, in this case, the `` tags are interpreted\nas" +- "[raw HTML] *inside*" - the CommonMark paragraph. 
(Because - "the tag is not on a line by itself, we get inline" - "HTML\nrather than an [HTML block].)" @@ -2125,8 +2132,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "HTML tags designed to contain literal content\n(`pre`," -- "`script`, `style`, `textarea`), comments" -- ", processing instructions,\nand declarations are treated somewhat differently." +- "`script`, `style`, `textarea`" +- "), comments, processing instructions,\nand declarations are treated somewhat differently." - "Instead of ending at the first blank line, these blocks" - end at the first line containing a corresponding end tag. - "As a result, these blocks can contain blank lines:" @@ -2294,8 +2301,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "An HTML block of types 1--6 can interrupt a paragraph," -- "and need not be\npreceded by a blank line." +- An HTML block of types 1-- +- "6 can interrupt a paragraph, and need not be\npreceded by a blank line." - "````````````````" - "````````````````" - example @@ -2325,22 +2332,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "This rule differs from John Gruber's original Markdown syntax" - "specification, which says:" -- "> The only restrictions are that block-level HTML elements —\n>" +- ">" +- "The only restrictions are that block-level HTML elements —\n>" - "e.g. `
    `, `
    `" -- ", `
    `, `

    `, etc. —" -- "must be separated from\n>" +- ", `

    `, `

    `" +- ", etc. — must be separated from\n>" - "surrounding content by blank lines, and the start and end tags of the\n>" - block should not be indented with spaces or tabs. -- "In some ways Gruber's rule is more restrictive than the one" -- "given\nhere:" +- "In some ways Gruber'" +- "s rule is more restrictive than the one given\nhere:" - "- It requires that an HTML block be preceded by a blank line." - "- It does not allow the start tag to be indented." - "- It requires a matching end tag, which it also does not allow to" - be indented. -- "Most Markdown implementations (including some of Gruber's own)" -- "do not\nrespect all of these restrictions." -- "There is one respect, however, in which Gruber's rule is" -- "more liberal\nthan the one given here, since it allows blank lines to occur inside" +- "Most Markdown implementations (including some of Gruber'" +- "s own) do not\nrespect all of these restrictions." +- "There is one respect, however, in which Gruber'" +- s rule is more liberal +- "than the one given here, since it allows blank lines to occur inside" - an HTML block. - There are two reasons for disallowing them here. - "First, it removes the need to parse balanced tags, which is" @@ -2368,14 +2377,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - Some Markdown implementations have adopted a convention of - "interpreting content inside tags as text if the open tag has\nthe attribute" -- "`markdown=1`" -- ". The rule given above seems a simpler and" +- "`markdown=1`. The rule given above seems a simpler and" - "more elegant way of achieving the same expressive power, which is also" - much simpler to parse. - The main potential drawback is that one can no longer paste HTML - "blocks into Markdown documents with 100% reliability. 
However," -- "*in most cases*" -- "this will work fine, because the blank lines in" +- "*in most cases* this will work fine, because the blank lines in" - "HTML are usually followed by HTML block tags. For example:" - "````````````````" - "````````````````" @@ -2387,8 +2394,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "There are problems, however, if the inner tags are indented" -- "*and*" -- "separated by spaces, as then they will be interpreted as" +- "*and* separated by spaces, as then they will be interpreted as" - "an indented code block:" - "````````````````" - "````````````````" @@ -2403,16 +2409,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "Fortunately, blank lines are usually not necessary and can be" -- "deleted. The exception is inside `

    `"
    -- " tags, but as described\n[above][HTML blocks]"
    +- "deleted.  The exception is inside `
    ` tags, but as described"
    +- "[above][HTML blocks]"
     - ", raw HTML blocks starting with `
    `\n*can*"
     - contain blank lines.
     - "## Link reference definitions"
    -- "A [link reference definition](@)"
    -- "consists of a [link label]"
    +- "A [link reference definition](@)\nconsists of a [link label]"
     - ", optionally preceded by up to three spaces of\nindentation, followed"
    -- "by a colon (`:`), optional spaces or tabs ("
    -- "including up to one\n[line ending]), a [link destination],"
    +- "by a colon (`:`"
    +- "), optional spaces or tabs (including up to one\n[line ending]"
    +- "), a [link destination],"
     - "optional spaces or tabs (including up to one\n[line ending]"
     - "), and an optional [link\ntitle]"
     - ", which if it is present must be separated\nfrom the [link destination]"
    @@ -2697,8 +2703,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "## Paragraphs"
     - A sequence of non-blank lines that cannot be interpreted as other
    -- "kinds of blocks forms a [paragraph](@)"
    -- ".\nThe contents of the paragraph are the result of parsing the\nparagraph'"
    +- "kinds of blocks forms a [paragraph](@)."
    +- "The contents of the paragraph are the result of parsing the\nparagraph'"
     - "s raw content as inlines.  The paragraph's raw content"
     - is formed by concatenating the lines and removing initial and final
     - "spaces or tabs.\n\nA simple example with two paragraphs:"
    @@ -2781,29 +2787,30 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "````````````````"
     - "# Container blocks"
    -- "A [container block](#container-blocks) is a block that has"
    -- "other\nblocks as its contents.  There are two basic kinds of container blocks:\n["
    +- "A [container block](#container-blocks)"
    +- is a block that has other
    +- "blocks as its contents.  There are two basic kinds of container blocks:\n["
     - "block quotes] and [list items].\n[Lists]"
     - "are meta-containers for [list items]."
     - We define the syntax for container blocks recursively.  The general
     - "form of the definition is:"
    -- "> If X is a sequence of blocks, then the result of\n>"
    +- ">"
    +- "If X is a sequence of blocks, then the result of\n>"
     - transforming X in such-and-such a way is a container of type Y
     - "> with these blocks as its content."
     - "So, we explain what counts as a block quote or list item by explaining"
    -- how these can be *generated*
    -- from their contents. This should suffice
    -- "to define the syntax, although it does not give a recipe for *parsing"
    -- "*\nthese constructions.  (A recipe is provided below in the section entitled"
    +- how these can be *generated* from their contents. This should suffice
    +- "to define the syntax, although it does not give a recipe for"
    +- "*parsing*"
    +- these constructions.  (A recipe is provided below in the section entitled
     - "[A parsing strategy](#appendix-a-parsing"
     - "-strategy).)"
     - "## Block quotes"
    -- "A [block quote marker](@)"
    -- ",\noptionally preceded by up to three spaces of indentation,"
    -- "consists of (a) the character `>`"
    -- together with a following space of
    -- "indentation, or (b) a single character `>` not followed"
    -- "by a space of\nindentation."
    +- "A [block quote marker](@),"
    +- "optionally preceded by up to three spaces of indentation,"
    +- "consists of (a) the character `>` together with a following space of"
    +- "indentation, or (b) a single character `>`"
    +- " not followed by a space of\nindentation."
     - "The following rules define [block quotes]:"
     - "1."
     - "**Basic case.**  If a string of lines *Ls*"
    @@ -2814,20 +2821,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "2."
     - "**Laziness.**  If a string of lines"
     - "*Ls* constitute a [block\n    quote](#block-quotes)"
    -- with contents *Bs*
    -- ", then the result of deleting\n    the initial [block quote marker]"
    -- from one or
    +- " with contents *Bs*, then the result of deleting\n    the initial ["
    +- "block quote marker] from one or"
     - more lines in which the next character other than a space or tab after the
     - "[block quote marker] is [paragraph continuation\n    text] is a block quote with"
     - "*Bs* as its content."
    -- "[Paragraph continuation text](@)"
    -- is text
    +- "[Paragraph continuation text](@) is text"
     - "that will be parsed as part of the content of a paragraph, but does"
     - not occur at the beginning of the paragraph.
     - "3."
    -- "**Consecutiveness.**"
    -- "  A document cannot contain two [block\n    quotes] in a row unless there is a"
    -- "[blank line] between them."
    +- "**Consecutiveness.**  A document cannot contain two ["
    +- "block\n    quotes] in a row unless there is a [blank line]"
    +- between them.
     - "Nothing else counts as a [block quote](#block-quotes)."
     - "Here is a simple example:"
     - "````````````````"
    @@ -2847,8 +2852,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "baz

    \n" - "````````````````" - "````````````````" -- "The `>` characters can be preceded by up to three spaces of" -- "indentation:" +- "The `>`" +- "characters can be preceded by up to three spaces of indentation:" - "````````````````" - "````````````````" - example @@ -2867,8 +2872,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    " - "````````````````" - "````````````````" -- "The Laziness clause allows us to omit the `>`" -- " before\n[paragraph continuation text]:" +- "The Laziness clause allows us to omit the `>` before\n[" +- "paragraph continuation text]:" - "````````````````" - "````````````````" - example @@ -2911,8 +2916,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
  • bar
  • \n" - "````````````````" - "````````````````" -- "For the same reason, we can't omit the `> ` in" -- "front of\nsubsequent lines of an indented or fenced code block:" +- "For the same reason, we can't omit the `> `" +- " in front of\nsubsequent lines of an indented or fenced code block:" - "````````````````" - "````````````````" - example @@ -2978,8 +2983,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "(Most current Markdown implementations, including John Gruber's\noriginal" -- "`Markdown.pl`, will parse this example as a single" -- block quote +- "`Markdown.pl`" +- ", will parse this example as a single block quote" - with two paragraphs. But it seems better to allow the author to decide - whether two block quotes or one are wanted.) - "Consecutiveness means that if we put these block quotes together," @@ -3046,8 +3051,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "It is a consequence of the Laziness rule that any number\nof initial" -- "`>`" -- "s may be omitted on a continuation line of a\nnested block quote:" +- "`>`s may be omitted on a continuation line of a" +- "nested block quote:" - "````````````````" - "````````````````" - example @@ -3070,8 +3075,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "When including an indented code block in a block quote,\nremember that the" - "[block quote marker] includes\nboth the `>`" -- and a following space of indentation. So *five spaces* -- " are needed\nafter the `>`:" +- and a following space of indentation. So *five spaces* are needed +- "after the `>`:" - "````````````````" - "````````````````" - example @@ -3082,14 +3087,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## List items" -- "A [list marker](@)" -- " is a\n[bullet list marker] or an [ordered list marker]." 
+- "A [list marker](@) is a\n[bullet list marker]" +- "or an [ordered list marker]." - "A [bullet list marker](@)\nis a `-`," - "`+`, or `*` character." -- "An [ordered list marker](@)" -- is a sequence of 1--9 arabic digits ( -- "`0-9`), followed by either a\n`.`" -- "character or a `)`" +- "An [ordered list marker](@)\nis a sequence of 1--" +- "9 arabic digits (`0-9`" +- "), followed by either a\n`.` character or a `)`" - character. (The reason for the length - limit is that with 10 digits we start seeing integer overflows - "in some browsers.)\n\nThe following rules define [list items]:" @@ -3097,11 +3101,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "**Basic case.** If a sequence of lines *Ls*" - " constitute a sequence of\n blocks *Bs*" - "starting with a character other than a space or tab, and *M*" -- "is\n a list marker of width *W* followed by 1 ≤ *N*" +- " is\n a list marker of width *W* followed by 1 ≤ *N*" - " ≤ 4 spaces of indentation,\n then the result of prepending" - "*M* and the following spaces to the first line\n of *Ls*" -- ", and indenting subsequent lines of *Ls* by *W +" -- "N* spaces, is a\n list item with *Bs*" +- ", and indenting subsequent lines of *Ls* by" +- "*W + N* spaces, is a\n list item with *Bs*" - as its contents. The type of the list item - (bullet or ordered) is determined by the type of its list marker. - "If the list item is ordered, then it is also assigned a start" @@ -3109,8 +3113,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "1. When the first list item in a [list] interrupts\n a paragraph" - "---that is, when it starts on a line that would" - "otherwise count as [paragraph continuation text]---then (a)" -- "the lines *Ls* must not begin with a blank line, and (" -- "b) if\n the list item is ordered, the start number must be 1." +- the lines *Ls* +- "must not begin with a blank line, and (b) if" +- "the list item is ordered, the start number must be 1." - "2. 
If any line is a [thematic break][thematic breaks" - "] then\n that line is not a list item." - "For example, let *Ls* be the lines" @@ -3125,8 +3130,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "And let *M* be the marker `1.`, and *N" -- "* = 2. Then rule #1 says" +- "And let *M* be the marker `1.`, and" +- "*N* = 2. Then rule #1 says" - "that the following is an ordered list item with start number 1," - "and the same contents as *Ls*:" - "````````````````" @@ -3203,14 +3208,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "Here `two` occurs in the same column as the list marker" -- "`1.`" -- ",\nbut is actually contained in the list item, because there is" +- "`1.`," +- "but is actually contained in the list item, because there is" - sufficient indentation after the last containing blockquote marker. - "The converse is also possible. In the following example, the word" - "`two`" - "occurs far to the right of the initial text of the list item," -- "`one`" -- ", but" +- "`one`, but" - "it is not considered part of the list item, because it is not" - "indented\nfar enough past the blockquote marker:" - "````````````````" @@ -3308,8 +3312,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - 2. 
**Item starting with indented code.** - " If a sequence of lines *Ls*\n constitute a sequence of blocks" - "*Bs* starting with an indented code\n block, and" -- "*M* is a list marker of width *W*" -- " followed by\n one space of indentation, then the result of prepending" +- "*M* is a list marker of width *W* followed by" +- "one space of indentation, then the result of prepending" - "*M* and the\n following space to the first line of *Ls*" - ", and indenting subsequent lines\n of *Ls* by" - "*W + 1* spaces, is a list item with *Bs*" @@ -3342,9 +3346,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````" -- If the *first* block in the list item is an indented code -- "block,\nthen by rule #2, the contents must be preceded by" -- "*one* space of indentation\nafter the list marker:" +- If the *first* +- "block in the list item is an indented code block," +- "then by rule #2, the contents must be preceded by *one*" +- " space of indentation\nafter the list marker:" - "````````````````" - "````````````````" - example @@ -3413,14 +3418,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - 3. **Item starting with a blank line.** -- If a sequence of lines *Ls* -- "starting with a single [blank line] constitute a (possibly empty)" -- "sequence of blocks *Bs*, and *M* is a list marker" -- "of width *W*,\n then the result of prepending *M*" -- " to the first line of *Ls*, and\n preceding subsequent lines of" -- "*Ls* by *W + 1* spaces of indentation," -- "is a\n list item with *Bs*" -- as its contents. +- " If a sequence of lines *Ls*\n starting with a single [blank line" +- "] constitute a (possibly empty)\n sequence of blocks *Bs*, and" +- "*M* is a list marker of width *W*," +- then the result of prepending *M* to the first line of +- "*Ls*, and\n preceding subsequent lines of *Ls* by" +- "*W + 1* spaces of indentation, is a\n list item with" +- "*Bs* as its contents." 
- "If a line is empty, then it need not be indented." - The type of the - list item (bullet or ordered) is determined by the type of its list @@ -3449,8 +3453,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - A list item can begin with at most one blank line. -- "In the following example, `foo`" -- " is not part of the list\nitem:" +- "In the following example, `foo` is not part of the list" +- "item:" - "````````````````" - "````````````````" - example @@ -3506,8 +3510,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - 4. **Indentation.** If a sequence of lines -- "*Ls*" -- constitutes a list item +- "*Ls* constitutes a list item" - "according to rule #1, #2, or #3, then the result" - "of preceding each line\n of *Ls*" - by up to three spaces of indentation (the same for each line) @@ -3567,8 +3570,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - 5. **Laziness.** If a string of lines - "*Ls* constitute a [list\n item](#list-items)" -- with contents *Bs* -- ", then the result of deleting" +- "with contents *Bs*, then the result of deleting" - some or all of the indentation from one or more lines in which the - "next character other than a space or tab after the indentation is\n [" - "paragraph continuation text] is a" @@ -3701,27 +3703,30 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "### Motivation" -- "John Gruber's Markdown spec says the following about list items" -- ":" -- "1. \"" -- "List markers typically start at the left margin, but may be indented" +- "John Gruber'" +- "s Markdown spec says the following about list items:" +- "1." +- "\"List markers typically start at the left margin, but may be indented" - by up to three spaces. List markers must be followed by one or more - "spaces or a tab.\"" -- "2. 
\"" -- "To make lists look nice, you can wrap items with hanging indents." -- "...\n But if you don't want to, you don'" +- "2." +- "\"To make lists look nice, you can wrap items with hanging indents" +- "....\n But if you don't want to, you don'" - "t have to.\"" -- "3. \"List items may consist of multiple paragraphs. Each subsequent" +- "3." +- "\"List items may consist of multiple paragraphs. Each subsequent" - paragraph in a list item must be indented by either 4 spaces or one - "tab.\"" -- "4. \"" -- "It looks nice if you indent every line of the subsequent paragraphs," +- "4." +- "\"It looks nice if you indent every line of the subsequent paragraphs," - "but here again, Markdown will allow you to be lazy.\"" -- "5. \"" +- "5." +- "\"" - "To put a blockquote within a list item, the" - "blockquote's `>`" - "delimiters need to be indented.\"" -- "6. \"" +- "6." +- "\"" - "To put a code block within a list item, the code block needs to be" - "indented twice — 8 spaces or two tabs.\"" - These rules specify that a paragraph under a list item must be indented @@ -3732,14 +3737,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that a block quote must be indented, but not by how much;" - "however, the\nexample given has four spaces indentation. Although nothing is said" - "about other kinds of block-level content, it is certainly reasonable to" -- infer that *all* -- "block elements under a list item, including other" +- "infer that *all* block elements under a list item, including other" - "lists, must be indented four spaces. This principle has been called the" - "*four-space rule*." - "The four-space rule is clear and principled, and if the reference\nimplementation" -- "`Markdown.pl`" -- " had followed it, it probably would have\nbecome the standard. However," -- "`Markdown.pl`" +- "`Markdown.pl` had followed it, it probably would have" +- "become the standard. 
However, `Markdown.pl`" - allowed paragraphs and - "sublists to start with only two spaces indentation, at least on the" - "outer level. Worse, its behavior was inconsistent: a sublist of an" @@ -3748,17 +3751,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - implementations of Markdown have developed very different rules for - determining what comes under a list item. - "(Pandoc and python-Markdown," -- "for example, stuck with Gruber's syntax description and the four-" -- space +- "for example, stuck with Gruber'" +- s syntax description and the four-space - "rule, while discount, redcarpet, marked, PHP Markdown," -- "and others\nfollowed `Markdown.pl`" -- "'s behavior more closely.)" +- "and others\nfollowed `Markdown.pl`'" +- s behavior more closely.) - "Unfortunately, given the divergences between implementations, there" - is no way to give a spec for list items that will be guaranteed not - "to break any existing documents. However, the spec given here should" - correctly handle lists formatted with either the four-space rule or -- "the more forgiving `Markdown.pl` behavior, provided they" -- "are laid out\nin a way that is natural for a human to read." +- "the more forgiving `Markdown.pl`" +- "behavior, provided they are laid out" +- in a way that is natural for a human to read. - The strategy here is to let the width and indentation of the list marker - determine the indentation necessary for blocks to fall under the list - "item, rather than having a fixed and arbitrary number. The writer can" @@ -3773,14 +3777,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - unnatural. It is quite unintuitive that - "``` markdown\n- foo\n\n bar\n\n - baz\n```" - "should be parsed as two lists with an intervening paragraph," -- "``` html\n
      " -- "
    • foo
    • \n
    " -- "

    bar

    \n
      " +- "``` html" +- "
        \n
      • foo
      • " +- "
      \n

      bar

      \n
        " - "
      • baz
      • \n
      " - "```" - "as the four-space rule demands, rather than a single list," -- "``` html\n
        \n
      • " -- "

        foo

        " +- "``` html" +- "
          \n
        • \n

          foo

          " - "

          bar

          \n
            " - "
          • baz
          • \n
          " - "
        • \n
        \n```" @@ -3789,20 +3793,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Would it help to adopt a two-space rule? The problem is that such - "a rule, together with the rule allowing up to three spaces of indentation" - "for\nthe initial list marker, allows text that is indented" -- "*less than*" -- " the\noriginal list marker to be included in the list item. For example," +- "*less than* the" +- "original list marker to be included in the list item. For example," - "`Markdown.pl` parses" - "``` markdown\n - one\n\n two\n```" - "as a single list item, with `two` a continuation paragraph:" -- "``` html\n
          \n
        • " -- "

          one

          \n

          two

          " -- "
        • \n
        \n```\n\nand similarly" -- "``` markdown\n> - one\n>\n> two\n```\n\nas" -- "``` html\n
        \n
          " -- "
        • \n

          one

          " +- "``` html" +- "
            \n
          • \n

            one

            " - "

            two

            \n
          • " -- "
          \n
        \n```" -- This is extremely unintuitive. +- "
      \n```\n\nand similarly" +- "``` markdown\n> - one\n>\n> two\n```\n\nas" +- "``` html" +- "
      \n
        \n
      • " +- "

        one

        \n

        two

        " +- "
      • \n
      \n
      " +- "```\n\nThis is extremely unintuitive." - "Rather than requiring a fixed indent from the margin, we could require" - "a fixed indent (say, two spaces, or even one space) from" - the list marker (which @@ -3822,8 +3827,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` markdown\n1. foo\n\n indented code\n```" - where the code is indented eight spaces. - "The spec above, by contrast, will" -- "parse this text as expected, since the code block's indentation" -- "is measured\nfrom the beginning of `foo`." +- "parse this text as expected, since the code block'" +- "s indentation is measured\nfrom the beginning of `foo`." - The one case that needs special treatment is a list item that *starts* - with indented code. - "How much indentation is required in that case, since\nwe don'" @@ -3835,30 +3840,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - four-space rule in cases where the list marker plus its initial indentation - "takes four spaces (a common case), but diverge in other cases." - "## Lists" -- "A [list](@)" -- " is a sequence of one or more\nlist items [of the same type]" -- ". The list items\nmay be separated by any number of blank lines." +- "A [list](@) is a sequence of one or more\nlist items" +- "[of the same type]. The list items" +- may be separated by any number of blank lines. - "Two list items are [of the same type](@)" - "if they begin with a [list marker] of the same type." - Two list markers are of the - "same type if (a) they are bullet list markers using the same character\n(" -- "`-`, `+`, or `*`) or (b" -- ") they are ordered list numbers with the same\ndelimiter (either" -- "`.` or `)`)." +- "`-`, `+`, or `*`" +- ) or (b) they are ordered list numbers with the same +- "delimiter (either `.` or `)`)." - "A list is an [ordered list](@)" - "if its constituent list items begin with\n[ordered list markers], and a" -- "[bullet list](@)" -- " if its constituent list\nitems begin with [bullet list markers]." 
-- "The [start number](@)" -- "of an [ordered list] is determined by the list number of" +- "[bullet list](@) if its constituent list\nitems begin with [" +- "bullet list markers]." +- "The [start number](@)\nof an [ordered list]" +- is determined by the list number of - its initial list item. The numbers of subsequent list items are - disregarded. -- "A list is [loose](@)" -- if any of its constituent +- "A list is [loose](@) if any of its constituent" - "list items are separated by blank lines, or if any of its constituent" - list items directly contain two block-level elements with a blank line -- "between them. Otherwise a list is [tight](@)" -- "." +- "between them. Otherwise a list is [tight](@)." - (The difference in HTML output is that paragraphs in a loose list - "are\nwrapped in `

      `" - "tags, while paragraphs in a tight list are not.)" @@ -3894,33 +3897,38 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    • baz
    • \n
    " - "````````````````" - "````````````````" -- "`Markdown.pl` does not allow this, through fear of" -- "triggering a list\nvia a numeral in a hard-wrapped line:" -- "``` markdown\nThe number of windows in my house is\n14." +- "`Markdown.pl`" +- "does not allow this, through fear of triggering a list" +- "via a numeral in a hard-wrapped line:" +- "``` markdown" +- "The number of windows in my house is\n14." - "The number of doors is 6.\n```" -- "Oddly, though, `Markdown.pl` *does* allow" -- "a blockquote to\ninterrupt a paragraph, even though the same considerations might" -- apply. +- "Oddly, though, `Markdown.pl` *does*" +- allow a blockquote to +- "interrupt a paragraph, even though the same considerations might\napply." - "In CommonMark, we do allow lists to interrupt paragraphs, for" - "two reasons. First, it is natural and not uncommon for people" - "to start lists without blank lines:" -- "``` markdown\nI need to buy\n- new shoes\n- a coat" -- "- a plane ticket\n```\n\nSecond, we are attracted to a" +- "``` markdown" +- "I need to buy\n- new shoes\n- a coat\n- a plane ticket" +- "```\n\nSecond, we are attracted to a" - ">" -- "[principle of uniformity](@)" -- ":\n> if a chunk of text has a certain\n>" +- "[principle of uniformity](@):\n>" +- "if a chunk of text has a certain\n>" - "meaning, it will continue to have the same meaning when put into a\n>" - container block (such as a list item or blockquote). - "(Indeed, the spec for [list items] and [block quotes]" -- "presupposes\nthis principle.) This principle implies that if" -- "``` markdown\n * I need to buy\n - new shoes\n - a coat" -- " - a plane ticket\n```" +- " presupposes\nthis principle.) This principle implies that if" +- "``` markdown" +- " * I need to buy\n - new shoes\n - a coat\n - a plane ticket" +- "```" - "is a list item containing a paragraph followed by a nested sublist," - as all Markdown implementations agree it is (though the paragraph - "may be rendered without `

    ` tags, since the list is \"" - "tight\"),\nthen" -- "``` markdown\nI need to buy\n- new shoes\n- a coat" -- "- a plane ticket\n```" +- "``` markdown" +- "I need to buy\n- new shoes\n- a coat\n- a plane ticket" +- "```" - by itself should be a paragraph followed by a nested sublist. - Since it is well established Markdown practice to allow lists to - "interrupt paragraphs inside list items, the [principle of\nuniformity]" @@ -3931,7 +3939,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - even inside other list items.) - In order to solve the problem of unwanted lists in paragraphs with - "hard-wrapped numerals, we allow only lists starting with `1`" -- "to\ninterrupt paragraphs. Thus," +- " to\ninterrupt paragraphs. Thus," - "````````````````" - "````````````````" - example @@ -4046,9 +4054,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- e\n" - "````````````````" - "````````````````" -- "And here, `3. c` is treated as in indented code" -- "block,\nbecause it is indented four spaces and preceded by a" -- blank line. +- "And here, `3. c`" +- "is treated as in indented code block," +- "because it is indented four spaces and preceded by a\nblank line." - "````````````````" - "````````````````" - example @@ -4220,19 +4228,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">" - "````````````````" - "````````````````" -- "`hi` is parsed as code, leaving the backtick at the end" -- "as a literal\nbacktick." +- "`hi`" +- "is parsed as code, leaving the backtick at the end as a literal" +- backtick. - "## Code spans" - "A [backtick string](@)" - "is a string of one or more backtick characters (`` ` ``" - ") that is neither\npreceded nor followed by a backtick." -- "A [code span](@) begins with a backtick string and ends" -- "with\na backtick string of equal length. The contents of the code span are" +- "A [code span](@)" +- begins with a backtick string and ends with +- a backtick string of equal length. 
The contents of the code span are - "the characters between these two backtick strings, normalized in the\nfollowing ways:" - "- First, [line endings] are converted to [spaces]." -- "- If the resulting string both begins *and*" -- " ends with a [space]\n character, but does not consist entirely of [space" -- "]\n characters, a single [space] character is removed from the" +- "- If the resulting string both begins *and* ends with a [space]" +- "character, but does not consist entirely of [space]\n characters, a single [" +- "space] character is removed from the" - front and back. This allows you to include code that begins - "or ends with backtick characters, which must be separated by" - whitespace from the opening or closing backtick strings. @@ -4278,8 +4288,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    a

    " - "````````````````" - "````````````````" -- "Only [spaces], and not [unicode whitespace] in general" -- ", are\nstripped in this way:" +- "Only [spaces], and not [unicode whitespace]" +- " in general, are\nstripped in this way:" - "````````````````" - "````````````````" - example @@ -4334,8 +4344,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "Backslash escapes are never needed, because one can always choose a\nstring of" -- "*n*" -- "backtick characters as delimiters, where the code does" +- "*n* backtick characters as delimiters, where the code does" - not contain any strings of exactly *n* backtick characters. - "````````````````" - "````````````````" @@ -4356,8 +4365,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Code span backticks have higher precedence than any other inline - constructs except HTML tags and autolinks. - "Thus, for example, this is" -- "not parsed as emphasized text, since the second `*` is part of" -- "a code\nspan:" +- "not parsed as emphasized text, since the second `*`" +- " is part of a code\nspan:" - "````````````````" - "````````````````" - example @@ -4446,20 +4455,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[Markdown syntax\ndescription" - "](https://daringfireball.net/projects/markdown" - "/syntax#em) says:" -- "> Markdown treats asterisks (`*`) and" -- "underscores (`_`" -- ") as indicators of\n> emphasis. Text wrapped with one `*` or" -- "`_` will be wrapped with an HTML\n>" -- "`` tag; double `*`'s or `_" -- "`'s will be wrapped with an HTML ``" -- "> tag." +- ">" +- "Markdown treats asterisks (`*`" +- ") and underscores (`_`) as indicators of\n>" +- "emphasis. Text wrapped with one `*` or `_`" +- " will be wrapped with an HTML\n> ``" +- "tag; double `*`'s or `_`'" +- "s will be wrapped with an HTML ``\n>" +- tag. - "This is enough for most users, but these rules leave much undecided," - especially when it comes to nested emphasis. 
The original -- "`Markdown.pl` test suite makes it clear that triple `*" -- "**` and\n`___`" +- "`Markdown.pl` test suite makes it clear that triple" +- "`***` and\n`___`" - "delimiters can be used for strong emphasis, and most" - "implementations have also allowed the following patterns:" -- "``` markdown\n***strong emph***" +- "``` markdown" +- "***strong emph***" - "***strong** in emph*" - "***emph* in strong**" - "**in strong *emph***" @@ -4467,13 +4478,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The following patterns are less widely supported, but the intent" - is clear and they are useful (especially in contexts like bibliography - "entries):" -- "``` markdown\n*emph *with emph* in it*" +- "``` markdown" +- "*emph *with emph* in it*" - "**strong **with strong** in it**\n```" - "Many implementations have also restricted intraword emphasis to\nthe `*`" - "forms, to avoid unwanted emphasis in words containing" - internal underscores. (It is best practice to put these in code - "spans, but users often do not.)" -- "``` markdown\ninternal emphasis: foo*bar*baz" +- "``` markdown" +- "internal emphasis: foo*bar*baz" - "no emphasis: foo_bar_baz\n```" - "The rules given below capture all of these patterns, while allowing" - for efficient parsing strategies that do not backtrack. @@ -4483,36 +4496,38 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`*` character, or a sequence\nof one or more `_`" - " characters that is not preceded or followed by\na non-backslash-escaped" - "`_` character." 
-- "A [left-flanking delimiter run](@)" -- " is\na [delimiter run] that is (1) not followed by" -- "[Unicode whitespace]," -- "and either (2a) not followed by a [Unicode" -- "punctuation character], or\n(2b) followed by a [" -- "Unicode punctuation character] and\npreceded by [" -- "Unicode whitespace] or a [Unicode punctuation" -- "character].\nFor purposes of this definition, the beginning and the end of" +- "A [left-flanking delimiter run](@) is\na" +- "[delimiter run] that is (1) not followed by [" +- "Unicode whitespace]," +- "and either (2a) not followed by a [" +- "Unicode punctuation character], or" +- "(2b) followed by a [Unicode punctuation character" +- "] and\npreceded by [Unicode whitespace] or a [" +- "Unicode punctuation character]." +- "For purposes of this definition, the beginning and the end of" - the line count as Unicode whitespace. -- "A [right-flanking delimiter run](@)" -- " is\na [delimiter run] that is (1) not preceded by" -- "[Unicode whitespace]," -- "and either (2a) not preceded by a [Unicode" -- "punctuation character], or\n(2b) preceded by a [" -- "Unicode punctuation character] and\nfollowed by [" -- "Unicode whitespace] or a [Unicode punctuation" -- "character].\nFor purposes of this definition, the beginning and the end of" +- "A [right-flanking delimiter run](@) is\na" +- "[delimiter run] that is (1) not preceded by [" +- "Unicode whitespace]," +- "and either (2a) not preceded by a [" +- "Unicode punctuation character], or" +- "(2b) preceded by a [Unicode punctuation character" +- "] and\nfollowed by [Unicode whitespace] or a [" +- "Unicode punctuation character]." +- "For purposes of this definition, the beginning and the end of" - the line count as Unicode whitespace. - Here are some examples of delimiter runs. 
-- " - left-flanking but not right-flanking:\n\n ```" -- " ***abc\n _abc" +- "- left-flanking but not right-flanking:" +- " ```\n ***abc\n _abc" - "**\"abc\"\n _\"abc\"\n ```" -- " - right-flanking but not left-flanking:\n\n ```" -- " abc***\n abc_" +- "- right-flanking but not left-flanking:" +- " ```\n abc***\n abc_" - "\"abc\"**\n \"abc\"_\n ```" -- " - Both left and right-flanking:\n\n ```" -- " abc***def\n \"abc\"_\"def\"" -- "```" -- " - Neither left nor right-flanking:\n\n ```" -- " abc *** def\n a _ b\n ```" +- "- Both left and right-flanking:" +- " ```\n abc***def" +- "\"abc\"_\"def\"\n ```" +- "- Neither left nor right-flanking:" +- " ```\n abc *** def\n a _ b\n ```" - (The idea of distinguishing left-flanking and right-flanking - delimiter runs based on the character before and the character - "after comes from Roopesh Chander's" @@ -4530,51 +4545,48 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "iff (if and only if) it is part of a [" - "left-flanking delimiter run]." - "2." -- "A single `_`" -- " character [can open emphasis] iff\n it is part of a [" -- "left-flanking delimiter run]" -- "and either (a) not part of a [right-flanking" -- "delimiter run]\n or (b) part of a [" -- "right-flanking delimiter run]\n preceded by a [" +- "A single `_` character [can open emphasis] iff" +- "it is part of a [left-flanking delimiter run]" +- "and either (a) not part of a [" +- "right-flanking delimiter run]\n or (b) part of a" +- "[right-flanking delimiter run]\n preceded by a [" - "Unicode punctuation character]." - "3." - "A single `*` character [can close emphasis](@)" - "iff it is part of a [right-flanking delimiter run" - "]." - "4." 
-- "A single `_`" -- " character [can close emphasis] iff\n it is part of a [" -- "right-flanking delimiter run]" -- "and either (a) not part of a [left-flanking" -- "delimiter run]\n or (b) part of a [" -- "left-flanking delimiter run]\n followed by a [" +- "A single `_` character [can close emphasis] iff" +- "it is part of a [right-flanking delimiter run]" +- "and either (a) not part of a [" +- "left-flanking delimiter run]\n or (b) part of a" +- "[left-flanking delimiter run]\n followed by a [" - "Unicode punctuation character]." - "5." - "A double `**` [can open strong emphasis](@)" - "iff it is part of a [left-flanking delimiter run" - "]." - "6." -- "A double `__`" -- " [can open strong emphasis] iff\n it is part of a [" -- "left-flanking delimiter run]" -- "and either (a) not part of a [right-flanking" -- "delimiter run]\n or (b) part of a [" -- "right-flanking delimiter run]\n preceded by a [" +- "A double `__` [can open strong emphasis] iff" +- "it is part of a [left-flanking delimiter run]" +- "and either (a) not part of a [" +- "right-flanking delimiter run]\n or (b) part of a" +- "[right-flanking delimiter run]\n preceded by a [" - "Unicode punctuation character]." - "7." - "A double `**` [can close strong emphasis](@)" - "iff it is part of a [right-flanking delimiter run" - "]." - "8." -- "A double `__`" -- " [can close strong emphasis] iff\n it is part of a [" -- "right-flanking delimiter run]" -- "and either (a) not part of a [left-flanking" -- "delimiter run]\n or (b) part of a [" -- "left-flanking delimiter run]\n followed by a [" +- "A double `__` [can close strong emphasis] iff" +- "it is part of a [right-flanking delimiter run]" +- "and either (a) not part of a [" +- "left-flanking delimiter run]\n or (b) part of a" +- "[left-flanking delimiter run]\n followed by a [" - "Unicode punctuation character]." -- "9. Emphasis begins with a delimiter that [can open emphasis" -- "] and ends\n with a delimiter that [can close emphasis]" +- "9." 
+- "Emphasis begins with a delimiter that [can open emphasis]" +- " and ends\n with a delimiter that [can close emphasis]" - ", and that uses the same\n character (`_` or `*`" - ) as the opening delimiter. The - "opening and closing delimiters must belong to separate\n [delimiter runs" @@ -4582,8 +4594,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "open and close emphasis, then the sum of the lengths of the" - delimiter runs containing the opening and closing delimiters - "must not be a multiple of 3 unless both lengths are\n multiples of 3." -- "10. Strong emphasis begins with a delimiter that\n [can open strong emphasis" -- "] and ends with a delimiter that\n [can close strong emphasis]" +- "10." +- "Strong emphasis begins with a delimiter that\n [can open strong emphasis]" +- " and ends with a delimiter that\n [can close strong emphasis]" - ", and that uses the same character\n (`_` or `*`" - ) as the opening delimiter. The - "opening and closing delimiters must belong to separate\n [delimiter runs" @@ -4594,38 +4607,43 @@ input_file: tests/inputs/markdown/commonmark_spec.md - are multiples of 3. - "11." - "A literal `*` character cannot occur at the beginning or end of" -- "`*`-delimited emphasis or `**`-" -- "delimited strong emphasis, unless it\n is backslash-escaped." +- "`*`-delimited emphasis or `**`" +- "-delimited strong emphasis, unless it\n is backslash-escaped." - "12." - "A literal `_` character cannot occur at the beginning or end of" -- "`_`-delimited emphasis or `__`-" -- "delimited strong emphasis, unless it\n is backslash-escaped." +- "`_`-delimited emphasis or `__`" +- "-delimited strong emphasis, unless it\n is backslash-escaped." - "Where rules 1--12 above are compatible with multiple parsings," - "the following principles resolve ambiguity:" - "13." - "The number of nestings should be minimized. 
Thus, for example," -- "an interpretation `...` is always" -- preferred to +- "an interpretation `...`" +- is always preferred to - "`...`." - "14." -- "An interpretation `..." -- "` is always\n preferred to" +- An interpretation +- "`...` is always\n preferred to" - "`...`." -- "15. When two potential emphasis or strong emphasis spans overlap," +- "15." +- "When two potential emphasis or strong emphasis spans overlap," - so that the second begins before the first ends and ends after - "the first ends, the first takes precedence. Thus, for example," -- "`*foo _bar* baz_` is parsed as `" -- "foo _bar baz_` rather" -- "than `*foo bar* baz" -- "`." -- 16. When there are two potential emphasis or strong emphasis spans +- "`*foo _bar* baz_` is parsed as" +- "`foo _bar baz_`" +- " rather\n than" +- "`*foo bar* baz`" +- "." +- "16." +- When there are two potential emphasis or strong emphasis spans - "with the same closing delimiter, the shorter one (the one that" - "opens later) takes precedence. Thus, for example," -- "`**foo **bar baz**` is parsed" -- "as `**foo bar baz" -- "`\n rather than" +- "`**foo **bar baz**`" +- is parsed as +- "`**foo bar baz`" +- rather than - "`foo **bar baz`" - "." - "17." @@ -4646,8 +4664,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">" - "````````````````" - "````````````````" -- "This is not emphasis, because the opening `*`" -- " is followed by\nwhitespace, and hence not part of a [" +- "This is not emphasis, because the opening `*` is followed by" +- "whitespace, and hence not part of a [" - "left-flanking delimiter run]:" - "````````````````" - "````````````````" @@ -4656,8 +4674,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    a * foo bar*

    " - "````````````````" - "````````````````" -- "This is not emphasis, because the opening `*`" -- is preceded +- "This is not emphasis, because the opening `*` is preceded" - "by an alphanumeric and followed by punctuation, and hence" - "not part of a [left-flanking delimiter run]:" - "````````````````" @@ -4711,8 +4728,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">" - "````````````````" - "````````````````" -- "This is not emphasis, because the opening `_`" -- " is followed by\nwhitespace:" +- "This is not emphasis, because the opening `_` is followed by" +- "whitespace:" - "````````````````" - "````````````````" - example @@ -4720,8 +4737,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    _ foo bar_

    " - "````````````````" - "````````````````" -- "This is not emphasis, because the opening `_`" -- " is preceded\nby an alphanumeric and followed by punctuation:" +- "This is not emphasis, because the opening `_` is preceded" +- "by an alphanumeric and followed by punctuation:" - "````````````````" - "````````````````" - example @@ -4755,8 +4772,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - тся_

    - "````````````````" - "````````````````" -- "Here `_`" -- "does not generate emphasis, because the first delimiter run" +- "Here `_` does not generate emphasis, because the first delimiter run" - "is right-flanking and the second left-flanking:" - "````````````````" - "````````````````" @@ -4786,8 +4802,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "_foo*\n.\n

    _foo*

    " - "````````````````" - "````````````````" -- "This is not emphasis, because the closing `*`" -- " is preceded by\nwhitespace:" +- "This is not emphasis, because the closing `*` is preceded by" +- "whitespace:" - "````````````````" - "````````````````" - example @@ -4803,10 +4819,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*

    " - "````````````````" - "````````````````" -- "This is not emphasis, because the second `*`" -- " is\npreceded by punctuation and followed by an alphanumeric" -- "(hence it is not part of a [right-flanking delimiter" -- "run]:" +- "This is not emphasis, because the second `*` is" +- preceded by punctuation and followed by an alphanumeric +- "(hence it is not part of a [" +- "right-flanking delimiter run]:" - "````````````````" - "````````````````" - example @@ -4833,8 +4849,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "Rule 4:" -- "This is not emphasis, because the closing `_`" -- " is preceded by\nwhitespace:" +- "This is not emphasis, because the closing `_` is preceded by" +- "whitespace:" - "````````````````" - "````````````````" - example @@ -4842,8 +4858,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    _foo bar _

    " - "````````````````" - "````````````````" -- "This is not emphasis, because the second `_`" -- " is\npreceded by punctuation and followed by an alphanumeric:" +- "This is not emphasis, because the second `_` is" +- "preceded by punctuation and followed by an alphanumeric:" - "````````````````" - "````````````````" - example @@ -4915,8 +4931,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    ** foo bar**

    " - "````````````````" - "````````````````" -- "This is not strong emphasis, because the opening `**`" -- is preceded +- "This is not strong emphasis, because the opening `**` is preceded" - "by an alphanumeric and followed by punctuation, and hence" - "not part of a [left-flanking delimiter run]:" - "````````````````" @@ -4962,8 +4977,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - foo bar__

    - "````````````````" - "````````````````" -- "This is not strong emphasis, because the opening `__`" -- " is preceded\nby an alphanumeric and followed by punctuation:" +- "This is not strong emphasis, because the opening `__` is preceded" +- "by an alphanumeric and followed by punctuation:" - "````````````````" - "````````````````" - example @@ -5028,8 +5043,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "(Nor can it be interpreted as an emphasized `*foo bar *`" - ", because of\nRule 11.)" -- "This is not strong emphasis, because the second `**`" -- " is\npreceded by punctuation and followed by an alphanumeric:" +- "This is not strong emphasis, because the second `**` is" +- "preceded by punctuation and followed by an alphanumeric:" - "````````````````" - "````````````````" - example @@ -5087,8 +5102,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    __foo bar __

    " - "````````````````" - "````````````````" -- "This is not strong emphasis, because the second `__`" -- " is\npreceded by punctuation and followed by an alphanumeric:" +- "This is not strong emphasis, because the second `__` is" +- "preceded by punctuation and followed by an alphanumeric:" - "````````````````" - "````````````````" - example @@ -5214,7 +5229,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` markdown" - "

    foobar" - "baz\n```\n\n\nis precluded by the condition that a delimiter that" +- ">\n```" +- is precluded by the condition that a delimiter that - "can both open and close (like the `*` after `foo`" - ")\ncannot form emphasis if the sum of the lengths of" - the delimiter runs containing the opening and @@ -5257,8 +5273,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "When the lengths of the interior closing and opening\ndelimiter runs are" -- "*both*" -- " multiples of 3, though,\nthey can match to create emphasis:" +- "*both* multiples of 3, though," +- "they can match to create emphasis:" - "````````````````" - "````````````````" - example @@ -5482,8 +5498,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "Note that when delimiters do not match evenly, Rule 11 determines" -- "that the excess literal `*`" -- " characters will appear outside of the\nemphasis, rather than inside it:" +- "that the excess literal `*` characters will appear outside of the" +- "emphasis, rather than inside it:" - "````````````````" - "````````````````" - example @@ -5588,8 +5604,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "Note that when delimiters do not match evenly, Rule 12 determines" -- "that the excess literal `_`" -- " characters will appear outside of the\nemphasis, rather than inside it:" +- "that the excess literal `_` characters will appear outside of the" +- "emphasis, rather than inside it:" - "````````````````" - "````````````````" - example @@ -5822,41 +5838,47 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Links" -- "A link contains [link text] (the visible text), a [link" -- "destination]" +- "A link contains [link text] (the visible text), a [" +- "link destination]" - "(the URI that is the link destination), and optionally a [" - "link 
title].\nThere are two basic kinds of links in Markdown. In" - "[inline links] the" - "destination and title are given immediately after the link text. In\n[reference links]" - " the destination and title are defined elsewhere in\nthe document." -- "A [link text](@)" -- " consists of a sequence of zero or more\ninline elements enclosed by square brackets (" -- "`[` and `]`). The\nfollowing rules apply:" -- "- Links may not contain other links, at any level of nesting. If" +- "A [link text](@) consists of a sequence of zero or more" +- "inline elements enclosed by square brackets (`[` and `]`" +- "). The\nfollowing rules apply:" +- "-" +- "Links may not contain other links, at any level of nesting. If" - multiple otherwise valid link definitions appear nested inside each - "other, the inner-most definition is used." -- "- Brackets are allowed in the [link text]" +- "-" +- "Brackets are allowed in the [link text]" - only if (a) they - are backslash-escaped or (b) they appear as a matched pair of - "brackets,\n with an open bracket `[`" - ", a sequence of zero or more inlines, and\n a close bracket" - "`]`." -- "- Backtick [code spans], [autolinks], and raw" -- "[HTML tags] bind more tightly" +- "-" +- "Backtick [code spans], [autolinks], and raw [" +- "HTML tags] bind more tightly" - "than the brackets in link text. Thus, for example," -- "`` [foo`]` `` could not be a link text" -- ", since the second `]`\n is part of a code span." -- "- The brackets in link text bind more tightly than markers for\n [" -- "emphasis and strong emphasis]. Thus, for example," +- "`` [foo`]` ``" +- "could not be a link text, since the second `]`" +- is part of a code span. +- "-" +- "The brackets in link text bind more tightly than markers for\n [emphasis and strong emphasis" +- "]. Thus, for example," - "`*[foo*](url)` is a link." 
- "A [link destination](@) consists of either" - "-" - "a sequence of zero or more characters between an opening `<` and a\n closing" - "`>` that contains no line endings or unescaped\n `<` or" - "`>` characters, or" -- "- a nonempty sequence of characters that does not start with `<`" -- ",\n does not include [ASCII control characters][" -- "ASCII control character]\n or [space]" +- "-" +- "a nonempty sequence of characters that does not start with `<`," +- "does not include [ASCII control characters][ASCII control character" +- "]\n or [space]" - "character, and includes parentheses only if (a) they are" - backslash-escaped or (b) they are part of a balanced pair of - unescaped parentheses. @@ -5865,33 +5887,32 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A [link title](@) consists of either" - "-" - "a sequence of zero or more characters between straight double-quote\n characters (" -- "`\"`), including a `\"`" -- " character only if it is\n backslash-escaped, or" +- "`\"`), including a `\"` character only if it is" +- "backslash-escaped, or" - "-" - "a sequence of zero or more characters between straight single-quote\n characters (" -- "`'`), including a `'`" -- " character only if it is\n backslash-escaped, or" +- "`'`), including a `'` character only if it is" +- "backslash-escaped, or" - "-" - "a sequence of zero or more characters between matching parentheses\n (" -- "`(...)`), including a `(` or `" -- ")` character only if it is\n backslash-escaped." +- "`(...)`), including a `(` or" +- "`)` character only if it is\n backslash-escaped." - "Although [link titles] may span multiple lines, they may not contain\na [" - "blank line]." - "An [inline link](@) consists of a [link text]" -- "followed immediately\nby a left parenthesis `(`" -- ", an optional [link destination], an optional\n[link title]" -- ", and a right parenthesis `)`" -- "." 
+- " followed immediately\nby a left parenthesis `(`, an optional [link destination" +- "], an optional\n[link title], and a right parenthesis" +- "`)`." - "These four components may be separated by spaces, tabs, and up to one" - "line\nending.\nIf both [link destination] and [link title]" -- "are present, they *must*" -- " be\nseparated by spaces, tabs, and up to one line ending." +- "are present, they *must* be" +- "separated by spaces, tabs, and up to one line ending." - "The link's text consists of the inlines contained\nin the [link text" - "] (excluding the enclosing square brackets).\nThe link'" - "s URI consists of the link destination, excluding enclosing" -- "`<...>` if present, with backslash-escapes in" -- "effect as described\nabove. The link'" -- "s title consists of the link title, excluding its" +- "`<...>`" +- " if present, with backslash-escapes in effect as described\nabove. The link" +- "'s title consists of the link title, excluding its" - "enclosing delimiters, with backslash-escapes in effect" - "as described\nabove.\n\nHere is a simple inline link:" - "````````````````" @@ -6174,20 +6195,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - link

    - "````````````````" - "````````````````" -- "(Note: `Markdown.pl` did allow double quotes inside a" -- "double-quoted\ntitle, and its test suite included a test demonstrating this." +- "(Note: `Markdown.pl`" +- did allow double quotes inside a double-quoted +- "title, and its test suite included a test demonstrating this." - But it is hard to see a good rationale for the extra complexity this - "brings, since there are already many ways---backslash escaping," - "entity and numeric character references, or using a different" - quote type for the enclosing title---to write titles containing -- "double quotes. `Markdown.pl`" -- "'s handling of titles has a number" +- "double quotes. `Markdown.pl`'" +- s handling of titles has a number - "of other strange features. For example, it allows single-quoted" - "titles in inline links, but not reference links. And, in" - "reference links but not inline links, it allows a title to begin\nwith" - "`\"` and end with `)`." -- "`Markdown.pl`" -- 1.0.1 even allows +- "`Markdown.pl` 1.0.1 even allows" - "titles with no closing quotation mark, though 1.0.2b8" - "does not.\nIt seems preferable to adopt a simple, rational rule that works" - the same way in inline links and link reference definitions.) @@ -6315,8 +6336,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - foo *bar

    - "````````````````" - "````````````````" -- "Note that brackets that *aren't*" -- " part of links do not take\nprecedence:" +- "Note that brackets that *aren't* part of links do not take" +- "precedence:" - "````````````````" - "````````````````" - example @@ -6357,14 +6378,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "There are three kinds of [reference link](@)s:" -- "[full](#full-reference-link), [collapsed](" -- "#collapsed-reference-link),\nand" +- "[full](#full-reference-link)," +- "[collapsed](#collapsed-reference-link),\nand" - "[shortcut](#shortcut-reference-link)." -- "A [full reference link](@)" -- "consists of a [link text] immediately followed by a [link label]\nthat" -- "[matches] a [link reference definition] elsewhere in the document." -- "A [link label](@) begins with a left bracket (`[" -- "`) and ends\nwith the first right bracket (`]`" +- "A [full reference link](@)\nconsists of a [link text]" +- " immediately followed by a [link label]\nthat [matches] a [" +- "link reference definition] elsewhere in the document." +- "A [link label](@) begins with a left bracket (" +- "`[`) and ends\nwith the first right bracket (`]`" - ) that is not backslash-escaped. - "Between these brackets there must be at least one character that is not a space," - "tab, or line ending." @@ -6374,8 +6395,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "One label [matches](@)" - another just in case their normalized forms are equal. To normalize a - "label, strip off the opening and closing brackets,\nperform the" -- "*Unicode case fold*" -- ", strip leading and trailing" +- "*Unicode case fold*, strip leading and trailing" - "spaces, tabs, and line endings, and collapse consecutive internal" - "spaces, tabs, and line endings to a single space." 
- "If there are multiple\nmatching reference link definitions, the one that comes first in the" @@ -6574,12 +6594,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "]. If whitespace is allowed between the" - "link text and the link label, then in the following we will have" - "a single reference link, not two shortcut reference links, as\nintended:" -- "``` markdown\n[foo]\n[bar]" -- "[foo]: /url1" -- "[bar]: /url2\n```\n\n(Note that [" -- "shortcut reference links] were introduced by Gruber" -- "himself in a beta version of `Markdown.pl`, but never" -- "included\nin the official syntax description. Without shortcut reference" +- "``` markdown" +- "[foo]\n[bar]\n\n[foo]: /url1" +- "[bar]: /url2\n```" +- "(Note that [shortcut reference links] were introduced by Gruber" +- "himself in a beta version of `Markdown.pl`" +- ", but never included\nin the official syntax description. Without shortcut reference" - "links, it is harmless to allow space between the link text and" - "link label; but once shortcut references are introduced, it is" - "too dangerous to allow this, as it frequently leads to" @@ -6655,8 +6675,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\\

    " - "````````````````" - "````````````````" -- "A [link label] must contain at least one character that is not a space" -- ", tab, or\nline ending:" +- "A [link label]" +- "must contain at least one character that is not a space, tab, or" +- "line ending:" - "````````````````" - "````````````````" - example @@ -6673,14 +6694,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "]: /uri

    " - "````````````````" - "````````````````" -- "A [collapsed reference link](@)" -- "consists of a [link label] that [matches] a\n[link reference definition" -- "] elsewhere in the\ndocument, followed by the string `[]`" -- ".\nThe contents of the link label are parsed as inlines," +- "A [collapsed reference link](@)\nconsists of a [link label]" +- " that [matches] a\n[link reference definition] elsewhere in the" +- "document, followed by the string `[]`." +- "The contents of the link label are parsed as inlines," - "which are used as the link's text. The link'" - "s URI and title are\nprovided by the matching reference link definition. Thus," -- "`[foo][]` is equivalent to `[foo]" -- "[foo]`." +- "`[foo][]` is equivalent to" +- "`[foo][foo]`." - "````````````````" - "````````````````" - example @@ -6721,15 +6742,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\"title\">foo\n[]

    " - "````````````````" - "````````````````" -- "A [shortcut reference link](@)" -- "consists of a [link label] that [matches] a\n[link reference definition" -- "] elsewhere in the\ndocument and is not followed by `[]`" -- or a link label. +- "A [shortcut reference link](@)\nconsists of a [link label" +- "] that [matches] a\n[link reference definition] elsewhere in the" +- "document and is not followed by `[]` or a link label." - "The contents of the link label are parsed as inlines," - "which are used as the link's text. The link'" - "s URI and title\nare provided by the matching link reference definition.\nThus," -- "`[foo]` is equivalent to `[foo][]" -- "`." +- "`[foo]` is equivalent to" +- "`[foo][]`." - "````````````````" - "````````````````" - example @@ -6839,8 +6859,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - foo(not a link)

    - "````````````````" - "````````````````" -- "In the following case `[bar][baz]` is parsed" -- "as a reference,\n`[foo]` as normal text:" +- "In the following case `[bar][baz]`" +- " is parsed as a reference,\n`[foo]`" +- "as normal text:" - "````````````````" - "````````````````" - example @@ -6850,8 +6871,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "url\">bar

    " - "````````````````" - "````````````````" -- "Here, though, `[foo][bar]` is parsed" -- "as a reference, since\n`[bar]` is defined:" +- "Here, though, `[foo][bar]`" +- " is parsed as a reference, since\n`[bar]` is defined:" - "````````````````" - "````````````````" - example @@ -6863,9 +6884,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "url1\">baz

    " - "````````````````" - "````````````````" -- "Here `[foo]` is not parsed as a shortcut reference" -- ", because it\nis followed by a link label (even though" -- "`[bar]` is not defined):" +- "Here `[foo]`" +- "is not parsed as a shortcut reference, because it" +- "is followed by a link label (even though `[bar]`" +- "is not defined):" - "````````````````" - "````````````````" - example @@ -6879,11 +6901,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Images" - "Syntax for images is like the syntax for links, with one" - "difference. Instead of [link text], we have an" -- "[image description](@)" -- ". The rules for this are the\nsame as for [link text]" -- ", except that (a) an\nimage description starts with `![`" -- "rather than `[`" -- ", and\n(b) an image description may contain links." +- "[image description](@). The rules for this are the" +- "same as for [link text], except that (a) an" +- "image description starts with `![` rather than `[`, and" +- (b) an image description may contain links. - An image description has inline elements - "as its contents. When an image is rendered to HTML," - "this is standardly used as the image's `alt` attribute." @@ -6928,8 +6949,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Though this spec is concerned with parsing, not rendering, it is" - "recommended that in rendering to HTML, only the plain string content\nof the" - "[image description] be used. Note that in" -- "the above example, the alt attribute's value is `foo bar" -- "`, not `foo\n[bar](/url)` or" +- "the above example, the alt attribute's value is" +- "`foo bar`, not" +- "`foo\n[bar](/url)` or" - "`foo bar" - "`. Only the plain string" - "content is rendered, without formatting." 
@@ -7114,28 +7136,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Autolinks" -- "[Autolink](@)s are absolute URIs and email addresses" -- "inside\n`<` and `>`" +- "[Autolink](@)" +- "s are absolute URIs and email addresses inside\n`<` and" +- "`>`" - ". They are parsed as links, with the URL or email address" - as the link label. -- "A [URI autolink](@) consists of `<`," -- "followed by an\n[absolute URI] followed by `>`" +- "A [URI autolink](@) consists of `<`" +- ", followed by an\n[absolute URI] followed by `>`" - ". It is parsed as" -- "a link to the URI, with the URI as the link's" -- label. -- "An [absolute URI](@)" -- ",\nfor these purposes, consists of a [scheme]" -- "followed by a colon (`:`" -- ")\nfollowed by zero or more characters other than [ASCII control\ncharacters]" -- "[ASCII control character], [space], `<`," -- "and `>`" -- ".\nIf the URI includes these characters, they must be percent-encoded" +- "a link to the URI, with the URI as the link'" +- s label. +- "An [absolute URI](@)," +- "for these purposes, consists of a [scheme] followed by a colon (" +- "`:`)\nfollowed by zero or more characters other than [" +- "ASCII control\ncharacters][ASCII control character], [space" +- "], `<`, and `>`." +- "If the URI includes these characters, they must be percent-encoded" - "(e.g. `%20` for a space)." -- "For purposes of this spec, a [scheme](@) is any" -- "sequence\nof 2--32 characters beginning with an ASCII letter and followed" +- "For purposes of this spec, a [scheme](@)" +- " is any sequence\nof 2--" +- 32 characters beginning with an ASCII letter and followed - "by any combination of ASCII letters, digits, or the symbols plus\n(" -- "\"+\"), period (\".\"), or" -- "hyphen (\"-\")." +- "\"+\"), period (\".\"" +- "), or hyphen (\"-\")." 
- "Here are some valid autolinks:" - "````````````````" - "````````````````" @@ -7245,8 +7268,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ", followed by an [email address],\nfollowed by `>`" - ". The link's label is the email address,\nand the URL is" - "`mailto:` followed by the email address." -- "An [email address](@)" -- ",\nfor these purposes, is anything that matches\nthe" +- "An [email address](@)," +- "for these purposes, is anything that matches\nthe" - "[non-normative regex from the HTML5\nspec" - "](https://html.spec.whatwg.org" - "/multipage/forms.html#e-mail-state-(type" @@ -7337,57 +7360,57 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Raw HTML" -- "Text between `<` and `>` that looks like an HTML" -- tag is parsed as a +- "Text between `<` and `>`" +- that looks like an HTML tag is parsed as a - raw HTML tag and will be rendered in HTML without escaping. - "Tag and attribute names are not limited to current HTML tags," - "so custom tags (and even, say, DocBook tags) may be" - "used.\n\nHere is the grammar for tags:" -- "A [tag name](@)" -- consists of an ASCII letter +- "A [tag name](@) consists of an ASCII letter" - "followed by zero or more ASCII letters, digits, or" - "hyphens (`-`)." -- "An [attribute](@) consists of spaces, tabs, and up" -- "to one line ending,\nan [attribute name], and an optional\n[" -- "attribute value specification]." +- "An [attribute](@)" +- " consists of spaces, tabs, and up to one line ending,\nan [" +- "attribute name], and an optional\n[attribute value specification]." - "An [attribute name](@)\nconsists of an ASCII letter," -- "`_`, or `:`, followed by zero or more" -- "ASCII\nletters, digits, `_`, `.`," -- "`:`, or `-`" +- "`_`, or `:`" +- ", followed by zero or more ASCII\nletters, digits, `_`" +- ", `.`, `:`, or `-`" - ". (Note: This is the XML" - specification restricted to ASCII. - HTML5 is laxer.) 
- "An [attribute value specification](@)" - "consists of optional spaces, tabs, and up to one line ending,\na" -- "`=` character, optional spaces, tabs, and up to one line" -- "ending,\nand an [attribute value]." -- "An [attribute value](@)" -- "consists of an [unquoted attribute value],\na [" -- "single-quoted attribute value], or a [double-quoted attribute value]" -- "." +- "`=`" +- "character, optional spaces, tabs, and up to one line ending," +- "and an [attribute value]." +- "An [attribute value](@)\nconsists of an [" +- "unquoted attribute value],\na [single-quoted attribute value]" +- ", or a [double-quoted attribute value]." - "An [unquoted attribute value](@)" - is a nonempty string of characters not - "including spaces, tabs, line endings, `\"`, `'`" -- ", `=`, `<`, `>`, or ``" -- "` ``." +- ", `=`, `<`, `>`, or" +- "`` ` ``." - "A [single-quoted attribute value](@)\nconsists of `'`" - ", zero or more\ncharacters not including `'`, and a final" - "`'`." - "A [double-quoted attribute value](@)\nconsists of `\"`" - ", zero or more\ncharacters not including `\"`, and a final" - "`\"`." -- "An [open tag](@) consists of a `<` character," -- "a [tag name],\nzero or more [attributes]" +- "An [open tag](@) consists of a `<`" +- " character, a [tag name],\nzero or more [attributes]" - ", optional spaces, tabs, and up to one line ending,\nan optional" - "`/` character, and a `>` character." - "A [closing tag](@) consists of the string ``." -- "An [HTML comment](@) consists of ``, ``, or `<" -- "!--`, a string of\ncharacters not including the string" -- "`-->`, and `-->` (see the" +- "An [HTML comment](@) consists of" +- "``, ``" +- ", or ``, and" +- "`-->` (see the" - "[HTML spec](https://" - html.spec.whatwg.org/multipage/ - "parsing.html#markup-declaration-open-state))." 
@@ -7603,9 +7626,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - A line ending (not in a code span or HTML tag) that - is preceded - by two or more spaces and does not occur at the end of a block -- "is parsed as a [hard line break](@)" -- " (rendered\nin HTML as a `
    `" -- "tag):" +- "is parsed as a [hard line break](@) (rendered" +- "in HTML as a `
    ` tag):" - "````````````````" - "````````````````" - example @@ -7778,7 +7800,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - In this appendix we describe some features of the parsing strategy - used in the CommonMark reference implementations. - "## Overview\n\nParsing has two phases:" -- "1. In the first phase, lines of input are consumed and the block" +- "1." +- "In the first phase, lines of input are consumed and the block" - "structure of the document---its division into paragraphs, block quotes," - "list items, and so on---" - is constructed. Text is assigned to these @@ -7792,17 +7815,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "At each point in processing, the document is represented as a tree of" - "**blocks**. The root of the tree is a `document`" - " block. The `document`\nmay have any number of other blocks as" -- "**children**" -- ". These children" +- "**children**. These children" - "may, in turn, have other blocks as children." - "The last child of a block\nis normally considered **open**" - ", meaning that subsequent lines of input" - can alter its contents. (Blocks that are not open are -- "**closed**" -- ".)" +- "**closed**.)" - "Here, for example, is a possible document tree, with the open blocks" - "marked by arrows:" -- "``` tree\n-> document\n -> block_quote\n paragraph" +- "``` tree" +- "-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - "-> list (type=bullet tight=true bullet_char=-" - ")\n list_item\n paragraph" @@ -7821,25 +7843,27 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Once a line has been incorporated into the tree in this way," - "it can be discarded, so input can be read in a stream." - "For each line, we follow this procedure:" -- "1. First we iterate through the open blocks, starting with the" +- "1." 
+- "First we iterate through the open blocks, starting with the" - "root document, and descending through last children down to the last" - open block. Each block imposes a condition that the line must satisfy - "if the block is to remain open. For example, a block quote requires a" -- "`>`" -- character. A paragraph requires a non-blank line. +- "`>` character. A paragraph requires a non-blank line." - In this phase we may match all or just some of the open - blocks. - "But we cannot close unmatched blocks yet, because we may have a\n[" - "lazy continuation line]." -- "2. Next, after consuming the continuation markers for existing" +- "2." +- "Next, after consuming the continuation markers for existing" - "blocks, we look for new block starts (e.g. `>`" - for a block quote). - "If we encounter a new block start, we close any blocks unmatched" - in step 1 before creating the new block as a child of the last - matched container block. -- "3. Finally, we look at the remainder of the line (after block" -- "markers like `>`, list markers, and indentation have been consumed" -- ").\nThis is text that can be incorporated into the last open" +- "3." +- "Finally, we look at the remainder of the line (after block\nmarkers like" +- "`>`, list markers, and indentation have been consumed)." +- This is text that can be incorporated into the last open - "block (a paragraph, code block, heading, or raw HTML)" - "." - Setext headings are formed when we see a line of a paragraph @@ -7849,37 +7873,38 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "one or more reference link definitions. Any remainder becomes a\nnormal paragraph." - We can see how this works by considering how the tree above is - "generated by four lines of Markdown:" -- "``` markdown\n> Lorem ipsum dolor" -- sit amet. +- "``` markdown" +- "> Lorem ipsum dolor\nsit amet." 
- "> - Qui *quodsi iracundia*" - "> - aliquando id\n```" - "At the outset, our document model is just" - "``` tree\n-> document\n```" - "The first line of our text," - "``` markdown\n> Lorem ipsum dolor\n```" -- "causes a `block_quote`" -- " block to be created as a child of our\nopen `document`" -- " block, and a `paragraph` block as a child of\nthe" -- "`block_quote`" -- ". Then the text is added to the last open\nblock, the `paragraph`" -- ":" -- "``` tree\n-> document\n -> block_quote\n -> paragraph" +- "causes a `block_quote` block to be created as a child of our" +- "open `document` block, and a `paragraph` block as a child of" +- "the `block_quote`. Then the text is added to the last open" +- "block, the `paragraph`:" +- "``` tree" +- "-> document\n -> block_quote\n -> paragraph" - " \"Lorem ipsum dolor\"\n```\n\nThe next line," - "``` markdown\nsit amet.\n```" - "is a \"lazy continuation\" of the open `paragraph`" - ", so it gets added\nto the paragraph's text:" -- "``` tree\n-> document\n -> block_quote\n -> paragraph" +- "``` tree" +- "-> document\n -> block_quote\n -> paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - "```\n\nThe third line," - "``` markdown" - "> - Qui *quodsi iracundia*" - "```" - "causes the `paragraph` block to be closed, and a new `list`" -- "block\nopened as a child of the `block_quote`. A" +- " block\nopened as a child of the `block_quote`. A" - "`list_item` is also\nadded as a child of the `list`" - ", and a `paragraph` as a child of\nthe `list_item`" - ". The text is then added to the new `paragraph`:" -- "``` tree\n-> document\n -> block_quote\n paragraph" +- "``` tree" +- "-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - "-> list (type=bullet tight=true bullet_char=-" - ")\n -> list_item\n -> paragraph" @@ -7891,7 +7916,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "opened up as child of the `list`. 
A `paragraph`" - "is added as a child of the new `list_item`" - ", to contain the text.\nWe thus obtain the final tree:" -- "``` tree\n-> document\n -> block_quote\n paragraph" +- "``` tree" +- "-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - "-> list (type=bullet tight=true bullet_char=-" - ")\n list_item\n paragraph" @@ -7904,15 +7930,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - string contents of paragraphs and headings as inlines. At this - "point we have seen all the link reference definitions, so we can" - resolve reference links as we go. -- "``` tree\ndocument\n block_quote\n paragraph" -- " str \"Lorem ipsum dolor\"\n softbreak" -- "str \"sit amet.\"" +- "``` tree" +- "document\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"" +- " softbreak\n str \"sit amet.\"" - list (type=bullet tight=true bullet_char=-) - " list_item\n paragraph\n str \"Qui \"\n emph" - " str \"quodsi iracundia\"\n list_item\n paragraph" -- " str \"aliquando id\"\n```\n\nNotice how the" -- "[line ending] in the first paragraph has\nbeen parsed as a" -- "`softbreak`" +- " str \"aliquando id\"\n```" +- "Notice how the [line ending] in the first paragraph has" +- "been parsed as a `softbreak`" - ", and the asterisks in the first list item\nhave become an" - "`emph`." - "### An algorithm for parsing nested emphasis and links" @@ -7922,20 +7948,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- a run of `*` or `_` characters, or" - "- a `[` or `![`" - "we insert a text node with these symbols as its literal content, and we" -- "add a pointer to this text node to the [delimiter stack]" -- (@). +- add a pointer to this text node to the +- "[delimiter stack](@)." - "The [delimiter stack] is a doubly linked list. 
Each" - "element contains a pointer to a text node, plus information about" -- "- the type of delimiter (`[`, `![" -- "`, `*`, `_`)" +- "- the type of delimiter (`[`, `![`" +- ", `*`, `_`)" - "- the number of delimiters," -- "- whether the delimiter is \"active\" (all are active to start" -- "), and" +- "- whether the delimiter is \"active\"" +- "(all are active to start), and" - "- whether the delimiter is a potential opener, a potential closer," - or both (which depends on what sort of characters precede - and follow the delimiters). -- "When we hit a `]` character, we call the *look for link" -- "or image*\nprocedure (see below)." +- "When we hit a `]` character, we call the" +- "*look for link or image*\nprocedure (see below)." - "When we hit the end of the input, we call the *process emphasis*" - "procedure (see below), with `stack_bottom`" - "= NULL." @@ -7945,57 +7971,68 @@ input_file: tests/inputs/markdown/commonmark_spec.md - delimiter. - "- If we don't find one, we return a literal text node `" - "]`." -- "- If we do find one, but it's not *active*," -- we remove the inactive +- "-" +- "If we do find one, but it's not *active*" +- ", we remove the inactive" - "delimiter from the stack, and return a literal text node `]`" - "." -- "- If we find one and it's active, then we parse ahead" -- to see if +- "-" +- "If we find one and it'" +- "s active, then we parse ahead to see if" - "we have an inline link/image, reference link/image, collapsed reference" - "link/image, or shortcut reference link/image." -- "+ If we don'" -- "t, then we remove the opening delimiter from the" +- + +- "If we don't, then we remove the opening delimiter from the" - "delimiter stack and return a literal text node `]`." - "+ If we do, then" -- "* We return a link or image node whose children are the inlines" +- "*" +- We return a link or image node whose children are the inlines - after the text node pointed to by the opening delimiter. 
-- "* We run *process emphasis* on these inlines, with the `[" -- "` opener\n as `stack_bottom`." +- "*" +- "We run *process emphasis* on these inlines, with the `[`" +- " opener\n as `stack_bottom`." - "* We remove the opening delimiter." -- "* If we have a link (and not an image), we also set" -- "all\n `[` delimiters before the opening delimiter to" +- "*" +- "If we have a link (and not an image), we also set all" +- "`[` delimiters before the opening delimiter to" - "*inactive*. (This\n will prevent us from getting links within links.)" - "#### *process emphasis*" -- "Parameter `stack_bottom`" -- " sets a lower bound to how far we\ndescend in the [delimiter stack" -- "]. If it is NULL, we can" +- "Parameter `stack_bottom` sets a lower bound to how far we" +- "descend in the [delimiter stack]" +- ". If it is NULL, we can" - "go all the way to the bottom. Otherwise, we stop before\nvisiting" - "`stack_bottom`." -- "Let `current_position` point to the element on the [delimiter" -- "stack]\njust above `stack_bottom` (or the first element if" -- "`stack_bottom`\nis NULL)." -- "We keep track of the `openers_bottom`" -- " for each delimiter\ntype (`*`, `_`" +- "Let `current_position` point to the element on the [" +- "delimiter stack]\njust above `stack_bottom`" +- "(or the first element if `stack_bottom`" +- is NULL). +- "We keep track of the `openers_bottom` for each delimiter" +- "type (`*`, `_`" - "), indexed to the length of the closing delimiter run" - (modulo 3) and to whether the closing delimiter can also - "be an\nopener. Initialize this to `stack_bottom`." 
- "Then we repeat the following until we run out of potential\nclosers:" -- "- Move `current_position` forward in the delimiter stack (if" -- "needed)\n until we find the first potential closer with delimiter `*`" -- "or `_`" -- ".\n (This will be the potential closer closest\n to the beginning of the input" -- "-- the first one in parse order.)" - "-" -- "Now, look back in the stack (staying above `stack_bottom`" -- " and\n the `openers_bottom`" -- " for this delimiter type) for the\n first matching potential opener (\"matching" -- "\" means same delimiter).\n\n- If one is found:" -- "+ Figure out whether we have emphasis or strong emphasis:" +- "Move `current_position`" +- forward in the delimiter stack (if needed) +- "until we find the first potential closer with delimiter `*` or" +- "`_`.\n (This will be the potential closer closest" +- to the beginning of the input -- +- the first one in parse order.) +- "-" +- "Now, look back in the stack (staying above `stack_bottom` and" +- "the `openers_bottom` for this delimiter type) for the" +- "first matching potential opener (\"matching\" means same delimiter)." +- "- If one is found:" +- + +- "Figure out whether we have emphasis or strong emphasis:" - "if both closer and opener spans have length >= 2, we have" - "strong, otherwise regular." -- "+ Insert an emph or strong emph node accordingly, after" +- + +- "Insert an emph or strong emph node accordingly, after" - the text node corresponding to the opener. -- + Remove any delimiters between the opener and closer from +- + +- Remove any delimiters between the opener and closer from - the delimiter stack. 
- + - Remove 1 (for regular emph) or 2 (for strong @@ -8011,10 +8048,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - (We know that there are no openers for this kind of closer up to - "and\n including this point, so this puts a lower bound on future searches.)" - + -- "If the closer at `current_position`" -- "is not a potential opener," +- "If the closer at `current_position` is not a potential opener," - "remove it from the delimiter stack (since we know it can't" - be a closer either). - "+ Advance `current_position` to the next element in the stack." -- "After we're done, we remove all delimiters above `" -- "stack_bottom` from the\ndelimiter stack." +- "After we're done, we remove all delimiters above" +- "`stack_bottom` from the\ndelimiter stack." diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md-2.snap index 9e03da0..b5da2d9 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md-2.snap @@ -6,30 +6,35 @@ input_file: tests/inputs/markdown/github_flavored.md - "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------" - "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. 
~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------" - "# Lists" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback" +- "```" +- "1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. 
Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback" - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item." - "⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. 
Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback" - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------" - "# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------" - "# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------" - "# Links" -- "```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive 
reference text]: https://www.mozilla.org\n[1]: http://slashdot.org" -- "[link text itself]: http://www.reddit.com\n```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later." +- "```" +- "[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com" +- "```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets 
will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later." - "[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------" - "# Images" -- "```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:" +- "```" +- "Here's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:" - "[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: 
https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax" - "![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n\n# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------" - "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```" - "```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```" - "```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* 
handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```" -- "```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;" +- "```php" +- "require_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;" - " $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . 
URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------" - "# Tables" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |" -- "| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns." -- "| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |" -- "| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------" +- "```" +- "Colons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |" +- "| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |" +- "There must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |" +- "| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------" - "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote." - "> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n\n# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n```\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n------" - "# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap index 1a85b9e..cbdd02a 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap @@ -4,9 +4,9 @@ expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- - "# Headers" -- "```\n# h1 Heading 8-)" -- "## h2 Heading\n### h3 Heading" -- "#### h4 Heading" +- "```" +- "# h1 Heading 8-)\n## h2 Heading" +- "### h3 Heading\n#### h4 Heading" - "##### h5 Heading" - "###### h6 Heading" - "Alternatively, for H1 and H2, an underline-ish style" @@ -38,15 +38,16 @@ input_file: tests/inputs/markdown/github_flavored.md - _underscores_. - "Strong emphasis, aka bold, with **asterisks** or" - __underscores__. -- Combined emphasis with **asterisks and _underscores_* -- "*." -- Strikethrough uses two tildes. ~~Scratch this. -- "~~\n\n**This is bold text**" -- "__This is bold text__\n\n*This is italic text*" -- "_This is italic text_\n\n~~Strikethrough~~" -- "------" +- Combined emphasis with +- "**asterisks and _underscores_**." +- Strikethrough uses two tildes. +- ~~Scratch this.~~ +- "**This is bold text**\n\n__This is bold text__" +- "*This is italic text*\n\n_This is italic text_" +- "~~Strikethrough~~\n\n------" - "# Lists" -- "```\n1. First ordered list item\n2. Another item" +- "```" +- "1. First ordered list item\n2. Another item" - "⋅⋅* Unordered sub-list.\n1." - "Actual numbers don't matter, just that it's a number" - "⋅⋅1. Ordered sub-list\n4. And another item." 
@@ -83,8 +84,8 @@ input_file: tests/inputs/markdown/github_flavored.md - ⋅⋅⋅You can have properly indented paragraphs within list items - "." - "Notice the blank line above, and the leading spaces (at least one," -- "but we'll use three here to also align the raw Markdown)" -- "." +- "but we'" +- ll use three here to also align the raw Markdown). - "⋅⋅⋅To have a line break without a paragraph, you will need" - to use two trailing spaces.⋅⋅ - "⋅⋅⋅Note that this line is separate, but within the same paragraph" @@ -93,10 +94,12 @@ input_file: tests/inputs/markdown/github_flavored.md - where trailing spaces are not required.) - "* Unordered list can use asterisks\n- Or minuses" - + Or pluses -- "1. Make my changes\n 1. Fix bug\n 2." -- "Improve formatting\n - Make the headings bigger" +- 1. Make my changes +- 1. Fix bug +- " 2. Improve formatting\n - Make the headings bigger" - 2. Push my commits to GitHub -- "3. Open a pull request\n * Describe my changes" +- 3. Open a pull request +- "* Describe my changes" - " * Mention all the members of my team\n * Ask for feedback" - "+ Create a list by starting a line with `+`, `-" - "`, or `*`" @@ -107,7 +110,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "- Nulla volutpat aliquam velit" - "+ Very easy!\n\n------" - "# Task lists" -- "```\n- [x] Finish my changes" +- "```" +- "- [x] Finish my changes" - "- [ ] Push my commits to GitHub" - "- [ ] Open a pull request" - "- [x] @mentions, #refs, [links]()" @@ -119,12 +123,14 @@ input_file: tests/inputs/markdown/github_flavored.md - "- [x] Finish my changes" - "- [ ] Push my commits to GitHub" - "- [ ] Open a pull request" -- "- [x] @mentions, #refs, [links]()" -- ", **formatting**, and tags supported" -- "- [x] list syntax required (any unordered or ordered list supported" -- ")\n- [ ] this is a complete item" -- "- [ ] this is an incomplete item\n\n------" +- "-" +- "[x] @mentions, #refs, [links]()," +- "**formatting**, and tags" +- supported +- "-" +- "[x] list 
syntax required (any unordered or ordered list supported)" +- "- [ ] this is a complete item\n- [ ] this is an incomplete item" +- "------" - "# Ignoring Markdown formatting" - You can tell GitHub to ignore (or escape) Markdown - "formatting by using \\ before the Markdown character." @@ -170,8 +176,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "Or leave it empty and use the [link text itself]." - URLs and URLs in angle brackets will automatically get turned into links - ".\nhttp://www.example.com or" -- "" -- and sometimes +- " and sometimes" - "example.com (but not on Github, for example)." - Some text to show that the reference links can follow later. - "[arbitrary case-insensitive reference text]: https://" @@ -206,31 +211,35 @@ input_file: tests/inputs/markdown/github_flavored.md - "Here's our logo (hover to see the title text):" - "Inline-style:" - "![" -- "alt text](https://github.com/" -- adam-p/markdown-here/raw/master/src -- "/common/images/icon48.png \"Logo Title Text 1" -- "\")\n\nReference-style:\n![alt text][logo]" +- alt text +- "](https://github.com/adam-p" +- /markdown-here/raw/master/src/common/images +- "/icon48.png \"Logo Title Text 1\")" +- "Reference-style:\n![alt text][logo]" - "[logo]: https://github.com/adam" - "-p/markdown-here/raw/master/src/common" - "/images/icon48.png \"Logo Title Text 2\"" - "![" -- "Minion](https://" -- octodex.github.com/images/ -- minion.png) +- Minion +- "](https://octodex.github.com" +- /images/minion.png) - "![" -- "Stormtroopocat](https://" -- octodex.github.com/images/ -- "stormtroopocat.jpg \"The Stormtroopocat" -- "\")\n\nLike links, Images also have a footnote style syntax" +- Stormtroopocat +- "](https://octodex.github.com" +- "/images/stormtroopocat.jpg \"The" +- "Stormtroopocat\")" +- "Like links, Images also have a footnote style syntax" - "![Alt text][id]" - "With a reference later in the document defining the URL location:" - "[id]: https://" - octodex.github.com/images/ - "dojocat.jpg 
\"The Dojocat\"" - "------" -- "# [Footnotes](https://github.com/" +- "#" +- "[Footnotes](https://github.com/" - markdown-it/markdown-it-footnote) -- "```\nFootnote 1 link[^first]." +- "```" +- "Footnote 1 link[^first]." - "Footnote 2 link[^second]." - "Inline footnote^[Text of inline footnote] definition." - "Duplicated footnote reference[^second]." @@ -248,7 +257,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "Inline `code` has `back-ticks around` it." - "```" - "Inline `code` has `back-ticks around` it." -- "```c#\nusing System.IO.Compression;" +- "```c#" +- using System.IO.Compression; - "#pragma warning disable 414, 3021" - "namespace MyApplication\n{" - "[Obsolete(\"...\")]" @@ -259,7 +269,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "\");" - "return new List(new int[] { 1," - "2, 3 })\n }\n }\n}\n```" -- "```css\n@font-face {" +- "```css" +- "@font-face {" - "font-family: Chunkfive; src: url('" - "Chunkfive.otf');\n}" - "body, .usertext {" @@ -310,7 +321,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "__halt_compiler () ; datahere\ndatahere\ndatahere */" - "datahere\n```\n\n------" - "# Tables" -- "```\nColons can be used to align columns." +- "```" +- Colons can be used to align columns. - "| Tables | Are | Cool |" - "| ------------- |:" - "-------------:| -" @@ -342,7 +354,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "| Name | Character |\n| --- | --- |" - "| Backtick | ` |\n| Pipe | \\| |\n```" - Colons can be used to align columns. -- "| Tables | Are | Cool |" +- "| Tables | Are | Cool" +- "|" - "| ------------- |:" - "-------------:| -" - "----:|" @@ -350,15 +363,16 @@ input_file: tests/inputs/markdown/github_flavored.md - "| col 2 is | centered | $12 |" - "| zebra stripes | are neat | $1 |" - There must be at least 3 dashes separating each header cell. 
-- "The outer pipes (|) are optional, and you don't need to" -- make the +- "The outer pipes (|) are optional, and you don'" +- t need to make the - raw Markdown line up prettily. - You can also use inline Markdown. - Markdown | Less | Pretty - "--- | --- | ---" - "*Still* | `renders` | **nicely**" - 1 | 2 | 3 -- "| First Header | Second Header |" +- "| First Header | Second Header" +- "|" - "| ------------- | -" - "------------ |" - "| Content Cell | Content Cell |\n| Content Cell | Content Cell |" @@ -367,9 +381,10 @@ input_file: tests/inputs/markdown/github_flavored.md - "| git diff | Show file differences that haven't been staged |" - "| Command | Description |\n| --- | --- |" - "| `git status` | List all *new or modified* files |" -- "| `git diff` | Show file differences that **" -- "haven't been** staged |" -- "| Left-aligned | Center-aligned | Right-aligned |" +- "| `git diff` |" +- "Show file differences that **haven't been** staged |" +- "| Left-aligned | Center-aligned | Right-aligned" +- "|" - "| :--- | :---: | ---:" - "|\n| git status | git status | git status |" - "| git diff | git diff | git diff |" @@ -390,15 +405,18 @@ input_file: tests/inputs/markdown/github_flavored.md - ">> ...by using additional greater-than signs right next to each" - "other...\n> > > ...or with spaces between arrows." - "```" -- "> Blockquotes are very handy in email to emulate reply text." -- "> This line is part of the same quote.\n\nQuote break." -- "> This is a very long line that will still be quoted properly when it wraps" -- ". Oh boy let'" +- ">" +- "Blockquotes are very handy in email to emulate reply text.\n>" +- "This line is part of the same quote.\n\nQuote break." +- ">" +- This is a very long line that will still be quoted properly when it wraps. +- "Oh boy let'" - s keep writing to make sure this is long enough to actually wrap for everyone. -- "Oh, you can *put* **Markdown** into a" -- blockquote. +- "Oh, you can *put* **Markdown**" +- into a blockquote. 
- "> Blockquotes can also be nested...\n>" -- "> ...by using additional greater-than signs right next to each other" +- ">" +- "...by using additional greater-than signs right next to each other" - "...\n> > > ...or with spaces between arrows." - "------" - "# Inline HTML" @@ -456,7 +474,8 @@ input_file: tests/inputs/markdown/github_flavored.md - www.youtube.com/watch? - v=YOUTUBE_VIDEO_ID_HERE) - "```" -- "[![IMAGE ALT TEXT HERE" +- "[![" +- IMAGE ALT TEXT HERE - "](https://upload.wikimedia.org/" - wikipedia/commons/thumb/e/ef/ - YouTube_logo_2015.svg/1200px- diff --git a/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md-2.snap b/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md-2.snap index b5aae28..c48a1a8 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md-2.snap @@ -8,17 +8,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "# Introduction\n\n" - "## What is Markdown?\n\n" - "Markdown is a plain text format for writing structured documents,\nbased on conventions for indicating formatting in email\nand usenet posts. It was developed by John Gruber (with\nhelp from Aaron Swartz) and released in 2004 in the form of a\n" -- "[syntax description](https://daringfireball.net/projects/markdown/syntax)\nand a Perl script (`Markdown.pl`" -- ") for converting Markdown to\nHTML. In the next decade, dozens of implementations were\ndeveloped in many languages. Some extended the original\nMarkdown syntax with conventions for footnotes, tables, and\n" -- "other document elements. Some allowed Markdown documents to be\nrendered in formats other than HTML. 
Websites like Reddit,\nStackOverflow, and GitHub had millions of people using Markdown.\nAnd Markdown started to be used beyond the web, to author books,\n" -- "articles, slide shows, letters, and lecture notes.\n\nWhat distinguishes Markdown from many other lightweight markup\nsyntaxes, which are often easier to write, is its readability.\nAs Gruber writes:\n\n" -- "> The overriding design goal for Markdown's formatting syntax is\n> to make it as readable as possible. The idea is that a\n> Markdown-formatted document should be publishable as-is, as\n> plain text, without looking like it's been marked up with tags\n> " +- "[syntax description](https://daringfireball.net/projects/markdown/syntax)\nand a Perl script (`Markdown.pl`) for converting Markdown to\nHTML. In the next decade, dozens of implementations were\ndeveloped in many languages. Some extended the original\n" +- "Markdown syntax with conventions for footnotes, tables, and\nother document elements. Some allowed Markdown documents to be\nrendered in formats other than HTML. Websites like Reddit,\nStackOverflow, and GitHub had millions of people using Markdown.\n" +- "And Markdown started to be used beyond the web, to author books,\narticles, slide shows, letters, and lecture notes.\n\n" +- "What distinguishes Markdown from many other lightweight markup\nsyntaxes, which are often easier to write, is its readability.\nAs Gruber writes:\n\n" +- "> " +- "The overriding design goal for Markdown's formatting syntax is\n> to make it as readable as possible. The idea is that a\n> Markdown-formatted document should be publishable as-is, as\n> plain text, without looking like it's been marked up with tags\n> " - "or formatting instructions.\n> ()\n\n" - "The point can be illustrated by comparing a sample of\n[AsciiDoc](https://asciidoc.org/) with\nan equivalent sample of Markdown. Here is a sample of\nAsciiDoc from the AsciiDoc manual:\n" -- "\n```\n1. 
List item one.\n+\nList item one continued with a second paragraph followed by an\nIndented block.\n+\n.................\n$ ls *.sh\n$ mv *.sh ~/tmp\n.................\n+\nList item continued with a third paragraph.\n\n2. " +- "\n```\n" +- "1. List item one.\n+\nList item one continued with a second paragraph followed by an\nIndented block.\n+\n.................\n$ ls *.sh\n$ mv *.sh ~/tmp\n.................\n+\nList item continued with a third paragraph.\n\n2. " - "List item two continued with an open block.\n+\n--\nThis paragraph is part of the preceding list item.\n\na. This list is nested and does not require explicit item\ncontinuation.\n+\nThis paragraph is part of the preceding list item.\n\nb. List item b.\n\n" - "This paragraph belongs to item two of the outer list.\n--\n```\n\nAnd here is the equivalent in Markdown:\n" -- "```\n1. List item one.\n\n List item one continued with a second paragraph followed by an\n Indented block.\n\n $ ls *.sh\n $ mv *.sh ~/tmp\n\n List item continued with a third paragraph.\n\n2. List item two continued with an open block.\n\n" +- "```\n" +- "1. List item one.\n\n List item one continued with a second paragraph followed by an\n Indented block.\n\n $ ls *.sh\n $ mv *.sh ~/tmp\n\n List item continued with a third paragraph.\n\n2. List item two continued with an open block.\n\n" - " This paragraph is part of the preceding list item.\n\n 1. This list is nested and does not require explicit item continuation.\n\n This paragraph is part of the preceding list item.\n\n 2. List item b.\n\n" - " This paragraph belongs to item two of the outer list.\n```\n\n" - "The AsciiDoc version is, arguably, easier to write. You don't need\nto worry about indentation. But the Markdown version is much easier\nto read. The nesting of list items is apparent to the eye in the\nsource, not just in the processed document.\n\n" @@ -27,17 +30,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "How much indentation is needed for a sublist? 
The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but " - "`Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See " - "[this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)\n\n" -- "2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n " -- "also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken " -- "[in favor of requiring the blank\n lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n" +- "2. " +- "Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations" +- "\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank\n lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n" - "3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```\n\n" - "4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```" - "\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```" - "\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)\n\n" - "5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n" - "6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```\n\n" -- "7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)" -- "\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n" +- "7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n" +- "\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n" - "8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```\n\n" - "9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n" - "10. What are the precedence rules between block-level and inline-level\n structure? 
For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```\n\n" @@ -48,15 +51,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nBecause there is no unambiguous spec, implementations have diverged\nconsiderably. As a result, users are often surprised to find that\na document that renders one way on one system (say, a GitHub wiki)\n" - "renders differently on another (say, converting to docbook using\npandoc). To make matters worse, because nothing in Markdown counts\nas a \"syntax error,\" the divergence often isn't discovered right away.\n\n" - "## About this document\n\n" -- "This document attempts to specify Markdown syntax unambiguously.\nIt contains many examples with side-by-side Markdown and\nHTML. These are intended to double as conformance tests. An\naccompanying script `spec_tests.py`" -- " can be used to run the tests\nagainst any Markdown program:\n\n python test/spec_tests.py --spec spec.txt --program PROGRAM\n" +- "This document attempts to specify Markdown syntax unambiguously.\nIt contains many examples with side-by-side Markdown and\nHTML. These are intended to double as conformance tests. An\naccompanying script `spec_tests.py` can be used to run the tests\n" +- "against any Markdown program:\n\n python test/spec_tests.py --spec spec.txt --program PROGRAM\n" - "\nSince this document describes how Markdown is to be parsed into\nan abstract syntax tree, it would have made sense to use an abstract\nrepresentation of the syntax tree instead of HTML. But HTML is capable\n" - "of representing the structural distinctions we need to make, and the\nchoice of HTML for the tests makes it possible to run the tests against\nan implementation without writing an abstract syntax tree renderer.\n\n" - "Note that not every feature of the HTML samples is mandated by\nthe spec. 
For example, the spec says what counts as a link\ndestination, but it doesn't mandate that non-ASCII characters in\nthe URL be percent-encoded. To use the automatic tests,\n" - "implementers will need to provide a renderer that conforms to\nthe expectations of the spec examples (percent-encoding\nnon-ASCII characters in URLs). But a conforming implementation\ncan use a different renderer and may choose not to\n" - "percent-encode non-ASCII characters in URLs.\n\n" -- "This document is generated from a text file, `spec.txt`, written\nin Markdown with a small extension for the side-by-side tests.\nThe script `tools/makespec.py` can be used to convert `spec.txt`" -- " into\nHTML or CommonMark (which can then be converted into other formats).\n\nIn the examples, the `→` character is used to represent tabs.\n\n" +- "This document is generated from a text file, `spec.txt`, written\nin Markdown with a small extension for the side-by-side tests.\nThe script `tools/makespec.py` can be used to convert `spec.txt` into\n" +- "HTML or CommonMark (which can then be converted into other formats).\n\nIn the examples, the `→` character is used to represent tabs.\n\n" - "# Preliminaries\n\n" - "## Characters and lines\n\nAny sequence of [characters] is a valid CommonMark\ndocument.\n" - "\nA [character](@) is a Unicode code point. Although some\ncode points (for example, combining accents) do not correspond to\ncharacters in an intuitive sense, all code points count as characters\nfor purposes of this spec.\n" @@ -76,9 +79,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\nIn the following example, a continuation paragraph of a list\nitem is indented with a tab; this has exactly the same effect\nas indentation with four spaces would:\n" - "\n```````````````````````````````` example\n - foo\n\n→bar\n.\n

      \n
    • \n

      foo

      \n

      bar

      \n
    • \n
    \n````````````````````````````````" - "\n\n```````````````````````````````` example\n- foo\n\n→→bar\n.\n
      \n
    • \n

      foo

      \n
        bar\n
      \n
    • \n
    \n````````````````````````````````" -- "\n\nNormally the `>` that begins a block quote may be followed\noptionally by a space, which is not considered part of the\ncontent. In the following case `>`" -- " is followed by a tab,\nwhich is treated as if it were expanded into three spaces.\nSince one of these spaces is considered part of the\ndelimiter, `foo`" -- " is considered to be indented six spaces\ninside the block quote context, so we get an indented\ncode block starting with two spaces.\n\n" +- "\n\nNormally the `>` that begins a block quote may be followed\noptionally by a space, which is not considered part of the\ncontent. In the following case `>` is followed by a tab,\nwhich is treated as if it were expanded into three spaces.\n" +- "Since one of these spaces is considered part of the\ndelimiter, `foo` is considered to be indented six spaces\ninside the block quote context, so we get an indented\ncode block starting with two spaces.\n\n" - "```````````````````````````````` example\n>→→foo\n.\n
    \n
      foo\n
    \n
    \n````````````````````````````````" - "\n\n```````````````````````````````` example\n-→→foo\n.\n
      \n
    • \n
        foo\n
      \n
    • \n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n foo\n→bar\n.\n
    foo\nbar\n
    \n````````````````````````````````" @@ -89,9 +91,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n\\!\\\"\\#\\$\\%\\&\\'\\(\\)\\*\\+\\,\\-\\.\\/\\:\\;\\<\\=\\>\\?\\@\\[\\\\\\]\\^\\_\\`\\{\\|\\}\\~\n.\n

    !"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~

    \n````````````````````````````````" - "\n\n\nBackslashes before other characters are treated as literal\nbackslashes:\n\n```````````````````````````````` example\n\\→\\A\\a\\ \\3\\φ\\«\n.\n

    \\→\\A\\a\\ \\3\\φ\\«

    \n````````````````````````````````" - "\n\n\nEscaped characters are treated as regular characters and do\nnot have their usual Markdown meanings:\n" -- "\n```````````````````````````````` example\n\\*not emphasized*\n\\
    not a tag\n\\[not a link](/foo)\n\\`not code`\n1\\. not a list\n\\* not a list\n\\# not a heading\n\\[foo]: /url \"not a reference\"\n\\ö not a character entity\n.\n

    *not emphasized*\n" -- "<br/> not a tag\n[not a link](/foo)\n`not code`\n1. not a list\n* not a list\n# not a heading\n[foo]: /url "not a reference"\n&ouml; not a character entity

    \n````````````````````````````````\n\n\n" -- "If a backslash is itself escaped, the following character is not:\n\n```````````````````````````````` example\n\\\\*emphasis*\n.\n

    \\emphasis

    \n````````````````````````````````\n\n\nA backslash at the end of the line is a [hard line break]:\n" +- "\n```````````````````````````````` example\n" +- "\\*not emphasized*\n\\
    not a tag\n\\[not a link](/foo)\n\\`not code`\n1\\. not a list\n\\* not a list\n\\# not a heading\n\\[foo]: /url \"not a reference\"\n\\ö not a character entity\n.\n

    *not emphasized*\n<br/> not a tag\n[not a link](/foo)\n`not code`\n" +- "1. not a list\n* not a list\n# not a heading\n[foo]: /url "not a reference"\n&ouml; not a character entity

    \n````````````````````````````````\n\n\nIf a backslash is itself escaped, the following character is not:\n" +- "\n```````````````````````````````` example\n\\\\*emphasis*\n.\n

    \\emphasis

    \n````````````````````````````````\n\n\nA backslash at the end of the line is a [hard line break]:\n" - "\n```````````````````````````````` example\nfoo\\\nbar\n.\n

    foo
    \nbar

    \n````````````````````````````````\n\n\nBackslash escapes do not work in code blocks, code spans, autolinks, or\nraw HTML:\n" - "\n```````````````````````````````` example\n`` \\[\\` ``\n.\n

    \\[\\`

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n \\[\\]\n.\n
    \\[\\]\n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n~~~\n\\[\\]\n~~~\n.\n
    \\[\\]\n
    \n````````````````````````````````" @@ -109,9 +112,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n  & © Æ Ď\n¾ ℋ ⅆ\n∲ ≧̸\n.\n

      & © Æ Ď\n¾ ℋ ⅆ\n∲ ≧̸

    \n````````````````````````````````" - "\n\n\n[Decimal numeric character\nreferences](@)\nconsist of `&#` + a string of 1--7 arabic digits + `;`. A\nnumeric character reference is parsed as the corresponding\nUnicode character. Invalid Unicode code points will be replaced by\nthe REPLACEMENT CHARACTER (" - "`U+FFFD`). For security reasons,\nthe code point `U+0000` will also be replaced by `U+FFFD`.\n\n```````````````````````````````` example\n# Ӓ Ϡ �\n.\n

    # Ӓ Ϡ �

    \n````````````````````````````````" -- "\n\n\n[Hexadecimal numeric character\nreferences](@) consist of `&#` +\neither `X` or `x` + a string of 1-6 hexadecimal digits + `;`" -- ".\nThey too are parsed as the corresponding Unicode character (this\ntime specified with a hexadecimal numeral instead of decimal).\n\n```````````````````````````````` example\n" ആ ಫ\n.\n

    " ആ ಫ

    \n````````````````````````````````" -- "\n\n\nHere are some nonentities:\n" +- "\n\n\n[Hexadecimal numeric character\nreferences](@) consist of `&#` +\neither `X` or `x` + a string of 1-6 hexadecimal digits + `;`.\nThey too are parsed as the corresponding Unicode character (this\ntime specified with a hexadecimal numeral instead of decimal)." +- "\n\n```````````````````````````````` example\n" ആ ಫ\n.\n

    " ആ ಫ

    \n````````````````````````````````\n\n\nHere are some nonentities:\n" - "\n```````````````````````````````` example\n  &x; &#; &#x;\n�\n&#abcdef0;\n&ThisIsNotDefined; &hi?;\n.\n

    &nbsp &x; &#; &#x;\n&#87654321;\n&#abcdef0;\n&ThisIsNotDefined; &hi?;

    \n````````````````````````````````" - "\n\n\nAlthough HTML5 does accept some entity references\nwithout a trailing semicolon (such as `©`), these are not\nrecognized here, because it makes the grammar too ambiguous:\n" - "\n```````````````````````````````` example\n©\n.\n

    &copy

    \n````````````````````````````````\n\n\nStrings that are not on the list of HTML5 named entities are not\nrecognized as entity references either:\n" @@ -127,9 +129,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n```````````````````````````````` example\n* foo\n\n* foo\n.\n

    * foo

    \n
      \n
    • foo
    • \n
    \n````````````````````````````````\n\n```````````````````````````````` example\nfoo bar\n.\n

    foo\n\nbar

    \n````````````````````````````````" - "\n\n```````````````````````````````` example\n foo\n.\n

    →foo

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n[a](url "tit")\n.\n

    [a](url "tit")

    \n````````````````````````````````\n\n\n\n" - "# Blocks and inlines\n\n" -- "We can think of a document as a sequence of\n[blocks](@)" -- "---structural elements like paragraphs, block\nquotations, lists, headings, rules, and code blocks. Some blocks (like\nblock quotes and list items) contain other blocks; others (like\nheadings and paragraphs) contain [inline](@)" -- " content---text,\nlinks, emphasized text, images, code spans, and so on.\n\n" +- "We can think of a document as a sequence of\n[blocks](@)---structural elements like paragraphs, block\nquotations, lists, headings, rules, and code blocks. Some blocks (like\nblock quotes and list items) contain other blocks; others (like\n" +- "headings and paragraphs) contain [inline](@) content---text,\nlinks, emphasized text, images, code spans, and so on.\n\n" - "## Precedence\n\nIndicators of block structure always take precedence over indicators\nof inline structure. So, for example, the following is a list with\ntwo items, not a list with one item containing a code span:\n" - "\n```````````````````````````````` example\n- `one\n- two`\n.\n
      \n
    • `one
    • \n
    • two`
    • \n
    \n````````````````````````````````" - "\n\n\nThis means that parsing can proceed in two steps: first, the block\nstructure of the document can be discerned; second, text lines inside\nparagraphs, headings, and other block constructs can be parsed for inline\n" @@ -180,13 +181,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\nFoo bar\n# baz\nBar foo\n.\n

    Foo bar

    \n

    baz

    \n

    Bar foo

    \n````````````````````````````````\n\n\nATX headings can be empty:\n" - "\n```````````````````````````````` example\n## \n#\n### ###\n.\n

    \n

    \n

    \n````````````````````````````````\n\n\n" - "## Setext headings\n\n" -- "A [setext heading](@)" -- " consists of one or more\nlines of text, not interrupted by a blank line, of which the first line does not\nhave more than 3 spaces of indentation, followed by\na [setext heading underline]. The lines of text must be such\n" +- "A [setext heading](@) consists of one or more\nlines of text, not interrupted by a blank line, of which the first line does not\nhave more than 3 spaces of indentation, followed by\na [setext heading underline]. The lines of text must be such\n" - "that, were they not followed by the setext heading underline,\nthey would be interpreted as a paragraph: they cannot be\ninterpretable as a [code fence], [ATX heading][ATX headings],\n[block quote][block quotes], [thematic break][thematic breaks],\n[list item" - "][list items], or [HTML block][HTML blocks].\n\nA [setext heading underline](@) is a sequence of\n`=` characters or a sequence of `-` characters, with no more than 3\nspaces of indentation and any number of trailing spaces or tabs.\n" -- "\nThe heading is a level 1 heading if `=` characters are used in\nthe [setext heading underline], and a level 2 heading if `-`" -- "\ncharacters are used. The contents of the heading are the result\nof parsing the preceding lines of text as CommonMark inline\ncontent.\n\n" -- "In general, a setext heading need not be preceded or followed by a\nblank line. However, it cannot interrupt a paragraph, so when a\nsetext heading comes after a paragraph, a blank line is needed between\nthem.\n\nSimple examples:\n" +- "\nThe heading is a level 1 heading if `=` characters are used in\nthe [setext heading underline], and a level 2 heading if `-`\ncharacters are used. The contents of the heading are the result\nof parsing the preceding lines of text as CommonMark inline\n" +- "content.\n\nIn general, a setext heading need not be preceded or followed by a\nblank line. 
However, it cannot interrupt a paragraph, so when a\nsetext heading comes after a paragraph, a blank line is needed between\nthem.\n\nSimple examples:\n" - "\n```````````````````````````````` example\nFoo *bar*\n=========\n\nFoo *bar*\n---------\n.\n

    Foo bar

    \n

    Foo bar

    \n````````````````````````````````\n\n\nThe content of the header may span more than one line:\n" - "\n```````````````````````````````` example\nFoo *bar\nbaz*\n====\n.\n

    Foo bar\nbaz

    \n````````````````````````````````" - "\n\nThe contents are the result of parsing the headings's raw\ncontent as inlines. The heading's raw content is formed by\nconcatenating the lines and removing initial and final\nspaces or tabs.\n" @@ -220,9 +219,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\nFoo\nbar\n* * *\nbaz\n.\n

    Foo\nbar

    \n
    \n

    baz

    \n````````````````````````````````\n\n\nAuthors who want interpretation 3 can use backslash escapes:\n" - "\n```````````````````````````````` example\nFoo\nbar\n\\---\nbaz\n.\n

    Foo\nbar\n---\nbaz

    \n````````````````````````````````\n\n\n" - "## Indented code blocks\n\n" -- "An [indented code block](@) is composed of one or more\n[indented chunks] separated by blank lines.\nAn [indented chunk](@)" -- " is a sequence of non-blank lines,\neach preceded by four or more spaces of indentation. The contents of the code\nblock are the literal contents of the lines, including trailing\n[line endings], minus four spaces of indentation.\n" -- "An indented code block has no [info string].\n\n" +- "An [indented code block](@) is composed of one or more\n[indented chunks] separated by blank lines.\nAn [indented chunk](@) is a sequence of non-blank lines,\neach preceded by four or more spaces of indentation. The contents of the code\n" +- "block are the literal contents of the lines, including trailing\n[line endings], minus four spaces of indentation.\nAn indented code block has no [info string].\n\n" - "An indented code block cannot interrupt a paragraph, so there must be\na blank line between a paragraph and a following indented code block.\n(A blank line is not needed, however, between a code block and a following\nparagraph.)\n" - "\n```````````````````````````````` example\n a simple\n indented code block\n.\n
    a simple\n  indented code block\n
    \n````````````````````````````````" - "\n\n\nIf there is any ambiguity between an interpretation of indentation\nas a code block and as indicating that material belongs to a [list\nitem][list items], the list item interpretation takes precedence:\n" @@ -241,8 +239,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\nTrailing spaces or tabs are included in the code block's content:\n\n```````````````````````````````` example\n foo \n.\n
    foo  \n
    \n````````````````````````````````\n\n\n\n" - "## Fenced code blocks\n\n" - "A [code fence](@) is a sequence\nof at least three consecutive backtick characters (`` ` ``) or\ntildes (`~`). (Tildes and backticks cannot be mixed.)\nA [fenced code block](@)\nbegins with a code fence, preceded by up to three spaces of indentation.\n" -- "\nThe line with the opening code fence may optionally contain some text\nfollowing the code fence; this is trimmed of leading and trailing\nspaces or tabs and called the [info string](@)" -- ". If the [info string] comes\nafter a backtick fence, it may not contain any backtick\ncharacters. (The reason for this restriction is that otherwise\nsome inline code would be incorrectly interpreted as the\nbeginning of a fenced code block.)\n\n" +- "\nThe line with the opening code fence may optionally contain some text\nfollowing the code fence; this is trimmed of leading and trailing\nspaces or tabs and called the [info string](@). If the [info string] comes\n" +- "after a backtick fence, it may not contain any backtick\ncharacters. (The reason for this restriction is that otherwise\nsome inline code would be incorrectly interpreted as the\nbeginning of a fenced code block.)\n\n" - "The content of the code block consists of all subsequent lines, until\na closing [code fence] of the same type as the code block\nbegan with (backticks or tildes), and with at least as many backticks\n" - "or tildes as the opening code fence. If the leading code fence is\npreceded by N spaces of indentation, then up to N spaces of indentation are\nremoved from each line of the content (if present). (If a content line is not\n" - "indented, it is preserved unchanged. If it is indented N spaces or less, all\nof the indentation is removed.)\n\n" @@ -283,9 +281,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n~~~ aa ``` ~~~\nfoo\n~~~\n.\n
    foo\n
    \n````````````````````````````````\n\n\nClosing code fences cannot have [info strings]:\n" - "\n```````````````````````````````` example\n```\n``` aaa\n```\n.\n
    ``` aaa\n
    \n````````````````````````````````\n\n\n\n" - "## HTML blocks\n\nAn [HTML block](@) is a group of lines that is treated\nas raw HTML (and will not be escaped in HTML output).\n" -- "\nThere are seven kinds of [HTML block], which can be defined by their\nstart and end conditions. The block begins with a line that meets a\n[start condition](@)" -- " (after up to three optional spaces of indentation).\nIt ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks)" -- " containing the current HTML\nblock, if no line is encountered that meets the [end condition]. If\nthe first line meets both the [start condition] and the [end\ncondition], the block will contain just that line.\n\n" +- "\nThere are seven kinds of [HTML block], which can be defined by their\nstart and end conditions. The block begins with a line that meets a\n[start condition](@) (after up to three optional spaces of indentation).\n" +- "It ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks) containing the current HTML\nblock, if no line is encountered that meets the [" +- "end condition]. If\nthe first line meets both the [start condition] and the [end\ncondition], the block will contain just that line.\n\n" - "1. " - "**Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``, " - "``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n" @@ -323,21 +321,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n\n*bar*\n\n.\n\n*bar*\n\n````````````````````````````````\n\n\nIn type 7 blocks, the [tag name] can be anything:\n" - "\n```````````````````````````````` example\n\n*bar*\n\n.\n\n*bar*\n\n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n\n*bar*\n\n.\n\n*bar*\n\n````````````````````````````````\n\n\n```````````````````````````````` example\n\n*bar*\n.\n\n*bar*\n````````````````````````````````" -- "\n\n\nThese rules are designed to allow us to work with tags that\ncan function as either block-level or inline-level tags.\nThe `` tag is a nice example. We can surround content with\n``" -- " tags in three different ways. In this case, we get a raw\nHTML block, because the `` tag is on a line by itself:\n\n```````````````````````````````` example\n\n*foo*\n\n.\n\n*foo*\n\n````````````````````````````````" +- "\n\n\nThese rules are designed to allow us to work with tags that\ncan function as either block-level or inline-level tags.\nThe `` tag is a nice example. We can surround content with\n`` tags in three different ways. In this case, we get a raw\n" +- "HTML block, because the `` tag is on a line by itself:\n\n```````````````````````````````` example\n\n*foo*\n\n.\n\n*foo*\n\n````````````````````````````````" - "\n\n\nIn this case, we get a raw HTML block that just includes\nthe `` tag (because it ends with the following blank\nline). So the contents get interpreted as CommonMark:\n" - "\n```````````````````````````````` example\n\n\n*foo*\n\n\n.\n\n

    foo

    \n
    \n````````````````````````````````" - "\n\n\nFinally, in this case, the `` tags are interpreted\nas [raw HTML] *inside* the CommonMark paragraph. (Because\nthe tag is not on a line by itself, we get inline HTML\nrather than an [HTML block].)\n" - "\n```````````````````````````````` example\n*foo*\n.\n

    foo

    \n````````````````````````````````" -- "\n\n\nHTML tags designed to contain literal content\n(`pre`, `script`, `style`, `textarea`" -- "), comments, processing instructions,\nand declarations are treated somewhat differently.\nInstead of ending at the first blank line, these blocks\nend at the first line containing a corresponding end tag.\nAs a result, these blocks can contain blank lines:\n\n" -- "A pre tag (type 1):\n" -- "\n```````````````````````````````` example\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
    \nokay\n.\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\n"
    -- "main = print $ parseTags tags\n
    \n

    okay

    \n````````````````````````````````\n\n\nA script tag (type 1):\n" -- "\n```````````````````````````````` example\n\nokay\n.\n\n

    okay

    \n````````````````````````````````\n\n\nA textarea tag (type 1):\n" -- "\n```````````````````````````````` example\n\n.\n\n````````````````````````````````\n\nA style tag (type 1):\n" -- "\n```````````````````````````````` example\n\nh1 {color:red;}\n\np {color:blue;}\n\nokay\n.\n\nh1 {color:red;}\n\np {color:blue;}\n\n

    okay

    \n````````````````````````````````" +- "\n\n\nHTML tags designed to contain literal content\n(`pre`, `script`, `style`, `textarea`), comments, processing instructions,\nand declarations are treated somewhat differently.\nInstead of ending at the first blank line, these blocks\n" +- "end at the first line containing a corresponding end tag.\nAs a result, these blocks can contain blank lines:\n\nA pre tag (type 1):\n" +- "\n```````````````````````````````` example\n" +- "
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
    \nokay\n.\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
    \n

    okay

    \n" +- "````````````````````````````````\n\n\nA script tag (type 1):\n" +- "\n```````````````````````````````` example\n" +- "\nokay\n.\n\n

    okay

    \n````````````````````````````````\n\n\nA textarea tag (type 1):\n\n```````````````````````````````` example\n\n.\n\n````````````````````````````````" +- "\n\nA style tag (type 1):\n\n```````````````````````````````` example\n\nh1 {color:red;}\n\np {color:blue;}\n\nokay\n.\n\nh1 {color:red;}\n\np {color:blue;}\n\n

    okay

    \n````````````````````````````````" - "\n\n\nIf there is no matching end tag, the block will end at the\nend of the document (or the enclosing [block quote][block quotes]\nor [list item][list items]):\n" - "\n```````````````````````````````` example\n\n\nfoo\n.\n\n\nfoo\n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n>
    \n> foo\n\nbar\n.\n
    \n
    \nfoo\n
    \n

    bar

    \n````````````````````````````````" @@ -348,8 +346,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n\nokay\n.\n\n

    okay

    \n````````````````````````````````\n\n\n\nA processing instruction (type 3):\n" - "\n```````````````````````````````` example\n';\n\n?>\nokay\n.\n';\n\n?>\n

    okay

    \n````````````````````````````````\n\n\nA declaration (type 4):\n" - "\n```````````````````````````````` example\n\n.\n\n````````````````````````````````\n\n\nCDATA (type 5):\n" -- "\n```````````````````````````````` example\n\nokay\n.\n\n

    okay

    \n````````````````````````````````\n\n\nThe opening tag can be preceded by up to three spaces of indentation, but not\nfour:\n" +- "\n```````````````````````````````` example\n" +- "\nokay\n.\n\n

    okay

    \n" +- "````````````````````````````````\n\n\nThe opening tag can be preceded by up to three spaces of indentation, but not\nfour:\n" - "\n```````````````````````````````` example\n \n\n \n.\n \n
    <!-- foo -->\n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n
    \n\n
    \n.\n
    \n
    <div>\n
    \n````````````````````````````````\n\n\nAn HTML block of types 1--6 can interrupt a paragraph, and need not be\npreceded by a blank line.\n" - "\n```````````````````````````````` example\nFoo\n
    \nbar\n
    \n.\n

    Foo

    \n
    \nbar\n
    \n````````````````````````````````" @@ -366,17 +365,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "and flexible way of including Markdown content inside HTML tags:\nsimply separate the Markdown from the HTML using blank lines:\n\nCompare:\n" - "\n```````````````````````````````` example\n
    \n\n*Emphasized* text.\n\n
    \n.\n
    \n

    Emphasized text.

    \n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n
    \n*Emphasized* text.\n
    \n.\n
    \n*Emphasized* text.\n
    \n````````````````````````````````" -- "\n\n\nSome Markdown implementations have adopted a convention of\ninterpreting content inside tags as text if the open tag has\nthe attribute `markdown=1`" -- ". The rule given above seems a simpler and\nmore elegant way of achieving the same expressive power, which is also\nmuch simpler to parse." -- "\n\nThe main potential drawback is that one can no longer paste HTML\nblocks into Markdown documents with 100% reliability. However,\n*in most cases* this will work fine, because the blank lines in\nHTML are usually followed by HTML block tags. For example:\n" +- "\n\n\nSome Markdown implementations have adopted a convention of\ninterpreting content inside tags as text if the open tag has\nthe attribute `markdown=1`. The rule given above seems a simpler and\n" +- "more elegant way of achieving the same expressive power, which is also\nmuch simpler to parse.\n\n" +- "The main potential drawback is that one can no longer paste HTML\nblocks into Markdown documents with 100% reliability. However,\n*in most cases* this will work fine, because the blank lines in\nHTML are usually followed by HTML block tags. For example:\n" - "\n```````````````````````````````` example\n
    \n\n\n\n\n\n\n\n
    \nHi\n
    \n.\n\n\n\n\n
    \nHi\n
    \n````````````````````````````````" - "\n\n\nThere are problems, however, if the inner tags are indented\n*and* separated by spaces, as then they will be interpreted as\nan indented code block:\n" - "\n```````````````````````````````` example\n\n\n \n\n \n\n \n\n
    \n Hi\n
    \n.\n\n \n
    <td>\n  Hi\n</td>\n
    \n \n
    \n````````````````````````````````" - "\n\n\nFortunately, blank lines are usually not necessary and can be\ndeleted. The exception is inside `
    ` tags, but as described\n[above][HTML blocks], raw HTML blocks starting with `
    `\n*can* contain blank lines.\n\n"
     - "## Link reference definitions\n\n"
    -- "A [link reference definition](@)\nconsists of a [link label], optionally preceded by up to three spaces of\nindentation, followed\nby a colon (`:`"
    -- "), optional spaces or tabs (including up to one\n[line ending]), a [link destination],\noptional spaces or tabs (including up to one\n[line ending]), and an optional [link\ntitle], which if it is present must be separated\nfrom the [link destination]"
    -- " by spaces or tabs.\nNo further character may occur.\n\n"
    +- "A [link reference definition](@)\nconsists of a [link label], optionally preceded by up to three spaces of\nindentation, followed\nby a colon (`:`), optional spaces or tabs (including up to one\n[line ending]), a [link destination],\n"
    +- "optional spaces or tabs (including up to one\n[line ending]), and an optional [link\ntitle], which if it is present must be separated\nfrom the [link destination] by spaces or tabs.\nNo further character may occur.\n\n"
     - "A [link reference definition]\ndoes not correspond to a structural element of a document.  Instead, it\ndefines a label which can be used in [reference links]\nand reference-style [images] elsewhere in the document.  [Link\nreference definitions]"
     - " can come either before or after the links that use\nthem.\n\n```````````````````````````````` example\n[foo]: /url \"title\"\n\n[foo]\n.\n

    foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n [foo]: \n /url \n 'the title' \n\n[foo]\n.\n

    foo

    \n````````````````````````````````" @@ -408,10 +406,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n\n\n[Link reference definitions] can occur\ninside block containers, like lists and block quotations. They\naffect the entire document, not just the container in which they\nare defined:\n" - "\n```````````````````````````````` example\n[foo]\n\n> [foo]: /url\n.\n

    foo

    \n
    \n
    \n````````````````````````````````\n\n\n" - "## Paragraphs\n\n" -- "A sequence of non-blank lines that cannot be interpreted as other\nkinds of blocks forms a [paragraph](@)" -- ".\nThe contents of the paragraph are the result of parsing the\nparagraph's raw content as inlines. The paragraph's raw content\nis formed by concatenating the lines and removing initial and final\nspaces or tabs.\n\nA simple example with two paragraphs:\n" -- "\n```````````````````````````````` example\naaa\n\nbbb\n.\n

    aaa

    \n

    bbb

    \n````````````````````````````````\n\n\nParagraphs can contain multiple lines, but no blank lines:\n" -- "\n```````````````````````````````` example\naaa\nbbb\n\nccc\nddd\n.\n

    aaa\nbbb

    \n

    ccc\nddd

    \n````````````````````````````````\n\n\nMultiple blank lines between paragraphs have no effect:\n" +- "A sequence of non-blank lines that cannot be interpreted as other\nkinds of blocks forms a [paragraph](@).\nThe contents of the paragraph are the result of parsing the\nparagraph's raw content as inlines. The paragraph's raw content\n" +- "is formed by concatenating the lines and removing initial and final\nspaces or tabs.\n\nA simple example with two paragraphs:\n\n```````````````````````````````` example\naaa\n\nbbb\n.\n

    aaa

    \n

    bbb

    \n````````````````````````````````" +- "\n\n\nParagraphs can contain multiple lines, but no blank lines:\n\n```````````````````````````````` example\naaa\nbbb\n\nccc\nddd\n.\n

    aaa\nbbb

    \n

    ccc\nddd

    \n````````````````````````````````\n\n\nMultiple blank lines between paragraphs have no effect:\n" - "\n```````````````````````````````` example\naaa\n\n\nbbb\n.\n

    aaa

    \n

    bbb

    \n````````````````````````````````\n\n\nLeading spaces or tabs are skipped:\n\n```````````````````````````````` example\n aaa\n bbb\n.\n

    aaa\nbbb

    \n````````````````````````````````" - "\n\n\nLines after the first may be indented any amount, since indented\ncode blocks cannot interrupt paragraphs.\n" - "\n```````````````````````````````` example\naaa\n bbb\n ccc\n.\n

    aaa\nbbb\nccc

    \n````````````````````````````````" @@ -423,16 +420,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n \n\naaa\n \n\n# aaa\n\n \n.\n

    aaa

    \n

    aaa

    \n````````````````````````````````\n\n\n\n" - "# Container blocks\n\nA [container block](#container-blocks) is a block that has other\nblocks as its contents. There are two basic kinds of container blocks:\n[block quotes] and [list items].\n[Lists] are meta-containers for [list items].\n" - "\nWe define the syntax for container blocks recursively. The general\nform of the definition is:\n\n> If X is a sequence of blocks, then the result of\n> transforming X in such-and-such a way is a container of type Y\n> with these blocks as its content.\n" -- "\nSo, we explain what counts as a block quote or list item by explaining\nhow these can be *generated* from their contents. This should suffice\nto define the syntax, although it does not give a recipe for *parsing*" -- "\nthese constructions. (A recipe is provided below in the section entitled\n[A parsing strategy](#appendix-a-parsing-strategy).)\n\n" +- "\nSo, we explain what counts as a block quote or list item by explaining\nhow these can be *generated* from their contents. This should suffice\nto define the syntax, although it does not give a recipe for *parsing*\n" +- "these constructions. (A recipe is provided below in the section entitled\n[A parsing strategy](#appendix-a-parsing-strategy).)\n\n" - "## Block quotes\n\nA [block quote marker](@),\noptionally preceded by up to three spaces of indentation,\nconsists of (a) the character `>` together with a following space of\nindentation, or (b) a single character `>` not followed by a space of\nindentation.\n" - "\nThe following rules define [block quotes]:\n\n" - "1. **Basic case.** If a string of lines *Ls* constitute a sequence\n of blocks *Bs*, then the result of prepending a [block quote\n marker] to the beginning of each line in *Ls*\n is a [block quote](#block-quotes) containing *Bs*.\n\n" - "2. 
" -- "**Laziness.** If a string of lines *Ls* constitute a [block\n quote](#block-quotes) with contents *Bs*" -- ", then the result of deleting\n the initial [block quote marker] from one or\n more lines in which the next character other than a space or tab after the\n [block quote marker] is [paragraph continuation\n text] is a block quote with *Bs*" -- " as its content.\n [Paragraph continuation text](@) is text\n that will be parsed as part of the content of a paragraph, but does\n not occur at the beginning of the paragraph.\n\n" -- "3. **Consecutiveness.** A document cannot contain two [block\n quotes] in a row unless there is a [blank line] between them.\n\n" +- "**Laziness.** If a string of lines *Ls* constitute a [block\n quote](#block-quotes) with contents *Bs*, then the result of deleting\n the initial [block quote marker] from one or\n " +- "more lines in which the next character other than a space or tab after the\n [block quote marker] is [paragraph continuation\n text] is a block quote with *Bs* as its content.\n [Paragraph continuation text](@) is text\n " +- "that will be parsed as part of the content of a paragraph, but does\n not occur at the beginning of the paragraph.\n\n3. **Consecutiveness.** A document cannot contain two [block\n quotes] in a row unless there is a [blank line] between them.\n\n" - "Nothing else counts as a [block quote](#block-quotes).\n\nHere is a simple example:\n\n```````````````````````````````` example\n> # Foo\n> bar\n> baz\n.\n
    \n

    Foo

    \n

    bar\nbaz

    \n
    \n````````````````````````````````" - "\n\n\nThe space or tab after the `>` characters can be omitted:\n\n```````````````````````````````` example\n># Foo\n>bar\n> baz\n.\n
    \n

    Foo

    \n

    bar\nbaz

    \n
    \n````````````````````````````````" - "\n\n\nThe `>` characters can be preceded by up to three spaces of indentation:\n\n```````````````````````````````` example\n > # Foo\n > bar\n > baz\n.\n
    \n

    Foo

    \n

    bar\nbaz

    \n
    \n````````````````````````````````" @@ -467,16 +463,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nAn [ordered list marker](@)\nis a sequence of 1--9 arabic digits (`0-9`), followed by either a\n`.` character or a `)` character. (The reason for the length\nlimit is that with 10 digits we start seeing integer overflows\nin some browsers.)\n" - "\nThe following rules define [list items]:\n\n" - "1. " -- "**Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N*" -- " ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs*" -- " as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n\n " -- "1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls*" -- " must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\n" +- "**Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n " +- "then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. 
The type of the list item\n " +- "(bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n\n " +- "1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if" +- "\n the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\n" - "For example, let *Ls* be the lines\n" - "\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    \n" - "````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:\n" -- "\n```````````````````````````````` example\n1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n" -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\n" +- "\n```````````````````````````````` example\n" +- "1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n" +- "````````````````````````````````\n\n\n" - "The most important thing to notice is that the position of\nthe text after the list marker determines how much indentation\nis needed in subsequent blocks in the list item. If the list\n" - "marker takes up two spaces of indentation, and there are three spaces between\nthe list marker and the next character other than a space or tab, then blocks\nmust be indented five spaces in order to fall under the list\nitem.\n\n" - "Here are some examples showing how far content must be indented to be\nput under the list item:\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • one
    • \n
    \n

    two

    \n````````````````````````````````" @@ -487,9 +484,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The spaces of indentation after the list marker determine how much relative\nindentation is needed. Which column this indentation reaches will depend on\nhow the list item is embedded in other constructions, as shown by\nthis example:\n\n" - "```````````````````````````````` example\n > > 1. one\n>>\n>> two\n.\n
    \n
    \n
      \n
    1. \n

      one

      \n

      two

      \n
    2. \n
    \n
    \n
    \n````````````````````````````````" - "\n\n\nHere `two` occurs in the same column as the list marker `1.`,\nbut is actually contained in the list item, because there is\nsufficient indentation after the last containing blockquote marker.\n" -- "\nThe converse is also possible. In the following example, the word `two`\noccurs far to the right of the initial text of the list item, `one`" -- ", but\nit is not considered part of the list item, because it is not indented\nfar enough past the blockquote marker:\n\n" -- "```````````````````````````````` example\n>>- one\n>>\n > > two\n.\n
    \n
    \n
      \n
    • one
    • \n
    \n

    two

    \n
    \n
    \n````````````````````````````````" +- "\nThe converse is also possible. In the following example, the word `two`\noccurs far to the right of the initial text of the list item, `one`, but\nit is not considered part of the list item, because it is not indented\nfar enough past the blockquote marker:" +- "\n\n```````````````````````````````` example\n>>- one\n>>\n > > two\n.\n
    \n
    \n
      \n
    • one
    • \n
    \n

    two

    \n
    \n
    \n````````````````````````````````" - "\n\n\nNote that at least one space or tab is needed between the list marker and\nany following content, so these are not list items:\n\n```````````````````````````````` example\n-one\n\n2.two\n.\n

    -one

    \n

    2.two

    \n````````````````````````````````" - "\n\n\nA list item may contain blocks that are separated by more than\none blank line.\n\n```````````````````````````````` example\n- foo\n\n\n bar\n.\n
      \n
    • \n

      foo

      \n

      bar

      \n
    • \n
    \n````````````````````````````````" - "\n\n\nA list item may contain any kind of block:\n" @@ -500,9 +496,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\nA start number may begin with 0s:\n\n```````````````````````````````` example\n0. ok\n.\n
      \n
    1. ok
    2. \n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n003. ok\n.\n
      \n
    1. ok
    2. \n
    \n````````````````````````````````\n\n\nA start number may not be negative:\n" - "\n```````````````````````````````` example\n-1. not ok\n.\n

    -1. not ok

    \n````````````````````````````````\n\n\n\n" -- "2. **Item starting with indented code.** If a sequence of lines *Ls*\n constitute a sequence of blocks *Bs* starting with an indented code\n block, and *M* is a list marker of width *W*" -- " followed by\n one space of indentation, then the result of prepending *M* and the\n following space to the first line of *Ls*, and indenting subsequent lines\n of *Ls* by *W + 1* spaces, is a list item with *Bs*" -- " as its contents.\n If a line is empty, then it need not be indented. The type of the\n list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a\n " +- "2. **Item starting with indented code.** If a sequence of lines *Ls*\n constitute a sequence of blocks *Bs* starting with an indented code\n block, and *M* is a list marker of width *W* followed by\n " +- "one space of indentation, then the result of prepending *M* and the\n following space to the first line of *Ls*, and indenting subsequent lines\n of *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents.\n " +- "If a line is empty, then it need not be indented. The type of the\n list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a\n " - "start number, based on the ordered list marker.\n\nAn indented code block will have to be preceded by four spaces of indentation\nbeyond the edge of the region where text will be included in the list item.\nIn the following case that is 6 spaces:\n" - "\n```````````````````````````````` example\n- foo\n\n bar\n.\n
      \n
    • \n

      foo

      \n
      bar\n
      \n
    • \n
    \n````````````````````````````````\n\n\nAnd in this case it is 11 spaces:\n" - "\n```````````````````````````````` example\n 10. foo\n\n bar\n.\n
      \n
    1. \n

      foo

      \n
      bar\n
      \n
    2. \n
    \n````````````````````````````````" @@ -517,9 +513,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\nThis is not a significant restriction, because when a block is preceded by up to\nthree spaces of indentation, the indentation can always be removed without\na change in interpretation, allowing rule #1 to be applied. So, in\nthe above case:\n" - "\n```````````````````````````````` example\n- foo\n\n bar\n.\n
      \n
    • \n

      foo

      \n

      bar

      \n
    • \n
    \n````````````````````````````````\n\n\n" - "3. **Item starting with a blank line.** If a sequence of lines *Ls*\n starting with a single [blank line] constitute a (possibly empty)\n sequence of blocks *Bs*, and *M* is a list marker of width *W*,\n then the result of prepending *M*" -- " to the first line of *Ls*, and\n preceding subsequent lines of *Ls* by *W + 1* spaces of indentation, is a\n list item with *Bs*" -- " as its contents.\n If a line is empty, then it need not be indented. The type of the\n list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a\n " -- "start number, based on the ordered list marker.\n\nHere are some list items that start with a blank line but are not empty:\n" +- " to the first line of *Ls*, and\n preceding subsequent lines of *Ls* by *W + 1* spaces of indentation, is a\n list item with *Bs* as its contents.\n If a line is empty, then it need not be indented. The type of the\n " +- "list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a\n start number, based on the ordered list marker.\n\n" +- "Here are some list items that start with a blank line but are not empty:\n" - "\n```````````````````````````````` example\n-\n foo\n-\n ```\n bar\n ```\n-\n baz\n.\n
      \n
    • foo
    • \n
    • \n
      bar\n
      \n
    • \n
    • \n
      baz\n
      \n
    • \n
    \n````````````````````````````````" - "\n\nWhen the list item starts with a blank line, the number of spaces\nfollowing the list marker doesn't change the required indentation:\n\n```````````````````````````````` example\n- \n foo\n.\n
      \n
    • foo
    • \n
    \n````````````````````````````````" - "\n\n\nA list item can begin with at most one blank line.\nIn the following example, `foo` is not part of the list\nitem:\n\n```````````````````````````````` example\n-\n\n foo\n.\n
      \n
    • \n
    \n

    foo

    \n````````````````````````````````" @@ -528,22 +524,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\nHere is an empty ordered list item:\n\n```````````````````````````````` example\n1. foo\n2.\n3. bar\n.\n
      \n
    1. foo
    2. \n
    3. \n
    4. bar
    5. \n
    \n````````````````````````````````\n\n\nA list may start or end with an empty list item:\n" - "\n```````````````````````````````` example\n*\n.\n
      \n
    • \n
    \n````````````````````````````````\n\nHowever, an empty list item cannot interrupt a paragraph:\n" - "\n```````````````````````````````` example\nfoo\n*\n\nfoo\n1.\n.\n

    foo\n*

    \n

    foo\n1.

    \n````````````````````````````````\n\n\n" -- "4. **Indentation.** If a sequence of lines *Ls* constitutes a list item\n according to rule #1, #2, or #3, then the result of preceding each line\n of *Ls*" -- " by up to three spaces of indentation (the same for each line) also\n constitutes a list item with the same contents and attributes. If a line is\n empty, then it need not be indented.\n\nIndented one space:\n" -- "\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n" -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nIndented two spaces:\n" -- "\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n" -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nIndented three spaces:\n" -- "\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n" -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nFour spaces indent gives a code block:\n" +- "4. **Indentation.** If a sequence of lines *Ls* constitutes a list item\n according to rule #1, #2, or #3, then the result of preceding each line\n of *Ls* by up to three spaces of indentation (the same for each line) also\n " +- "constitutes a list item with the same contents and attributes. If a line is\n empty, then it need not be indented.\n\nIndented one space:\n" +- "\n```````````````````````````````` example\n" +- " 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n" +- "````````````````````````````````\n\n\nIndented two spaces:\n" +- "\n```````````````````````````````` example\n" +- " 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n" +- "````````````````````````````````\n\n\nIndented three spaces:\n" +- "\n```````````````````````````````` example\n" +- " 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n" +- "````````````````````````````````\n\n\nFour spaces indent gives a code block:\n" - "\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
    1.  A paragraph\n    with two lines.\n\n        indented code\n\n    > A block quote.\n
    \n" - "````````````````````````````````\n\n\n\n" -- "5. **Laziness.** If a string of lines *Ls* constitute a [list\n item](#list-items) with contents *Bs*" -- ", then the result of deleting\n some or all of the indentation from one or more lines in which the\n next character other than a space or tab after the indentation is\n [paragraph continuation text] is a\n " -- "list item with the same contents and attributes. The unindented\n lines are called\n [lazy continuation line](@)s.\n\nHere is an example with [lazy continuation lines]:\n" -- "\n```````````````````````````````` example\n 1. A paragraph\nwith two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n" -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nIndentation can be partially deleted:\n" -- "\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n.\n
      \n
    1. A paragraph\nwith two lines.
    2. \n
    \n````````````````````````````````\n\n\nThese examples show how laziness can work in nested structures:\n" +- "5. **Laziness.** If a string of lines *Ls* constitute a [list\n item](#list-items) with contents *Bs*, then the result of deleting\n some or all of the indentation from one or more lines in which the\n " +- "next character other than a space or tab after the indentation is\n [paragraph continuation text] is a\n list item with the same contents and attributes. The unindented\n lines are called\n [lazy continuation line](@)s.\n\n" +- "Here is an example with [lazy continuation lines]:\n" +- "\n```````````````````````````````` example\n" +- " 1. A paragraph\nwith two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n" +- "````````````````````````````````\n\n\nIndentation can be partially deleted:\n\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n.\n
      \n
    1. A paragraph\nwith two lines.
    2. \n
    \n````````````````````````````````" +- "\n\n\nThese examples show how laziness can work in nested structures:\n" - "\n```````````````````````````````` example\n> 1. > Blockquote\ncontinued here.\n.\n
    \n
      \n
    1. \n
      \n

      Blockquote\ncontinued here.

      \n
      \n
    2. \n
    \n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n> 1. > Blockquote\n> continued here.\n.\n
    \n
      \n
    1. \n
      \n

      Blockquote\ncontinued here.

      \n
      \n
    2. \n
    \n
    \n````````````````````````````````\n\n\n\n" - "6. **That's all.** Nothing that is not counted as a list item by rules\n #1--5 counts as a [list item](#list-items).\n\n" @@ -564,8 +564,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "These rules specify that a paragraph under a list item must be indented\nfour spaces (presumably, from the left margin, rather than the start of\nthe list marker, but this is not said), and that code under a list item\n" - "must be indented eight spaces instead of the usual four. They also say\nthat a block quote must be indented, but not by how much; however, the\nexample given has four spaces indentation. Although nothing is said\n" - "about other kinds of block-level content, it is certainly reasonable to\ninfer that *all* block elements under a list item, including other\nlists, must be indented four spaces. This principle has been called the\n*four-space rule*.\n\n" -- "The four-space rule is clear and principled, and if the reference\nimplementation `Markdown.pl` had followed it, it probably would have\nbecome the standard. However, `Markdown.pl`" -- " allowed paragraphs and\nsublists to start with only two spaces indentation, at least on the\nouter level. Worse, its behavior was inconsistent: a sublist of an\nouter-level list needed two spaces indentation, but a sublist of this\n" +- "The four-space rule is clear and principled, and if the reference\nimplementation `Markdown.pl` had followed it, it probably would have\nbecome the standard. However, `Markdown.pl` allowed paragraphs and\n" +- "sublists to start with only two spaces indentation, at least on the\nouter level. Worse, its behavior was inconsistent: a sublist of an\nouter-level list needed two spaces indentation, but a sublist of this\n" - "sublist needed three spaces. It is not surprising, then, that different\nimplementations of Markdown have developed very different rules for\ndetermining what comes under a list item. 
(Pandoc and python-Markdown,\nfor example, stuck with Gruber'" - "s syntax description and the four-space\nrule, while discount, redcarpet, marked, PHP Markdown, and others\nfollowed `Markdown.pl`'s behavior more closely.)\n\n" - "Unfortunately, given the divergences between implementations, there\nis no way to give a spec for list items that will be guaranteed not\nto break any existing documents. However, the spec given here should\n" @@ -575,24 +575,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This rule is superior, we claim, to any rule requiring a fixed level of\nindentation from the margin. The four-space rule is clear but\nunnatural. It is quite unintuitive that\n\n``` markdown\n- foo\n\n bar\n\n - baz\n```" - "\n\nshould be parsed as two lists with an intervening paragraph,\n\n``` html\n
      \n
    • foo
    • \n
    \n

    bar

    \n
      \n
    • baz
    • \n
    \n```\n\nas the four-space rule demands, rather than a single list,\n" - "\n``` html\n
      \n
    • \n

      foo

      \n

      bar

      \n
        \n
      • baz
      • \n
      \n
    • \n
    \n```\n\nThe choice of four spaces is arbitrary. It can be learned, but it is\nnot likely to be guessed, and it trips up beginners regularly.\n" -- "\nWould it help to adopt a two-space rule? The problem is that such\na rule, together with the rule allowing up to three spaces of indentation for\nthe initial list marker, allows text that is indented *less than*" -- " the\noriginal list marker to be included in the list item. For example,\n`Markdown.pl` parses\n\n``` markdown\n - one\n\n two\n```\n\nas a single list item, with `two` a continuation paragraph:\n\n``` html\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n```" -- "\n\nand similarly\n\n``` markdown\n> - one\n>\n> two\n```\n\nas\n\n``` html\n
    \n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n
    \n```\n\nThis is extremely unintuitive.\n" +- "\nWould it help to adopt a two-space rule? The problem is that such\na rule, together with the rule allowing up to three spaces of indentation for\nthe initial list marker, allows text that is indented *less than* the\n" +- "original list marker to be included in the list item. For example,\n`Markdown.pl` parses\n\n``` markdown\n - one\n\n two\n```\n\nas a single list item, with `two` a continuation paragraph:\n\n``` html\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n```\n\nand similarly\n" +- "\n``` markdown\n> - one\n>\n> two\n```\n\nas\n\n``` html\n
    \n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n
    \n```\n\nThis is extremely unintuitive.\n" - "\nRather than requiring a fixed indent from the margin, we could require\na fixed indent (say, two spaces, or even one space) from the list marker (which\nmay itself be indented). This proposal would remove the last anomaly\n" - "discussed. Unlike the spec presented above, it would count the following\nas a list item with a subparagraph, even though the paragraph `bar`\nis not indented as far as the first paragraph `foo`:\n\n``` markdown\n 10. foo\n\n bar \n```" -- "\n\nArguably this text does read like a list item with `bar`" -- " as a subparagraph,\nwhich may count in favor of the proposal. However, on this proposal indented\ncode would have to be indented six spaces after the list marker. And this\nwould break a lot of existing Markdown, which has the pattern:\n\n" -- "``` markdown\n1. foo\n\n indented code\n```\n\nwhere the code is indented eight spaces. The spec above, by contrast, will\nparse this text as expected, since the code block's indentation is measured\nfrom the beginning of `foo`.\n" -- "\nThe one case that needs special treatment is a list item that *starts*" -- "\nwith indented code. How much indentation is required in that case, since\nwe don't have a \"first paragraph\" to measure from? Rule #2 simply stipulates\nthat in such cases, we require one space indentation from the list marker\n" -- "(and then the normal four spaces for the indented code). This will match the\nfour-space rule in cases where the list marker plus its initial indentation\ntakes four spaces (a common case), but diverge in other cases.\n\n" +- "\n\nArguably this text does read like a list item with `bar` as a subparagraph,\nwhich may count in favor of the proposal. However, on this proposal indented\ncode would have to be indented six spaces after the list marker. And this\n" +- "would break a lot of existing Markdown, which has the pattern:\n\n``` markdown\n1. 
foo\n\n indented code\n```" +- "\n\nwhere the code is indented eight spaces. The spec above, by contrast, will\nparse this text as expected, since the code block's indentation is measured\nfrom the beginning of `foo`.\n" +- "\nThe one case that needs special treatment is a list item that *starts*\nwith indented code. How much indentation is required in that case, since\nwe don't have a \"first paragraph\" to measure from? Rule #2 simply stipulates\n" +- "that in such cases, we require one space indentation from the list marker\n(and then the normal four spaces for the indented code). This will match the\nfour-space rule in cases where the list marker plus its initial indentation\n" +- "takes four spaces (a common case), but diverge in other cases.\n\n" - "## Lists\n\nA [list](@) is a sequence of one or more\nlist items [of the same type]. The list items\nmay be separated by any number of blank lines.\n" - "\nTwo list items are [of the same type](@)\nif they begin with a [list marker] of the same type.\nTwo list markers are of the\nsame type if (a) they are bullet list markers using the same character\n(`-`, `+`, or `*`" - ") or (b) they are ordered list numbers with the same\ndelimiter (either `.` or `)`).\n\n" - "A list is an [ordered list](@)\nif its constituent list items begin with\n[ordered list markers], and a\n[bullet list](@) if its constituent list\nitems begin with [bullet list markers].\n" - "\nThe [start number](@)\nof an [ordered list] is determined by the list number of\nits initial list item. The numbers of subsequent list items are\ndisregarded.\n" -- "\nA list is [loose](@) if any of its constituent\nlist items are separated by blank lines, or if any of its constituent\nlist items directly contain two block-level elements with a blank line\nbetween them. Otherwise a list is [tight](@)" -- ".\n(The difference in HTML output is that paragraphs in a loose list are\nwrapped in `

    ` tags, while paragraphs in a tight list are not.)\n\nChanging the bullet or ordered list delimiter starts a new list:\n" +- "\nA list is [loose](@) if any of its constituent\nlist items are separated by blank lines, or if any of its constituent\nlist items directly contain two block-level elements with a blank line\nbetween them. Otherwise a list is [tight](@).\n" +- "(The difference in HTML output is that paragraphs in a loose list are\nwrapped in `

    ` tags, while paragraphs in a tight list are not.)\n\nChanging the bullet or ordered list delimiter starts a new list:\n" - "\n```````````````````````````````` example\n- foo\n- bar\n+ baz\n.\n

      \n
    • foo
    • \n
    • bar
    • \n
    \n
      \n
    • baz
    • \n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n1. foo\n2. bar\n3) baz\n.\n
      \n
    1. foo
    2. \n
    3. bar
    4. \n
    \n
      \n
    1. baz
    2. \n
    \n````````````````````````````````" - "\n\n\nIn CommonMark, a list can interrupt a paragraph. That is,\nno blank line is needed to separate a paragraph from a following\nlist:\n" @@ -603,8 +603,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n(Indeed, the spec for [list items] and [block quotes] presupposes\nthis principle.) This principle implies that if\n\n``` markdown\n * I need to buy\n - new shoes\n - a coat\n - a plane ticket\n```" - "\n\nis a list item containing a paragraph followed by a nested sublist,\nas all Markdown implementations agree it is (though the paragraph\nmay be rendered without `

    ` tags, since the list is \"tight\"),\nthen\n" - "\n``` markdown\nI need to buy\n- new shoes\n- a coat\n- a plane ticket\n```\n\nby itself should be a paragraph followed by a nested sublist.\n" -- "\nSince it is well established Markdown practice to allow lists to\ninterrupt paragraphs inside list items, the [principle of\nuniformity] requires us to allow this outside list items as\nwell. ([reStructuredText](https://docutils.sourceforge.net/rst.html)" -- "\ntakes a different approach, requiring blank lines before lists\neven inside other list items.)\n\nIn order to solve the problem of unwanted lists in paragraphs with\nhard-wrapped numerals, we allow only lists starting with `1` to\ninterrupt paragraphs. Thus,\n" +- "\nSince it is well established Markdown practice to allow lists to\ninterrupt paragraphs inside list items, the [principle of\nuniformity] requires us to allow this outside list items as\nwell. ([reStructuredText](https://docutils.sourceforge.net/rst.html)\n" +- "takes a different approach, requiring blank lines before lists\neven inside other list items.)\n\nIn order to solve the problem of unwanted lists in paragraphs with\nhard-wrapped numerals, we allow only lists starting with `1` to\ninterrupt paragraphs. Thus,\n" - "\n```````````````````````````````` example\nThe number of windows in my house is\n14. The number of doors is 6.\n.\n

    The number of windows in my house is\n14. The number of doors is 6.

    \n````````````````````````````````" - "\n\nWe may still get an unintended result in cases like\n" - "\n```````````````````````````````` example\nThe number of windows in my house is\n1. The number of doors is 6.\n.\n

    The number of windows in my house is

    \n
      \n
    1. The number of doors is 6.
    2. \n
    \n````````````````````````````````" @@ -642,8 +642,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Code spans\n\nA [backtick string](@)\nis a string of one or more backtick characters (`` ` ``) that is neither\npreceded nor followed by a backtick.\n" - "\nA [code span](@) begins with a backtick string and ends with\na backtick string of equal length. The contents of the code span are\nthe characters between these two backtick strings, normalized in the\nfollowing ways:\n\n" - "- First, [line endings] are converted to [spaces].\n" -- "- If the resulting string both begins *and*" -- " ends with a [space]\n character, but does not consist entirely of [space]\n characters, a single [space] character is removed from the\n front and back. This allows you to include code that begins\n " +- "- If the resulting string both begins *and* ends with a [space]\n character, but does not consist entirely of [space]\n characters, a single [space] character is removed from the\n front and back. This allows you to include code that begins\n " - "or ends with backtick characters, which must be separated by\n whitespace from the opening or closing backtick strings.\n\nThis is a simple code span:\n\n```````````````````````````````` example\n`foo`\n.\n

    foo

    \n````````````````````````````````" - "\n\n\nHere two backticks are used, because the code contains a backtick.\nThis example also illustrates stripping of a single leading and\ntrailing space:\n" - "\n```````````````````````````````` example\n`` foo ` bar ``\n.\n

    foo ` bar

    \n````````````````````````````````\n\n\nThis example shows the motivation for stripping leading and trailing\nspaces:\n" @@ -670,49 +669,43 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n`foo``bar``\n.\n

    `foobar

    \n````````````````````````````````\n\n\n" - "## Emphasis and strong emphasis\n\nJohn Gruber's original [Markdown syntax\ndescription](https://daringfireball.net/projects/markdown/syntax#em) says:\n\n" - "> Markdown treats asterisks (`*`) and underscores (`_`) as indicators of\n> emphasis. Text wrapped with one `*` or `_` will be wrapped with an HTML\n> `` tag; double `*`'s or `_`'s will be wrapped with an HTML ``\n> tag.\n" -- "\nThis is enough for most users, but these rules leave much undecided,\nespecially when it comes to nested emphasis. The original\n`Markdown.pl` test suite makes it clear that triple `***` and\n`___`" -- " delimiters can be used for strong emphasis, and most\nimplementations have also allowed the following patterns:\n\n``` markdown\n***strong emph***\n***strong** in emph*\n***emph* in strong**\n**in strong *emph***\n*in emph **strong***\n```" +- "\nThis is enough for most users, but these rules leave much undecided,\nespecially when it comes to nested emphasis. The original\n`Markdown.pl` test suite makes it clear that triple `***` and\n`___` delimiters can be used for strong emphasis, and most\n" +- "implementations have also allowed the following patterns:\n\n``` markdown\n***strong emph***\n***strong** in emph*\n***emph* in strong**\n**in strong *emph***\n*in emph **strong***\n```" - "\n\nThe following patterns are less widely supported, but the intent\nis clear and they are useful (especially in contexts like bibliography\nentries):\n\n``` markdown\n*emph *with emph* in it*\n**strong **with strong** in it**\n```" - "\n\nMany implementations have also restricted intraword emphasis to\nthe `*` forms, to avoid unwanted emphasis in words containing\ninternal underscores. 
(It is best practice to put these in code\nspans, but users often do not.)\n" - "\n``` markdown\ninternal emphasis: foo*bar*baz\nno emphasis: foo_bar_baz\n```\n\nThe rules given below capture all of these patterns, while allowing\nfor efficient parsing strategies that do not backtrack.\n" -- "\nFirst, some definitions. A [delimiter run](@) is either\na sequence of one or more `*` characters that is not preceded or\nfollowed by a non-backslash-escaped `*` character, or a sequence\nof one or more `_`" -- " characters that is not preceded or followed by\na non-backslash-escaped `_` character.\n\n" -- "A [left-flanking delimiter run](@)" -- " is\na [delimiter run] that is (1) not followed by [Unicode whitespace],\nand either (2a) not followed by a [Unicode punctuation character], or\n(2b) followed by a [Unicode punctuation character] and\npreceded by [Unicode whitespace] or a [" -- "Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\n" -- "A [right-flanking delimiter run](@)" -- " is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [" -- "Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n" -- " - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n - Both left and right-flanking:\n\n ```\n" -- " abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```\n\n" +- "\nFirst, some definitions. 
A [delimiter run](@) is either\na sequence of one or more `*` characters that is not preceded or\nfollowed by a non-backslash-escaped `*` character, or a sequence\nof one or more `_` characters that is not preceded or followed by\n" +- "a non-backslash-escaped `_` character.\n\n" +- "A [left-flanking delimiter run](@) is\na [delimiter run] that is (1) not followed by [Unicode whitespace],\nand either (2a) not followed by a [Unicode punctuation character], or\n(2b) followed by a [Unicode punctuation character] and\npreceded by [" +- "Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\n" +- "A [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [" +- "Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n" +- " - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n" +- " - Both left and right-flanking:\n\n ```\n abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```\n\n" - "(The idea of distinguishing left-flanking and right-flanking\ndelimiter runs based on the character before and the character\nafter comes from Roopesh Chander's\n" -- "[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags)" -- ".\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\" and its rules for distinguishing left- and right-flanking 
runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:\n\n" +- "[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\"" +- " and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:\n\n" - "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n" - "2. " -- "A single `_`" -- " character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character]." -- "\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n" +- "A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [" +- "Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n" - "4. " -- "A single `_`" -- " character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." -- "\n\n5. 
A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n" +- "A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [" +- "Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n" - "6. " -- "A double `__`" -- " [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character]." -- "\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n" +- "A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [" +- "Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n" - "8. " -- "A double `__`" -- " [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." -- "\n\n9. " -- "Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`" -- ") as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. 
If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n " -- "delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are\n multiples of 3." -- "\n\n10. " -- "Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`" -- ") as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n " -- "the delimiter runs containing the opening and closing\n delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3.\n\n" -- "11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n" +- "A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [" +- "Unicode punctuation character].\n\n" +- "9. " +- "Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate" +- "\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are\n " +- "multiples of 3.\n\n" +- "10. " +- "Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. 
The\n " +- "opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing\n " +- "delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3.\n\n11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n" - "12. A literal `_` character cannot occur at the beginning or end of\n `_`-delimited emphasis or `__`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n" - "Where rules 1--12 above are compatible with multiple parsings,\nthe following principles resolve ambiguity:\n\n" - "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n" @@ -783,9 +776,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n*foo *bar**\n.\n

    foo bar

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n*foo **bar** baz*\n.\n

    foo bar baz

    \n````````````````````````````````" - "\n\n```````````````````````````````` example\n*foo**bar**baz*\n.\n

    foobarbaz

    \n````````````````````````````````\n\nNote that in the preceding case, the interpretation\n\n``` markdown\n

    foobarbaz

    \n```" -- "\n\n\nis precluded by the condition that a delimiter that\ncan both open and close (like the `*` after `foo`" -- ")\ncannot form emphasis if the sum of the lengths of\nthe delimiter runs containing the opening and\nclosing delimiters is a multiple of 3 unless\nboth lengths are multiples of 3.\n\n\n" -- "For the same reason, we don't get two consecutive\nemphasis sections in this example:\n\n```````````````````````````````` example\n*foo**bar*\n.\n

    foo**bar

    \n````````````````````````````````" +- "\n\n\nis precluded by the condition that a delimiter that\ncan both open and close (like the `*` after `foo`)\ncannot form emphasis if the sum of the lengths of\nthe delimiter runs containing the opening and\nclosing delimiters is a multiple of 3 unless\n" +- "both lengths are multiples of 3.\n\n\nFor the same reason, we don't get two consecutive\nemphasis sections in this example:\n\n```````````````````````````````` example\n*foo**bar*\n.\n

    foo**bar

    \n````````````````````````````````" - "\n\n\nThe same condition ensures that the following\ncases are all strong emphasis nested inside\nemphasis, even when the interior whitespace is\nomitted:\n" - "\n\n```````````````````````````````` example\n***foo** bar*\n.\n

    foo bar

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n*foo **bar***\n.\n

    foo bar

    \n````````````````````````````````" @@ -853,17 +845,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- The brackets in link text bind more tightly than markers for\n [emphasis and strong emphasis]. Thus, for example, `*[foo*](url)` is a link.\n\nA [link destination](@) consists of either\n\n" - "- a sequence of zero or more characters between an opening `<` and a\n closing `>` that contains no line endings or unescaped\n `<` or `>` characters, or\n\n" - "- " -- "a nonempty sequence of characters that does not start with `<`" -- ",\n does not include [ASCII control characters][ASCII control character]\n or [space] character, and includes parentheses only if (a) they are\n backslash-escaped or (b) they are part of a balanced pair of\n unescaped parentheses.\n " -- "(Implementations may impose limits on parentheses nesting to\n avoid performance issues, but at least three levels of nesting\n should be supported.)\n\nA [link title](@) consists of either\n\n" +- "a nonempty sequence of characters that does not start with `<`,\n does not include [ASCII control characters][ASCII control character]\n or [space] character, and includes parentheses only if (a) they are\n " +- "backslash-escaped or (b) they are part of a balanced pair of\n unescaped parentheses.\n (Implementations may impose limits on parentheses nesting to\n avoid performance issues, but at least three levels of nesting\n should be supported.)\n\n" +- "A [link title](@) consists of either\n\n" - "- a sequence of zero or more characters between straight double-quote\n characters (`\"`), including a `\"` character only if it is\n backslash-escaped, or\n\n" - "- a sequence of zero or more characters between straight single-quote\n characters (`'`), including a `'` character only if it is\n backslash-escaped, or\n\n" - "- a sequence of zero or more characters between matching parentheses\n (`(...)`), including a `(` or `)` character only if it is\n backslash-escaped.\n\n" - "Although [link 
titles] may span multiple lines, they may not contain\na [blank line].\n" -- "\nAn [inline link](@) consists of a [link text] followed immediately\nby a left parenthesis `(`, an optional [link destination], an optional\n[link title], and a right parenthesis `)`" -- ".\nThese four components may be separated by spaces, tabs, and up to one line\nending.\nIf both [link destination] and [link title] are present, they *must* be\nseparated by spaces, tabs, and up to one line ending.\n\n" -- "The link's text consists of the inlines contained\nin the [link text] (excluding the enclosing square brackets).\nThe link's URI consists of the link destination, excluding enclosing\n`<...>`" -- " if present, with backslash-escapes in effect as described\nabove. The link's title consists of the link title, excluding its\nenclosing delimiters, with backslash-escapes in effect as described\nabove.\n\nHere is a simple inline link:\n" +- "\nAn [inline link](@) consists of a [link text] followed immediately\nby a left parenthesis `(`, an optional [link destination], an optional\n[link title], and a right parenthesis `)`.\nThese four components may be separated by spaces, tabs, and up to one line" +- "\nending.\nIf both [link destination] and [link title] are present, they *must* be\nseparated by spaces, tabs, and up to one line ending.\n\n" +- "The link's text consists of the inlines contained\nin the [link text] (excluding the enclosing square brackets).\nThe link's URI consists of the link destination, excluding enclosing\n`<...>` if present, with backslash-escapes in effect as described\n" +- "above. The link's title consists of the link title, excluding its\nenclosing delimiters, with backslash-escapes in effect as described\nabove.\n\nHere is a simple inline link:\n" - "\n```````````````````````````````` example\n[link](/uri \"title\")\n.\n

    link

    \n````````````````````````````````\n\n\nThe title, the link text and even \nthe destination may be omitted:\n" - "\n```````````````````````````````` example\n[link](/uri)\n.\n

    link

    \n````````````````````````````````\n\n```````````````````````````````` example\n[](./target.md)\n.\n

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n[link]()\n.\n

    link

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n[link](<>)\n.\n

    link

    \n````````````````````````````````" @@ -880,9 +872,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n[link](foo\\(and\\(bar\\))\n.\n

    link

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n[link]()\n.\n

    link

    \n````````````````````````````````\n\n\nParentheses and other symbols can also be escaped, as usual\nin Markdown:\n" - "\n```````````````````````````````` example\n[link](foo\\)\\:)\n.\n

    link

    \n````````````````````````````````\n\n\nA link can contain fragment identifiers and queries:\n" -- "\n```````````````````````````````` example\n[link](#fragment)\n\n[link](https://example.com#fragment)\n\n[link](https://example.com?foo=3#frag)\n.\n

    link

    \n

    link

    \n" -- "

    link

    \n````````````````````````````````\n\n\nNote that a backslash before a non-escapable character is\njust a backslash:\n" -- "\n```````````````````````````````` example\n[link](foo\\bar)\n.\n

    link

    \n````````````````````````````````" +- "\n```````````````````````````````` example\n" +- "[link](#fragment)\n\n[link](https://example.com#fragment)\n\n[link](https://example.com?foo=3#frag)\n.\n

    link

    \n

    link

    \n

    link

    \n" +- "````````````````````````````````\n\n\nNote that a backslash before a non-escapable character is\njust a backslash:\n\n```````````````````````````````` example\n[link](foo\\bar)\n.\n

    link

    \n````````````````````````````````" - "\n\n\nURL-escaping should be left alone inside the destination, as all\nURL-escaped characters are also valid URL characters. Entity and\nnumerical character references in the destination will be parsed\n" - "into the corresponding Unicode code points, as usual. These may\nbe optionally URL-escaped when written as HTML, but this spec\ndoes not enforce any particular policy for rendering URLs in\nHTML or other formats. Renderers may make different decisions\n" - "about how to escape or normalize URLs in the output.\n\n```````````````````````````````` example\n[link](foo%20bä)\n.\n

    link

    \n````````````````````````````````" @@ -895,11 +887,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n[link](/url \"title\")\n.\n

    link

    \n````````````````````````````````\n\n\nNested balanced quotes are not allowed without escaping:\n" - "\n```````````````````````````````` example\n[link](/url \"title \"and\" title\")\n.\n

    [link](/url "title "and" title")

    \n````````````````````````````````\n\n\nBut it is easy to work around this by using a different quote type:\n" - "\n```````````````````````````````` example\n[link](/url 'title \"and\" title')\n.\n

    link

    \n````````````````````````````````" -- "\n\n\n(Note: `Markdown.pl`" -- " did allow double quotes inside a double-quoted\ntitle, and its test suite included a test demonstrating this.\nBut it is hard to see a good rationale for the extra complexity this\nbrings, since there are already many ways---backslash escaping,\n" -- "entity and numeric character references, or using a different\nquote type for the enclosing title---to write titles containing\ndouble quotes. `Markdown.pl`" -- "'s handling of titles has a number\nof other strange features. For example, it allows single-quoted\ntitles in inline links, but not reference links. And, in\nreference links but not inline links, it allows a title to begin\nwith `\"` and end with `)`. " -- "`Markdown.pl` 1.0.1 even allows\ntitles with no closing quotation mark, though 1.0.2b8 does not.\nIt seems preferable to adopt a simple, rational rule that works\nthe same way in inline links and link reference definitions.)\n\n" +- "\n\n\n(Note: `Markdown.pl` did allow double quotes inside a double-quoted\ntitle, and its test suite included a test demonstrating this.\nBut it is hard to see a good rationale for the extra complexity this\nbrings, since there are already many ways---" +- "backslash escaping,\nentity and numeric character references, or using a different\nquote type for the enclosing title---to write titles containing\ndouble quotes. `Markdown.pl`'s handling of titles has a number\n" +- "of other strange features. For example, it allows single-quoted\ntitles in inline links, but not reference links. And, in\nreference links but not inline links, it allows a title to begin\nwith `\"` and end with `)`. 
`Markdown.pl` 1.0.1 even allows\n" +- "titles with no closing quotation mark, though 1.0.2b8 does not.\nIt seems preferable to adopt a simple, rational rule that works\nthe same way in inline links and link reference definitions.)\n\n" - "Spaces, tabs, and up to one line ending is allowed around the destination and\ntitle:\n\n```````````````````````````````` example\n[link]( /uri\n \"title\" )\n.\n

    link

    \n````````````````````````````````" - "\n\n\nBut it is not allowed between the link text and the\nfollowing parenthesis:\n\n```````````````````````````````` example\n[link] (/uri)\n.\n

    [link] (/uri)

    \n````````````````````````````````" - "\n\n\nThe link text may contain balanced brackets, but not unbalanced ones,\nunless they are escaped:\n\n```````````````````````````````` example\n[link [foo [bar]]](/uri)\n.\n

    link [foo [bar]]

    \n````````````````````````````````" @@ -917,11 +908,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n[foo\n.\n

    [foohttps://example.com/?search=](uri)

    \n````````````````````````````````" - "\n\n\nThere are three kinds of [reference link](@)s:\n[full](#full-reference-link), [collapsed](#collapsed-reference-link),\nand [shortcut](#shortcut-reference-link).\n" - "\nA [full reference link](@)\nconsists of a [link text] immediately followed by a [link label]\nthat [matches] a [link reference definition] elsewhere in the document.\n" -- "\nA [link label](@) begins with a left bracket (`[`) and ends\nwith the first right bracket (`]`" -- ") that is not backslash-escaped.\nBetween these brackets there must be at least one character that is not a space,\ntab, or line ending.\nUnescaped square bracket characters are not allowed inside the\nopening and closing square brackets of [link labels]" -- ". A link\nlabel can have at most 999 characters inside the square\nbrackets.\n\n" -- "One label [matches](@)\nanother just in case their normalized forms are equal. To normalize a\nlabel, strip off the opening and closing brackets,\nperform the *Unicode case fold*" -- ", strip leading and trailing\nspaces, tabs, and line endings, and collapse consecutive internal\nspaces, tabs, and line endings to a single space. If there are multiple\nmatching reference link definitions, the one that comes first in the\n" +- "\nA [link label](@) begins with a left bracket (`[`) and ends\nwith the first right bracket (`]`) that is not backslash-escaped.\nBetween these brackets there must be at least one character that is not a space,\ntab, or line ending.\n" +- "Unescaped square bracket characters are not allowed inside the\nopening and closing square brackets of [link labels]. A link\nlabel can have at most 999 characters inside the square\nbrackets.\n\n" +- "One label [matches](@)\nanother just in case their normalized forms are equal. 
To normalize a\nlabel, strip off the opening and closing brackets,\nperform the *Unicode case fold*, strip leading and trailing\n" +- "spaces, tabs, and line endings, and collapse consecutive internal\nspaces, tabs, and line endings to a single space. If there are multiple\nmatching reference link definitions, the one that comes first in the\n" - "document is used. (It is desirable in such cases to emit a warning.)\n\nThe link's URI and title are provided by the matching [link\nreference definition].\n\nHere is a simple example:\n" - "\n```````````````````````````````` example\n[foo][bar]\n\n[bar]: /url \"title\"\n.\n

    foo

    \n````````````````````````````````\n\n\nThe rules for the [link text] are the same as with\n[inline links]. Thus:\n" - "\nThe link text may contain balanced brackets, but not unbalanced ones,\nunless they are escaped:\n\n```````````````````````````````` example\n[link [foo [bar]]][ref]\n\n[ref]: /uri\n.\n

    link [foo [bar]]

    \n````````````````````````````````" @@ -944,9 +934,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\nThis is a departure from John Gruber's original Markdown syntax\ndescription, which explicitly allows whitespace between the link\ntext and the link label. It brings reference links in line with\n[inline links]" - ", which (according to both original Markdown and\nthis spec) cannot have whitespace after the link text. More\nimportantly, it prevents inadvertent capture of consecutive\n[shortcut reference links]. If whitespace is allowed between the\n" - "link text and the link label, then in the following we will have\na single reference link, not two shortcut reference links, as\nintended:\n\n``` markdown\n[foo]\n[bar]\n\n[foo]: /url1\n[bar]: /url2\n```" -- "\n\n(Note that [shortcut reference links] were introduced by Gruber\nhimself in a beta version of `Markdown.pl`" -- ", but never included\nin the official syntax description. Without shortcut reference\nlinks, it is harmless to allow space between the link text and\nlink label; but once shortcut references are introduced, it is\n" -- "too dangerous to allow this, as it frequently leads to\nunintended results.)\n\nWhen there are multiple matching [link reference definitions],\nthe first is used:\n" +- "\n\n(Note that [shortcut reference links] were introduced by Gruber\nhimself in a beta version of `Markdown.pl`, but never included\nin the official syntax description. Without shortcut reference\nlinks, it is harmless to allow space between the link text and\n" +- "link label; but once shortcut references are introduced, it is\ntoo dangerous to allow this, as it frequently leads to\nunintended results.)\n\nWhen there are multiple matching [link reference definitions],\nthe first is used:\n" - "\n```````````````````````````````` example\n[foo]: /url1\n\n[foo]: /url2\n\n[bar][foo]\n.\n

    bar

    \n````````````````````````````````" - "\n\n\nNote that matching is performed on normalized strings, not parsed\ninline content. So the following does not match, even though the\nlabels define equivalent inline content:\n" - "\n```````````````````````````````` example\n[bar][foo\\!]\n\n[foo!]: /url\n.\n

    [bar][foo!]

    \n````````````````````````````````\n\n\n[Link labels] cannot contain brackets, unless they are\nbackslash-escaped:\n" @@ -956,15 +945,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n[foo][ref\\[]\n\n[ref\\[]: /uri\n.\n

    foo

    \n````````````````````````````````\n\n\nNote that in this example `]` is not backslash-escaped:\n" - "\n```````````````````````````````` example\n[bar\\\\]: /uri\n\n[bar\\\\]\n.\n

    bar\\

    \n````````````````````````````````\n\n\nA [link label] must contain at least one character that is not a space, tab, or\nline ending:\n" - "\n```````````````````````````````` example\n[]\n\n[]: /uri\n.\n

    []

    \n

    []: /uri

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n[\n ]\n\n[\n ]: /uri\n.\n

    [\n]

    \n

    [\n]: /uri

    \n````````````````````````````````" -- "\n\n\nA [collapsed reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument, followed by the string `[]`" -- ".\nThe contents of the link label are parsed as inlines,\nwhich are used as the link's text. The link's URI and title are\nprovided by the matching reference link definition. Thus,\n`[foo][]` is equivalent to `[foo][foo]`.\n\n" +- "\n\n\nA [collapsed reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument, followed by the string `[]`.\nThe contents of the link label are parsed as inlines,\nwhich are used as the link'" +- "s text. The link's URI and title are\nprovided by the matching reference link definition. Thus,\n`[foo][]` is equivalent to `[foo][foo]`.\n\n" - "```````````````````````````````` example\n[foo][]\n\n[foo]: /url \"title\"\n.\n

    foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n[*foo* bar][]\n\n[*foo* bar]: /url \"title\"\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThe link labels are case-insensitive:\n" - "\n```````````````````````````````` example\n[Foo][]\n\n[foo]: /url \"title\"\n.\n

    Foo

    \n````````````````````````````````" - "\n\n\n\nAs with full reference links, spaces, tabs, or line endings are not\nallowed between the two sets of brackets:\n" - "\n```````````````````````````````` example\n[foo] \n[]\n\n[foo]: /url \"title\"\n.\n

    foo\n[]

    \n````````````````````````````````" -- "\n\n\nA [shortcut reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument and is not followed by `[]`" -- " or a link label.\nThe contents of the link label are parsed as inlines,\nwhich are used as the link's text. The link's URI and title\nare provided by the matching link reference definition.\nThus, `[foo]` is equivalent to `[foo][]`.\n\n" +- "\n\n\nA [shortcut reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument and is not followed by `[]` or a link label.\nThe contents of the link label are parsed as inlines,\n" +- "which are used as the link's text. The link's URI and title\nare provided by the matching link reference definition.\nThus, `[foo]` is equivalent to `[foo][]`.\n\n" - "```````````````````````````````` example\n[foo]\n\n[foo]: /url \"title\"\n.\n

    foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n[*foo* bar]\n\n[*foo* bar]: /url \"title\"\n.\n

    foo bar

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n[[*foo* bar]]\n\n[*foo* bar]: /url \"title\"\n.\n

    [foo bar]

    \n````````````````````````````````" @@ -982,8 +971,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\nHere `[foo]` is not parsed as a shortcut reference, because it\nis followed by a link label (even though `[bar]` is not defined):\n" - "\n```````````````````````````````` example\n[foo][bar][baz]\n\n[baz]: /url1\n[foo]: /url2\n.\n

    [foo]bar

    \n````````````````````````````````\n\n\n\n" - "## Images\n\n" -- "Syntax for images is like the syntax for links, with one\ndifference. Instead of [link text], we have an\n[image description](@). The rules for this are the\nsame as for [link text], except that (a) an\nimage description starts with `![` rather than `[`" -- ", and\n(b) an image description may contain links.\nAn image description has inline elements\nas its contents. When an image is rendered to HTML,\nthis is standardly used as the image's `alt` attribute.\n\n" +- "Syntax for images is like the syntax for links, with one\ndifference. Instead of [link text], we have an\n[image description](@). The rules for this are the\nsame as for [link text], except that (a) an\nimage description starts with `![` rather than `[`, and\n" +- "(b) an image description may contain links.\nAn image description has inline elements\nas its contents. When an image is rendered to HTML,\nthis is standardly used as the image's `alt` attribute.\n\n" - "```````````````````````````````` example\n![foo](/url \"title\")\n.\n

    \"foo\"

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n![foo *bar*]\n\n[foo *bar*]: train.jpg \"train & tracks\"\n.\n

    \"foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n![foo ![bar](/url)](/url2)\n.\n

    \"foo

    \n````````````````````````````````" @@ -1010,8 +999,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\nIf you want a link after a literal `!`, backslash-escape the\n`!`:\n\n```````````````````````````````` example\n\\![foo]\n\n[foo]: /url \"title\"\n.\n

    !foo

    \n````````````````````````````````\n\n\n" - "## Autolinks\n\n[Autolink](@)s are absolute URIs and email addresses inside\n`<` and `>`. They are parsed as links, with the URL or email address\nas the link label.\n" - "\nA [URI autolink](@) consists of `<`, followed by an\n[absolute URI] followed by `>`. It is parsed as\na link to the URI, with the URI as the link's label.\n" -- "\nAn [absolute URI](@),\nfor these purposes, consists of a [scheme] followed by a colon (`:`)\nfollowed by zero or more characters other than [ASCII control\ncharacters][ASCII control character], [space], `<`, and `>`" -- ".\nIf the URI includes these characters, they must be percent-encoded\n(e.g. `%20` for a space).\n\n" +- "\nAn [absolute URI](@),\nfor these purposes, consists of a [scheme] followed by a colon (`:`)\nfollowed by zero or more characters other than [ASCII control\ncharacters][ASCII control character], [space], `<`, and `>`.\n" +- "If the URI includes these characters, they must be percent-encoded\n(e.g. `%20` for a space).\n\n" - "For purposes of this spec, a [scheme](@) is any sequence\nof 2--32 characters beginning with an ASCII letter and followed\nby any combination of ASCII letters, digits, or the symbols plus\n(\"+\"), period (\".\"), or hyphen (\"-\").\n\nHere are some valid autolinks:\n" - "\n```````````````````````````````` example\n\n.\n

    http://foo.bar.baz

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n\n.\n

    https://foo.bar.baz/test?q=hello&id=22&boolean

    \n" @@ -1080,33 +1069,35 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\nfoo \n.\n

    foo

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n### foo\\\n.\n

    foo\\

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n### foo \n.\n

    foo

    \n````````````````````````````````\n\n\n" - "## Soft line breaks\n\n" -- "A regular line ending (not in a code span or HTML tag) that is not\npreceded by two or more spaces or a backslash is parsed as a\n[softbreak](@)" -- ". (A soft line break may be rendered in HTML either as a\n[line ending] or as a space. The result will be the same in\nbrowsers. In the examples here, a [line ending] will be used.)\n\n" -- "```````````````````````````````` example\nfoo\nbaz\n.\n

    foo\nbaz

    \n````````````````````````````````\n\n\nSpaces at the end of the line and beginning of the next line are\nremoved:\n" -- "\n```````````````````````````````` example\nfoo \n baz\n.\n

    foo\nbaz

    \n````````````````````````````````\n\n\nA conforming parser may render a soft line break in HTML either as a\nline ending or as a space.\n" -- "\nA renderer may also provide an option to render soft line breaks\nas hard line breaks.\n\n" +- "A regular line ending (not in a code span or HTML tag) that is not\npreceded by two or more spaces or a backslash is parsed as a\n[softbreak](@). (A soft line break may be rendered in HTML either as a\n[line ending]" +- " or as a space. The result will be the same in\nbrowsers. In the examples here, a [line ending] will be used.)\n\n```````````````````````````````` example\nfoo\nbaz\n.\n

    foo\nbaz

    \n````````````````````````````````" +- "\n\n\nSpaces at the end of the line and beginning of the next line are\nremoved:\n\n```````````````````````````````` example\nfoo \n baz\n.\n

    foo\nbaz

    \n````````````````````````````````" +- "\n\n\nA conforming parser may render a soft line break in HTML either as a\nline ending or as a space.\n\nA renderer may also provide an option to render soft line breaks\nas hard line breaks.\n\n" - "## Textual content\n\nAny characters not given an interpretation by the above rules will\nbe parsed as plain textual content.\n\n```````````````````````````````` example\nhello $.;'there\n.\n

    hello $.;'there

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\nFoo χρῆν\n.\n

    Foo χρῆν

    \n````````````````````````````````\n\n\nInternal spaces are preserved verbatim:\n" - "\n```````````````````````````````` example\nMultiple spaces\n.\n

    Multiple spaces

    \n````````````````````````````````\n\n\n\n\n" - "# Appendix: A parsing strategy\n\nIn this appendix we describe some features of the parsing strategy\nused in the CommonMark reference implementations.\n\n" - "## Overview\n\nParsing has two phases:\n\n" -- "1. In the first phase, lines of input are consumed and the block\nstructure of the document---its division into paragraphs, block quotes,\nlist items, and so on---is constructed. Text is assigned to these\n" +- "1. " +- "In the first phase, lines of input are consumed and the block\nstructure of the document---its division into paragraphs, block quotes,\nlist items, and so on---is constructed. Text is assigned to these\n" - "blocks but not parsed. Link reference definitions are parsed and a\nmap of links is constructed.\n\n" - "2. In the second phase, the raw text contents of paragraphs and headings\nare parsed into sequences of Markdown inline elements (strings,\ncode spans, links, emphasis, and so on), using the map of link\nreferences constructed in phase 1.\n\n" -- "At each point in processing, the document is represented as a tree of\n**blocks**. The root of the tree is a `document` block. The `document`\nmay have any number of other blocks as **children**" -- ". These children\nmay, in turn, have other blocks as children. The last child of a block\nis normally considered **open**, meaning that subsequent lines of input\ncan alter its contents. (Blocks that are not open are **closed**" -- ".)\nHere, for example, is a possible document tree, with the open blocks\nmarked by arrows:\n\n" -- "``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item\n" -- " -> paragraph\n \"aliquando id\"\n```\n\n" +- "At each point in processing, the document is represented as a tree of\n**blocks**. The root of the tree is a `document` block. 
The `document`\nmay have any number of other blocks as **children**. These children\n" +- "may, in turn, have other blocks as children. The last child of a block\nis normally considered **open**, meaning that subsequent lines of input\ncan alter its contents. (Blocks that are not open are **closed**.)\n" +- "Here, for example, is a possible document tree, with the open blocks\nmarked by arrows:\n\n" +- "``` tree\n" +- "-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item\n -> paragraph\n" +- " \"aliquando id\"\n```\n\n" - "## Phase 1: block structure\n\nEach line that is processed has an effect on this tree. The line is\nanalyzed and, depending on its contents, the document may be altered\nin one or more of the following ways:\n\n" - "1. One or more open blocks may be closed.\n2. One or more new blocks may be created as children of the\n last open block.\n3. Text may be added to the last (deepest) open block remaining\n on the tree.\n\n" - "Once a line has been incorporated into the tree in this way,\nit can be discarded, so input can be read in a stream.\n\nFor each line, we follow this procedure:\n\n" -- "1. First we iterate through the open blocks, starting with the\nroot document, and descending through last children down to the last\nopen block. Each block imposes a condition that the line must satisfy\n" -- "if the block is to remain open. For example, a block quote requires a\n`>`" -- " character. A paragraph requires a non-blank line.\nIn this phase we may match all or just some of the open\nblocks. But we cannot close unmatched blocks yet, because we may have a\n[lazy continuation line]." -- "\n\n2. " -- "Next, after consuming the continuation markers for existing\nblocks, we look for new block starts (e.g. 
`>`" -- " for a block quote).\nIf we encounter a new block start, we close any blocks unmatched\nin step 1 before creating the new block as a child of the last\nmatched container block.\n\n" +- "1. " +- "First we iterate through the open blocks, starting with the\nroot document, and descending through last children down to the last\nopen block. Each block imposes a condition that the line must satisfy\n" +- "if the block is to remain open. For example, a block quote requires a\n`>` character. A paragraph requires a non-blank line.\nIn this phase we may match all or just some of the open\nblocks. But we cannot close unmatched blocks yet, because we may have a\n[" +- "lazy continuation line].\n\n" +- "2. " +- "Next, after consuming the continuation markers for existing\nblocks, we look for new block starts (e.g. `>` for a block quote).\nIf we encounter a new block start, we close any blocks unmatched\nin step 1 before creating the new block as a child of the last\n" +- "matched container block.\n\n" - "3. Finally, we look at the remainder of the line (after block\nmarkers like `>`, list markers, and indentation have been consumed).\nThis is text that can be incorporated into the last open\nblock (a paragraph, code block, heading, or raw HTML).\n\n" - "Setext headings are formed when we see a line of a paragraph\nthat is a [setext heading underline].\n" - "\nReference link definitions are detected when a paragraph is closed;\nthe accumulated text lines are parsed to see if they begin with\none or more reference link definitions. 
Any remainder becomes a\nnormal paragraph.\n" @@ -1119,11 +1110,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`paragraph`:\n\n``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n -> list_item\n -> paragraph\n \"Qui *quodsi iracundia*\"\n```" - "\n\nThe fourth line,\n\n``` markdown\n> - aliquando id\n```" - "\n\ncauses the `list_item` (and its child the `paragraph`) to be closed,\nand a new `list_item` opened up as child of the `list`. A `paragraph`\nis added as a child of the new `list_item`, to contain the text.\nWe thus obtain the final tree:\n" -- "\n``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item\n" -- " -> paragraph\n \"aliquando id\"\n```\n\n" +- "\n``` tree\n" +- "-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item\n -> paragraph\n" +- " \"aliquando id\"\n```\n\n" - "## Phase 2: inline structure\n\nOnce all of the input has been parsed, all open blocks are closed.\n" - "\nWe then \"walk the tree,\" visiting every node, and parse raw\nstring contents of paragraphs and headings as inlines. 
At this\npoint we have seen all the link reference definitions, so we can\nresolve reference links as we go.\n" -- "\n``` tree\ndocument\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"\n softbreak\n str \"sit amet.\"\n list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n str \"Qui \"\n emph\n" +- "\n``` tree\n" +- "document\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"\n softbreak\n str \"sit amet.\"\n list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n str \"Qui \"\n emph\n" - " str \"quodsi iracundia\"\n list_item\n paragraph\n str \"aliquando id\"\n```\n\nNotice how the [line ending] in the first paragraph has\nbeen parsed as a `softbreak`, and the asterisks in the first list item\nhave become an `emph`.\n\n" - "### An algorithm for parsing nested emphasis and links\n\nBy far the trickiest part of inline parsing is handling emphasis,\nstrong emphasis, links, and images. This is done using the following\nalgorithm.\n\nWhen we're parsing inlines and we hit either\n\n" - "- a run of `*` or `_` characters, or\n- a `[` or `![`\n\n" @@ -1133,10 +1126,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "When we hit a `]` character, we call the *look for link or image*\nprocedure (see below).\n\nWhen we hit the end of the input, we call the *process emphasis*\nprocedure (see below), with `stack_bottom` = NULL.\n\n" - "#### *look for link or image*\n\nStarting at the top of the delimiter stack, we look backwards\nthrough the stack for an opening `[` or `![` delimiter.\n\n" - "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`.\n\n" -- "- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image.\n" -- "\n + If we don't, then we remove the opening 
delimiter from the\n delimiter stack and return a literal text node `]`.\n\n" -- " + If we do, then\n" -- "\n * We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n" +- "- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image.\n\n " +- "+ If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`.\n\n" +- " + If we do, then\n\n " +- "* We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n" - " * We remove the opening delimiter.\n\n * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)\n\n" - "#### *process emphasis*\n\nParameter `stack_bottom` sets a lower bound to how far we\ndescend in the [delimiter stack]. If it is NULL, we can\ngo all the way to the bottom. 
Otherwise, we stop before\nvisiting `stack_bottom`.\n" - "\nLet `current_position` point to the element on the [delimiter stack]\njust above `stack_bottom` (or the first element if `stack_bottom`\nis NULL).\n" @@ -1146,7 +1139,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n- If one is found:\n\n " - "+ Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n" - " + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n" -- " + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n " +- " + " +- "Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n " - "of the delimiter stack. 
If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:\n\n " - "+ Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n" - " + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\n" diff --git a/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md.snap index ce22630..1472906 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md.snap @@ -19,8 +19,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - org/licenses/by- - "sa/4.0/)'\n...\n\n" - "# Introduction\n\n" -- "## What is " -- "Markdown?\n\n" +- "## " +- What is Markdown +- "?\n\n" - "Markdown is a " - "plain text " - "format for " @@ -50,10 +51,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - markdown/syntax) - "\n" - "and a Perl " -- "script (`" -- "Markdown.pl`) " -- "for converting " -- "Markdown to\n" +- script ( +- "`Markdown.pl`" +- ) for converting +- " Markdown to\n" - "HTML. " - "In the next " - "decade, dozens " @@ -104,11 +105,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - its readability. 
- "\n" - As Gruber writes -- ":\n\n> " +- ":\n\n" +- "> " - "The overriding " - "design goal for " -- "Markdown's " -- "formatting " +- "Markdown'" +- "s formatting " - "syntax is\n> " - "to make it as " - "readable as " @@ -122,16 +124,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "as-is, as\n> " - "plain text, " - "without looking " -- "like it's been " -- "marked up with " -- "tags\n> " +- "like it'" +- s been marked up +- " with tags\n> " - "or formatting " - "instructions.\n> " -- "() -- "\n\n" +- "markdown/>)\n\n" - The point can be - " illustrated by " - "comparing a " @@ -147,7 +149,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "AsciiDoc from " - "the AsciiDoc " - "manual:\n\n" -- "```\n1. " +- "```\n" +- "1. " - "List item one.\n" - "+\n" - "List item one " @@ -191,7 +194,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "And here is the " - "equivalent in " - "Markdown:\n" -- "```\n1. " +- "```\n" +- "1. " - "List item one.\n\n" - " List item " - "one continued " @@ -251,13 +255,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " in the " - "processed " - "document.\n\n" -- "## Why is a spec" -- " needed?\n\n" -- "John Gruber's [" +- "## " +- "Why is a spec " +- "needed?\n\n" +- "John Gruber's " +- "[" - "canonical " - "description of " - "Markdown's\n" -- "syntax](https://" +- syntax +- "](https://" - daringfireball.n - et/projects/ - markdown/syntax) @@ -288,25 +295,27 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " think that\n " - "they, too, must " - be indented four -- " spaces, but `" -- "Markdown.pl` " -- "does\n " +- " spaces, but " +- "`Markdown.pl`" +- " does\n " - not require that - ". " - This is hardly a -- " \"corner case,\" " -- "and divergences\n" -- " between " +- " \"corner case,\"" +- " and divergences" +- "\n " +- "between " - "implementations " - "on this issue " - "often lead to " - "surprises for\n" -- " users in " -- "real documents. 
" -- "(See [this " -- "comment by John\n" -- " Gruber](" -- "https://" +- " " +- "users in real " +- "documents. (See " +- "[" +- "this comment by " +- "John\n Gruber" +- "](https://" - web.archive.org/ - web/ - 20170611172104/ @@ -315,8 +324,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - g/ - gmane.text.markd - own.general/1997 -- ).) -- "\n\n2. " +- ").)\n\n" +- "2. " - "Is a blank line " - "needed before a " - "block quote or " @@ -336,14 +345,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "parsing (note " - "that some " - "implementations\n" -- " put the " -- "heading inside " -- "the blockquote, " +- " " +- "put the heading " +- "inside the " +- "blockquote, " - "while others do " - "not).\n " - (John Gruber has -- " also spoken [in" -- " favor of " +- " also spoken " +- "[" +- "in favor of " - "requiring the " - "blank\n lines" - "](https://" @@ -355,15 +366,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - g/ - gmane.text.markd - own.general/2146 -- ).) -- "\n\n3. " +- ").)\n\n" +- "3. " - "Is a blank line " - needed before an - " indented code " - "block?\n (" -- "`Markdown.pl` " -- "requires it, but" -- " this is not " +- "`Markdown.pl`" +- " requires it, " +- "but this is not " - mentioned in the - "\n " - "documentation, " @@ -374,13 +385,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` markdown\n" - " paragraph\n" - " code?\n" -- " ```\n\n4. " +- " ```\n\n" +- "4. " - "What is the " - "exact rule for " - determining when - " list items get\n" -- " wrapped in `" -- "

    `" +- " wrapped in " +- "`

    `" - " tags? " - "Can a list be " - "partially \"loose" @@ -433,8 +445,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " with a thematic" - " break in its " - "second item,\n" -- " or two lists" -- " separated by a " +- " " +- "or two lists " +- "separated by a " - "thematic break?\n" - "\n" - " ``` markdown" @@ -453,9 +466,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "syntax " - "description " - "suggests two,\n" -- " but the perl" -- " scripts and " -- "many other " +- " " +- "but the perl " +- scripts and many +- " other " - "implementations " - "produce one.)\n\n" - " ``` markdown" @@ -477,8 +491,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "take precedence " - "?\n\n " - "``` markdown\n" -- " [a backtick " -- "(`)](/url) and [" +- " " +- "[a backtick (`)]" +- "(/url) and [" - another backtick - " (`)](/url).\n" - " ```\n\n" @@ -494,8 +509,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "following be " - "parsed?\n\n " - "``` markdown\n" -- " *foo *bar* " -- "baz*\n ```\n\n" +- " " +- "*foo *bar* baz*\n" +- " ```\n\n" - "10. " - "What are the " - precedence rules @@ -508,18 +524,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "following be " - "parsed?\n\n " - "``` markdown\n" -- " - `a long " -- "code span can " -- contain a hyphen -- " like this\n " +- " " +- "- `a long code " +- span can contain +- " a hyphen like " +- "this\n " - " - and it can " - "screw things up`" -- "\n ```\n\n11. " +- "\n ```\n\n" +- "11. " - "Can list items " - "include section " - headings? 
( -- "`Markdown.pl` " -- "does not\n " +- "`Markdown.pl`" +- " does not\n " - "allow this, but " - "does allow " - "blockquotes to " @@ -559,19 +577,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " ``` markdown" - "\n " - "[foo]: /url1\n" -- " [foo]: /url2" -- "\n\n [foo][]\n" +- " " +- "[foo]: /url2\n\n" +- " [foo][]\n" - " ```\n\n" - "In the absence " - "of a spec, early" - " implementers " -- "consulted `" -- "Markdown.pl`\n" +- "consulted " +- "`Markdown.pl`\n" - to resolve these - " ambiguities. " -- "But `Markdown.pl" -- "` was quite " -- "buggy, and\n" +- "But " +- "`Markdown.pl`" +- " was quite buggy" +- ", and\n" - "gave manifestly " - "bad results in " - "many cases, so " @@ -604,13 +624,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "worse, because " - "nothing in " - "Markdown counts\n" -- "as a \"syntax " -- "error,\" the " -- divergence often -- " isn't " -- discovered right -- " away.\n\n" -- "## About this " +- "as a \"" +- "syntax error,\"" +- " the divergence " +- "often isn'" +- "t discovered " +- "right away.\n\n" +- "## " +- "About this " - "document\n\n" - "This document " - "attempts to " @@ -628,9 +649,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "conformance " - "tests. An\n" - "accompanying " -- "script `" -- "spec_tests.py` " -- "can be used to " +- "script " +- "`spec_tests.py`" +- " can be used to " - "run the tests\n" - "against any " - Markdown program @@ -684,9 +705,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - counts as a link - "\n" - "destination, but" -- " it doesn't " -- mandate that non -- "-ASCII " +- " it doesn'" +- "t mandate that " +- "non-ASCII " - "characters in\n" - "the URL be " - percent-encoded. 
@@ -718,20 +739,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "URLs.\n\n" - This document is - " generated from " -- "a text file, `" -- "spec.txt`, " -- "written\n" +- "a text file, " +- "`spec.txt`" +- ", written\n" - in Markdown with - " a small " - "extension for " - the side-by-side - " tests.\n" -- "The script `" -- tools/ -- "makespec.py` can" -- " be used to " -- "convert `" -- "spec.txt` into\n" +- "The script " +- "`tools/" +- "makespec.py`" +- " can be used to " +- "convert " +- "`spec.txt` into\n" - "HTML or " - CommonMark ( - "which can then " @@ -739,18 +760,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "into other " - "formats).\n\n" - "In the examples," -- " the `→` " -- "character is " +- " the `→`" +- " character is " - "used to " - "represent tabs.\n" - "\n" - "# Preliminaries" - "\n\n" -- "## Characters " -- "and lines\n\n" +- "## " +- "Characters and " +- "lines\n\n" - "Any sequence of " -- "[characters] is " -- "a valid " +- "[characters]" +- " is a valid " - "CommonMark\n" - "document.\n\n" - "A [character](@)" @@ -774,31 +796,32 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "encoding; it " - "thinks of lines " - "as composed\nof [" -- "characters] " -- "rather than " +- "characters]" +- " rather than " - "bytes. 
" - "A conforming " - "parser may be " - "limited\n" - "to a certain " - "encoding.\n\n" -- "A [line](@) is a" -- " sequence of " -- "zero or more [" -- "characters]\n" +- "A [line](@)" +- " is a sequence " +- "of zero or more " +- "[characters]\n" - "other than line " -- "feed (`U+000A`) " -- "or carriage " +- "feed (`U+000A`" +- ") or carriage " - "return (`U+000D`" - "),\n" - "followed by a [" -- "line ending] or " -- "by the end of " -- "file.\n\n" -- "A [line ending](" -- "@) is a line " -- "feed (`U+000A`)," -- " a carriage " +- "line ending]" +- " or by the end " +- "of file.\n\n" +- "A " +- "[line ending](@)" +- " is a line feed " +- "(`U+000A`" +- "), a carriage " - "return\n(`U+000D`" - ") not followed " - "by a line feed, " @@ -811,10 +834,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "characters, or a" - " line containing" - " only spaces\n(" -- "`U+0020`) or " -- "tabs (`U+0009`)," -- " is called a [" -- "blank line](@)." +- "`U+0020`" +- ) or tabs ( +- "`U+0009`" +- "), is called a " +- "[blank line](@)." 
- "\n\n" - "The following " - "definitions of " @@ -822,65 +846,72 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "classes will be " - "used in this " - "spec:\n\n" -- "A [Unicode " +- "A " +- "[Unicode " - "whitespace " -- "character](@) is" -- " a character in " -- "the Unicode `Zs`" -- " general\n" +- "character](@)" +- " is a character " +- "in the Unicode " +- "`Zs` general\n" - "category, or a " -- "tab (`U+0009`), " -- "line feed (`U+" -- "000A`), form " -- "feed (`U+000C`)," -- " or\n" +- "tab (`U+0009`" +- "), line feed (" +- "`U+000A`" +- "), form feed (" +- "`U+000C`), or\n" - "carriage return " - "(`U+000D`).\n\n" - "[Unicode " -- "whitespace](@) " -- is a sequence of -- " one or more\n[" +- "whitespace](@)" +- " is a sequence " +- "of one or more\n[" - "Unicode " - "whitespace " - "characters].\n\n" -- "A [tab](@) is `U" -- "+0009`.\n\n" +- "A [tab](@) is " +- "`U+0009`.\n\n" - "A [space](@) is " - "`U+0020`.\n\n" -- "An [ASCII " -- "control " -- "character](@) is" -- " a character " -- "between `U+0000–" -- "1F` (both\n" -- "including) or `U" -- "+007F`.\n\n" -- "An [ASCII " +- "An " +- "[ASCII control " +- "character](@)" +- " is a character " +- "between " +- "`U+0000–1F`" +- " (both\n" +- "including) or " +- "`U+007F`.\n\n" +- "An " +- "[ASCII " - "punctuation " - "character](@)\n" - "is `!`, `\"`, `#`" -- ", `$`, `%`, `&`," -- " `'`, `(`, `)`,\n" -- "`*`, `+`, `,`, `" -- "-`, `.`, `/`" +- ", `$`, `%`, `&`" +- ", `'`, `(`, `)`," +- "\n`*`, `+`, `,`, " +- "`-`, `.`, `/`" - " (U+0021–2F), \n" -- "`:`, `;`, `<`, `" -- "=`, `>`, `?`, " -- "`@` (U+003A–0040" -- "),\n`[`, `\\`, `]`" -- ", `^`, `_`, `` `" -- " `` (U+005B–0060" -- "), \n`{`, `|`, " -- "`}`, or `~` (U+" -- "007B–007E).\n\n" -- "A [Unicode " +- "`:`, `;`, `<`, " +- "`=`, `>`, `?`, " +- "`@`" +- " (U+003A–0040),\n" +- "`[`, `\\`, `]`, " +- "`^`, `_`, " +- "`` ` ``" +- " (U+005B–0060)," +- " \n`{`, `|`, `}`" +- ", or `~`" +- " (U+007B–007E)." 
+- "\n\nA " +- "[Unicode " - "punctuation " -- "character](@) is" -- " a character in " -- "the Unicode `P`" -- "\n" +- "character](@)" +- " is a character " +- "in the Unicode " +- "`P`\n" - "(puncuation) or " -- "`S` (symbol) " +- "`S`" +- " (symbol) " - "general " - "categories.\n\n" - "## Tabs\n\n" @@ -982,18 +1013,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\nNormally the " -- "`>` that begins " -- "a block quote " -- "may be followed\n" +- "`>`" +- " that begins a " +- "block quote may " +- "be followed\n" - "optionally by a " - "space, which is " - "not considered " - "part of the\n" - "content. " - In the following -- " case `>` is " -- "followed by a " -- "tab,\n" +- " case `>`" +- " is followed by " +- "a tab,\n" - which is treated - " as if it were " - "expanded into " @@ -1074,19 +1106,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "## Insecure " +- "## " +- "Insecure " - "characters\n\n" - "For security " - "reasons, the " - "Unicode " -- "character `U+" -- "0000` must be " +- "character " +- "`U+0000`" +- " must be " - "replaced\n" - "with the " - "REPLACEMENT " -- "CHARACTER (`U+" -- "FFFD`).\n\n\n" -- "## Backslash " +- CHARACTER ( +- "`U+FFFD`).\n\n\n" +- "## " +- "Backslash " - "escapes\n\n" - "Any ASCII " - "punctuation " @@ -1189,8 +1224,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "A backslash at " - "the end of the " -- "line is a [hard " -- "line break]:\n\n" +- "line is a [" +- "hard line break]" +- ":\n\n" - "````````````````" - "````````````````" - " example\n" @@ -1265,8 +1301,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "and link titles," - "\n" - "link references," -- " and [info " -- "strings] in [" +- " and [" +- "info strings]" +- " in [" - "fenced code " - "blocks]:\n\n" - "````````````````" @@ -1304,7 +1341,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md 
- "````````````````" - "````````````````" - "\n\n\n" -- "## Entity and " +- "## " +- "Entity and " - "numeric " - "character " - "references\n\n" @@ -1322,13 +1360,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "with the " - "following " - "exceptions:\n\n" -- "- Entity and " +- "- " +- "Entity and " - "character " - "references are " - "not recognized " - "in code\n " - "blocks and code " -- "spans.\n\n- " +- "spans.\n\n" +- "- " - "Entity and " - "character " - "references " @@ -1344,10 +1384,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "although `*`" - " can be used\n " - "in place of a " -- "literal `*` " -- "character, `*" -- ";` cannot " -- "replace\n `*`" +- "literal `*`" +- " character, " +- "`*`" +- " cannot replace" +- "\n `*`" - " in emphasis " - "delimiters, " - "bullet list " @@ -1370,13 +1411,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - entity reference - ".\n\n" - "[Entity " -- "references](@) " -- "consist of `&` +" -- " any of the " +- "references](@)" +- " consist of `&`" +- " + any of the " - "valid\n" - "HTML5 entity " -- "names + `;`" -- ". The\ndocument " +- "names + `;`. The" +- "\ndocument " - " " -- "foo` as its " -- "literal text, " -- "you can\n" +- "heading with " +- "`> foo`" +- " as its literal " +- "text, you can\n" - "use backslash " - "escapes:\n\n" - "````````````````" @@ -2901,8 +2956,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "**Compatibility " -- "note:** Most " -- "existing " +- "note:**" +- " Most existing " - "Markdown " - "implementations\n" - do not allow the @@ -2919,21 +2974,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "One can find " - "four different " - "interpretations:" -- "\n\n" -- "1. paragraph \"" +- "\n\n1. paragraph \"" - "Foo\", heading \"" - "bar\", paragraph " - "\"baz\"\n" - "2. 
paragraph \"" -- "Foo bar\", " -- "thematic break, " -- "paragraph \"baz\"\n" +- "Foo bar\"" +- ", thematic break" +- ", paragraph \"baz" +- "\"\n" - "3. paragraph \"" - "Foo bar --- baz\"" -- "\n" -- "4. heading \"Foo " -- "bar\", paragraph " -- "\"baz\"\n\n" +- "\n4. heading \"" +- "Foo bar\"" +- ", paragraph \"baz" +- "\"\n\n" - "We find " - interpretation 4 - " most natural, " @@ -2984,8 +3039,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that cannot " - "count as a [" - "setext heading\n" -- "underline], such" -- " as\n\n" +- "underline]" +- ", such as\n\n" - "````````````````" - "````````````````" - " example\n" @@ -3010,19 +3065,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "## Indented code" -- " blocks\n\n" -- "An [indented " -- "code block](@) " -- "is composed of " +- "## " +- "Indented code " +- "blocks\n\n" +- "An " +- "[indented code " +- "block](@)" +- " is composed of " - "one or more\n[" - "indented chunks]" - " separated by " - "blank lines.\nAn " - "[indented chunk]" -- "(@) is a " -- sequence of non- -- "blank lines,\n" +- (@) +- " is a sequence " +- "of non-blank " +- "lines,\n" - each preceded by - " four or more " - "spaces of " @@ -3034,8 +3092,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " of the lines, " - "including " - "trailing\n[" -- "line endings], " -- "minus four " +- "line endings]" +- ", minus four " - "spaces of " - "indentation.\n" - An indented code @@ -3082,8 +3140,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "indicating that " - material belongs - " to a [list\nitem" -- "][list items], " -- "the list item " +- "][list items]" +- ", the list item " - "interpretation " - takes precedence - ":\n\n" @@ -3267,8 +3325,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Trailing spaces " - "or tabs are " - "included in the " -- "code block's " -- "content:\n\n" +- "code block'" +- "s content:\n\n" - 
"````````````````" - "````````````````" - " example\n" @@ -3278,17 +3336,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n\n" -- "## Fenced code " +- "## " +- "Fenced code " - "blocks\n\n" -- "A [code fence](@" -- ") is a sequence\n" +- "A " +- "[code fence](@)" +- " is a sequence\n" - "of at least " - "three " - "consecutive " - "backtick " -- "characters (`` `" -- " ``) or\ntildes (" -- "`~`" +- characters ( +- "`` ` ``) or\n" +- "tildes (`~`" - "). " - "(Tildes and " - backticks cannot @@ -3313,10 +3373,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "leading and " - "trailing\n" - "spaces or tabs " -- "and called the [" -- "info string](@)." -- " If the [info " -- "string] comes\n" +- "and called the " +- "[info string](@)" +- ". If the [" +- "info string]" +- " comes\n" - after a backtick - " fence, it may " - "not contain any " @@ -3339,10 +3400,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "consists of all " - subsequent lines - ", until\n" -- "a closing [code " -- "fence] of the " -- same type as the -- " code block\n" +- "a closing [" +- "code fence]" +- " of the same " +- type as the code +- " block\n" - began with ( - "backticks or " - "tildes), and " @@ -3438,16 +3500,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "not parsed\n" - "as inlines. " - "The first word " -- "of the [info " -- "string] is " -- "typically used " -- "to\n" +- "of the [" +- "info string]" +- " is typically " +- "used to\n" - "specify the " - "language of the " - "code sample, and" - " rendered in the" -- " `class`" -- "\n" +- " `class`\n" - attribute of the - " `code`" - " tag. 
" @@ -3555,11 +3616,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "end of the " - "document\n" - "(or the " -- "enclosing [block" -- " quote][block " -- "quotes] or [list" -- " item][list " -- "items]):\n\n" +- "enclosing [" +- "block quote][" +- "block quotes]" +- " or [list item][" +- "list items]):\n\n" - "````````````````" - "````````````````" - " example\n" @@ -3788,14 +3849,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\nAn [" -- "info string] can" -- " be provided " -- "after the " +- "info string]" +- " can be provided" +- " after the " - "opening code " - "fence.\n" - "Although this " -- "spec doesn't " -- "mandate any " +- "spec doesn'" +- "t mandate any " - "particular " - "treatment of\n" - "the info string," @@ -3809,12 +3870,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "normally " - "indicated by " - "adding a class " -- "to the `code` " -- "element " +- "to the `code`" +- " element " - "consisting\nof " -- "`language-` " -- "followed by the " -- "language name.\n\n" +- "`language-`" +- " followed by the" +- " language name.\n" +- "\n" - "````````````````" - "````````````````" - " example\n" @@ -3855,9 +3917,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "code>

    \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "[Info strings] " -- "for backtick " +- "\n\n\n[Info strings" +- "]" +- " for backtick " - "code blocks " - "cannot contain " - "backticks:\n\n" @@ -3870,9 +3932,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "code>\nfoo

    \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "[Info strings] " -- "for tilde code " +- "\n\n\n[Info strings" +- "]" +- " for tilde code " - "blocks can " - "contain " - "backticks and " @@ -3891,8 +3953,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "Closing code " - "fences cannot " -- "have [info " -- "strings]:\n\n" +- "have [" +- "info strings]:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -3905,196 +3967,213 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n\n" - "## HTML blocks\n\n" -- "An [HTML block](" -- "@) is a group of" -- " lines that is " +- "An " +- "[HTML block](@)" +- " is a group of " +- "lines that is " - "treated\n" - as raw HTML (and - " will not be " - "escaped in HTML " - "output).\n\n" - "There are seven " -- "kinds of [HTML " -- "block], which " -- "can be defined " -- "by their\n" +- "kinds of [" +- "HTML block]" +- ", which can be " +- defined by their +- "\n" - "start and end " - "conditions. " - The block begins - " with a line " - "that meets a\n" - "[start condition" -- "](@) (after up " -- "to three " -- "optional spaces " -- of indentation). -- "\n" +- "](@)" +- " (after up to " +- "three optional " +- "spaces of " +- "indentation).\n" - It ends with the - " first " - "subsequent line " - "that meets a " - "matching\n" - "[end condition](" -- "@), or the last " +- "@)" +- ", or the last " - "line of the " - "document, or the" - " last line of\n" -- "the [container " -- "block](#" -- container-blocks -- ) containing the -- " current HTML\n" +- "the " +- "[container block" +- "](#container-" +- blocks) +- " containing the " +- "current HTML\n" - "block, if no " - "line is " - encountered that -- " meets the [end " -- "condition]. If\n" +- " meets the [" +- "end condition]" +- ". 
If\n" - "the first line " - "meets both the [" - "start condition]" - " and the [end\n" -- "condition], the " -- "block will " -- "contain just " -- "that line.\n\n1. " +- "condition]" +- ", the block will" +- " contain just " +- "that line.\n\n" +- "1. " - "**Start " -- "condition:** " -- line begins with -- " the string `<" -- "pre`,\n``, or " -- "the end of the " -- "line.\\\n" +- "string `>`" +- ", or the end of " +- "the line.\\\n" - "**End condition:" -- "** line " -- "contains an end " -- "tag\n`
    `, " -- "``, ``, or `` (case" -- "-insensitive; it" -- "\n" +- "**" +- " line contains " +- "an end tag\n" +- "`
    `, " +- "``, " +- "``, or " +- "``" +- " (case-" +- "insensitive; it\n" - "need not match " - "the start tag).\n" - "\n2. " - "**Start " -- "condition:** " -- line begins with -- " the string ``.\n\n" +- "**" +- " line contains " +- "the string `-->`" +- ".\n\n" - "3. " - "**Start " -- "condition:** " -- line begins with -- " the string ``" -- ".\n\n" -- "4. " +- "**" +- " line contains " +- "the string `?>`." +- "\n\n4. " - "**Start " -- "condition:** " -- line begins with -- " the string ``.\n\n" +- "**" +- " line contains " +- "the character " +- "`>`.\n\n" - "5. " - "**Start " -- "condition:** " -- line begins with -- " the string\n" +- "condition:**" +- " line begins " +- "with the string\n" - "`" -- "`.\n\n" +- "**" +- " line contains " +- "the string `]]>`" +- ".\n\n" - "6. " - "**Start " -- "condition:** " -- line begins with -- " the string `<` " -- "or ``" -- ", or\nthe string " -- "`/>`.\\\n" +- "string `>`, or\n" +- "the string `/>`." +- "\\\n" - "**End condition:" -- "** line is " +- "**" +- " line is " - "followed by a [" -- "blank line]." -- "\n\n7. " +- "blank line].\n\n" +- "7. " - "**Start " -- "condition:** " -- line begins with -- " a complete [" -- "open tag]\n" -- "(with any [tag " -- "name] other than" -- " `pre`, `script`" -- ",\n`style`, or " -- "`textarea`) or a" -- " complete [" -- "closing tag],\n" +- "condition:**" +- " line begins " +- "with a complete " +- "[open tag]\n" +- "(with any [" +- "tag name]" +- " other than " +- "`pre`, `script`," +- "\n`style`, or " +- "`textarea`" +- ") or a complete " +- "[closing tag],\n" - followed by zero - " or more spaces " - "and tabs, " @@ -4102,26 +4181,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - end of the line. - "\\\n" - "**End condition:" -- "** line is " +- "**" +- " line is " - "followed by a [" -- "blank line]." 
-- "\n\n" +- "blank line].\n\n" - "HTML blocks " - "continue until " - "they are closed " - "by their " - "appropriate\n[" -- "end condition], " -- or the last line -- " of the document" -- " or other [" -- "container\nblock" +- "end condition]" +- ", or the last " +- "line of the " +- "document or " +- "other " +- "[container\nblock" - "](#container-" - blocks) - ". " - "This means any " -- HTML **within an -- " HTML\nblock**" +- "HTML " +- "**within an HTML" +- "\nblock**" - " that might " - "otherwise be " - "recognised as a " @@ -4132,14 +4213,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "passed through " - "as-is, without " - "changing\n" -- "the parser's " -- "state.\n\n" -- "For instance, `<" -- "pre>` within an " -- "HTML block " -- "started by `<" -- "table>` will not" -- " affect\n" +- "the parser'" +- "s state.\n\n" +- "For instance, " +- "`
    `"
    +- " within an HTML "
    +- block started by
    +- " ``"
    +- " will not affect"
    +- "\n"
     - the parser state
     - "; as the HTML "
     - "block was "
    @@ -4173,20 +4255,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "the HTML block "
     - is terminated by
     - " the blank line "
    -- "— the `**Hello**"
    -- "`\n"
    +- "— the "
    +- "`**Hello**`\n"
     - "text remains "
     - "verbatim — and "
     - "regular parsing "
     - "resumes, with a "
     - "paragraph,\n"
    -- "emphasised `"
    -- "world` and "
    -- inline and block
    -- " HTML following."
    -- "\n\nAll types of ["
    -- "HTML blocks] "
    -- "except type 7 "
    +- "emphasised "
    +- "`world`"
    +- " and inline and "
    +- "block HTML "
    +- "following.\n\n"
    +- "All types of ["
    +- "HTML blocks]"
    +- " except type 7 "
     - "may interrupt\n"
     - "a paragraph.  "
     - Blocks of type 7
    @@ -4342,9 +4425,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "\n\n\n"
     - "The initial tag "
    -- "doesn't even "
    -- "need to be a "
    -- "valid\n"
    +- "doesn'"
    +- "t even need to "
    +- "be a valid\n"
     - "tag, as long as "
     - "it starts like "
     - "one:\n\n"
    @@ -4421,9 +4504,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "````````````````"
     - "\n\n\nTo start an ["
    -- "HTML block] with"
    -- " a tag that is *"
    -- "not* in the\n"
    +- "HTML block]"
    +- " with a tag that"
    +- " is *not* in the"
    +- "\n"
     - list of block-
     - level tags in (6
     - "), you must put "
    @@ -4482,20 +4566,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - either block-
     - level or inline-
     - "level tags.\nThe "
    -- "`` tag is a"
    -- " nice example.  "
    +- "``"
    +- " tag is a nice "
    +- "example.  "
     - "We can surround "
     - "content with\n"
    -- "`` tags in "
    -- "three different "
    -- "ways.  "
    -- "In this case, we"
    -- " get a raw\n"
    +- "``"
    +- " tags in three "
    +- different ways.
    +- "  In this case, "
    +- "we get a raw\n"
     - "HTML block, "
    -- "because the `<"
    -- "del>` tag is on "
    -- a line by itself
    -- ":\n\n"
    +- "because the "
    +- "``"
    +- " tag is on a "
    +- "line by itself:\n"
    +- "\n"
     - "````````````````"
     - "````````````````"
     - " example\n"
    @@ -4509,9 +4595,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - " get a raw HTML "
     - "block that just "
     - "includes\nthe "
    -- "`` tag ("
    -- "because it ends "
    -- "with the "
    +- "``"
    +- " tag (because it"
    +- " ends with the "
     - "following blank\n"
     - "line).  "
     - "So the contents "
    @@ -4528,12 +4614,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "\n\n\n"
     - "Finally, in this"
    -- " case, the `` tags are "
    +- " case, the "
    +- "``"
    +- " tags are "
     - "interpreted\nas ["
    -- "raw HTML] *"
    -- "inside* the "
    -- "CommonMark "
    +- "raw HTML] "
    +- "*inside*"
    +- " the CommonMark "
     - "paragraph.  "
     - "(Because\n"
     - "the tag is not "
    @@ -4556,9 +4643,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "designed to "
     - "contain literal "
     - "content\n(`pre`, "
    -- "`script`, `style"
    -- "`, `textarea`), "
    -- "comments, "
    +- "`script`, "
    +- "`style`, "
    +- "`textarea`"
    +- "), comments, "
     - "processing "
     - "instructions,\n"
     - and declarations
    @@ -4864,10 +4952,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "\n\n\n"
     - An HTML block of
    -- " types 1--6 can "
    -- "interrupt a "
    -- "paragraph, and "
    -- "need not be\n"
    +- " types 1--"
    +- "6 can interrupt "
    +- "a paragraph, and"
    +- " need not be\n"
     - "preceded by a "
     - "blank line.\n\n"
     - "````````````````"
    @@ -4918,22 +5006,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "\n\n\n"
     - "This rule "
     - "differs from "
    -- "John Gruber's "
    -- "original "
    +- "John Gruber'"
    +- "s original "
     - "Markdown syntax\n"
     - "specification, "
    -- "which says:\n\n> "
    +- "which says:\n\n"
    +- "> "
     - "The only "
     - restrictions are
     - " that block-"
     - "level HTML "
     - "elements —\n> "
    -- "e.g. `
    `, `<" -- "table>`, `
    `"
    -- ", `

    `, etc. — " -- "must be " -- "separated from\n" -- "> surrounding " +- "e.g. `

    `, " +- "`
    `, " +- "`
    `, `

    `" +- ", etc. — must be" +- " separated from\n" +- "> " +- "surrounding " - content by blank - " lines, and the " - "start and end " @@ -4943,21 +5033,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "with spaces or " - "tabs.\n\n" - "In some ways " -- "Gruber's rule is" -- " more " +- "Gruber'" +- "s rule is more " - restrictive than - " the one given\n" - "here:\n\n" -- "- It requires " -- "that an HTML " -- "block be " -- "preceded by a " -- "blank line.\n" -- "- It does not " +- "- " +- It requires that +- " an HTML block " +- be preceded by a +- " blank line.\n" +- "- " +- "It does not " - "allow the start " - "tag to be " - "indented.\n" -- "- It requires a " +- "- " +- "It requires a " - matching end tag - ", which it also " - "does not allow " @@ -4966,16 +5058,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Most Markdown " - "implementations " - "(including some " -- "of Gruber's own)" -- " do not\n" +- "of Gruber'" +- "s own) do not\n" - "respect all of " - "these " - "restrictions.\n\n" - "There is one " - "respect, however" - ", in which " -- "Gruber's rule is" -- " more liberal\n" +- "Gruber'" +- "s rule is more " +- "liberal\n" - "than the one " - "given here, " - "since it allows " @@ -5072,8 +5165,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "documents with " - 100% reliability - ". However,\n" -- "*in most cases* " -- "this will work " +- "*in most cases*" +- " this will work " - "fine, because " - "the blank lines " - "in\n" @@ -5130,54 +5223,59 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "can be\n" - "deleted. " - The exception is -- " inside `

    ` "
    -- "tags, but as "
    +- " inside `
    `"
    +- " tags, but as "
     - "described\n[above"
    -- "][HTML blocks], "
    -- "raw HTML blocks "
    -- "starting with `<"
    -- "pre>`\n*can*"
    +- "][HTML blocks]"
    +- ", raw HTML "
    +- "blocks starting "
    +- "with `
    `\n"
    +- "*can*"
     - " contain blank "
     - "lines.\n\n"
    -- "## Link "
    -- "reference "
    +- "## "
    +- "Link reference "
     - "definitions\n\n"
    -- "A [link "
    -- "reference "
    +- "A "
    +- "[link reference "
     - "definition](@)\n"
     - "consists of a ["
    -- "link label], "
    -- "optionally "
    +- "link label]"
    +- ", optionally "
     - "preceded by up "
     - "to three spaces "
     - "of\n"
     - "indentation, "
     - "followed\n"
    -- "by a colon (`:`)"
    -- ", optional "
    +- "by a colon (`:`"
    +- "), optional "
     - spaces or tabs (
     - "including up to "
     - "one\n[line ending"
    -- "]), a [link "
    -- "destination],\n"
    +- "]), a ["
    +- link destination
    +- "],\n"
     - "optional spaces "
     - or tabs (
     - "including up to "
     - "one\n[line ending"
    -- "]), and an "
    +- "]"
    +- "), and an "
     - "optional [link\n"
    -- "title], which if"
    -- " it is present "
    -- "must be "
    -- "separated\n"
    -- "from the [link "
    -- "destination] by "
    -- "spaces or tabs.\n"
    +- "title]"
    +- ", which if it is"
    +- " present must be"
    +- " separated\n"
    +- "from the ["
    +- link destination
    +- "]"
    +- " by spaces or "
    +- "tabs.\n"
     - "No further "
     - "character may "
     - "occur.\n\n"
    -- "A [link "
    -- "reference "
    +- "A ["
    +- "link reference "
     - "definition]\n"
     - "does not "
     - "correspond to a "
    @@ -5191,15 +5289,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "reference links]"
     - "\n"
     - and reference-
    -- "style [images] "
    -- elsewhere in the
    -- " document.  ["
    +- "style [images]"
    +- " elsewhere in "
    +- "the document.  ["
     - "Link\n"
     - "reference "
    -- "definitions] can"
    -- " come either "
    -- "before or after "
    -- "the links that "
    +- "definitions]"
    +- " can come either"
    +- " before or after"
    +- " the links that "
     - "use\nthem.\n\n"
     - "````````````````"
     - "````````````````"
    @@ -5406,11 +5504,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "\n\n\n"
     - "As noted in the "
     - "section on ["
    -- "Links], matching"
    -- " of labels is\n"
    +- "Links]"
    +- ", matching of "
    +- "labels is\n"
     - case-insensitive
    -- " (see [matches])"
    -- ".\n\n"
    +- " (see [matches]"
    +- ").\n\n"
     - "````````````````"
     - "````````````````"
     - " example\n"
    @@ -5541,12 +5640,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "

    [foo]

    \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "A [link " -- "reference " -- "definition] " -- cannot interrupt -- " a paragraph.\n\n" +- "\n\n\nA [" +- "link reference " +- "definition]" +- " cannot " +- "interrupt a " +- "paragraph.\n\n" - "````````````````" - "````````````````" - " example\n" @@ -5632,8 +5731,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n[" - "Link reference " -- "definitions] can" -- " occur\n" +- "definitions]" +- " can occur\n" - "inside block " - "containers, like" - " lists and block" @@ -5664,17 +5763,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "interpreted as " - "other\n" - "kinds of blocks " -- "forms a [" -- "paragraph](@).\n" +- "forms a " +- "[paragraph](@).\n" - "The contents of " - "the paragraph " - "are the result " - "of parsing the\n" -- "paragraph's raw " -- "content as " -- "inlines. " -- "The paragraph's " -- "raw content\n" +- "paragraph'" +- s raw content as +- " inlines. 
" +- "The paragraph'" +- "s raw content\n" - "is formed by " - "concatenating " - "the lines and " @@ -5789,8 +5888,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that ends with " - "two or more " - "spaces will not " -- "end with a [hard" -- " line\nbreak]:\n\n" +- "end with a [" +- "hard line\nbreak]" +- ":\n\n" - "````````````````" - "````````````````" - " example\n" @@ -5802,8 +5902,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "## Blank lines\n\n" -- "[Blank lines] " -- between block- +- "[Blank lines]" +- " between block-" - "level elements " - "are ignored,\n" - "except for the " @@ -5827,13 +5927,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n\n" -- "# Container " -- "blocks\n\n" -- "A [container " -- "block](#" -- container-blocks -- ") is a block " -- "that has other\n" +- "# " +- Container blocks +- "\n\nA " +- "[container block" +- "](#container-" +- blocks) +- " is a block that" +- " has other\n" - "blocks as its " - "contents. " - "There are two " @@ -5852,7 +5953,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The general\n" - "form of the " - "definition is:\n\n" -- "> If X is a " +- "> " +- "If X is a " - "sequence of " - "blocks, then the" - " result of\n> " @@ -5870,8 +5972,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "list item by " - "explaining\n" - how these can be -- " *generated* " -- "from their " +- " *generated*" +- " from their " - "contents. " - "This should " - "suffice\n" @@ -5879,8 +5981,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "syntax, although" - " it does not " - "give a recipe " -- for *parsing* -- "\n" +- "for *parsing*\n" - "these " - "constructions. 
" - "(A recipe is " @@ -5901,65 +6002,72 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "to three spaces " - "of indentation,\n" - "consists of (a) " -- "the character `>" -- "` together with " -- "a following " -- "space of\n" +- "the character " +- "`>`" +- " together with a" +- " following space" +- " of\n" - "indentation, or " - "(b) a single " -- "character `>` " -- "not followed by " -- "a space of\n" +- "character `>`" +- " not followed by" +- " a space of\n" - "indentation.\n\n" - "The following " - "rules define [" -- "block quotes]:" -- "\n\n1. " +- "block quotes]:\n\n" +- "1. " - "**Basic case.**" - " If a string of" -- " lines *Ls* " -- "constitute a " +- " lines *Ls*" +- " constitute a " - "sequence\n " -- "of blocks *Bs*, " -- "then the result " -- "of prepending a " -- "[block quote" -- "\n marker]" +- of blocks *Bs* +- ", then the " +- "result of " +- "prepending a [" +- "block quote\n " +- "marker]" - " to the " - "beginning of " -- each line in *Ls -- "*\n is a " +- "each line in " +- "*Ls*\n is a " - "[block quote](#" -- "block-quotes) " -- containing *Bs*. -- "\n\n2. " +- block-quotes) +- " containing *Bs*" +- ".\n\n" +- "2. " - "**Laziness.**" - " If a string of" -- " lines *Ls* " -- "constitute a [" -- "block\n quote" +- " lines *Ls*" +- " constitute a " +- "[block\n quote" - "](#block-quotes)" -- " with contents *" -- "Bs*, then the " +- " with contents " +- "*Bs*" +- ", then the " - "result of " - "deleting\n " - "the initial [" - "block quote " -- "marker] from one" -- " or\n " +- "marker]" +- " from one or" +- "\n " - "more lines in " - "which the next " - "character other " - "than a space or " - "tab after the\n" -- " [block quote" -- " marker] is [" +- " [" +- "block quote " +- "marker] is [" - "paragraph " - "continuation\n" -- " text] is a " -- block quote with -- " *Bs* as its " -- "content.\n " +- " text]" +- " is a block " +- quote with *Bs* +- " as its content." 
+- "\n " - "[Paragraph " - "continuation " - "text](@) is text" @@ -5975,16 +6083,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "3. " - "**" - Consecutiveness. -- "** A document " +- "**" +- " A document " - "cannot contain " - "two [block\n " -- "quotes] in a row" -- " unless there is" -- " a [blank line] " -- "between them.\n\n" +- "quotes]" +- " in a row unless" +- " there is a [" +- "blank line]" +- " between them.\n\n" - "Nothing else " -- "counts as a [" -- "block quote](#" +- "counts as a " +- "[block quote](#" - "block-quotes).\n\n" - Here is a simple - " example:\n\n" @@ -6001,8 +6111,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - The space or tab -- " after the `>` " -- "characters can " +- " after the `>`" +- " characters can " - "be omitted:\n\n" - "````````````````" - "````````````````" @@ -6015,9 +6125,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````" -- "\n\n\n" -- "The `>` " -- "characters can " +- "\n\n\nThe `>`" +- " characters can " - "be preceded by " - "up to three " - "spaces of " @@ -6098,7 +6207,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "block quote " - "markers].\n" - "For example, the" -- " `> ` cannot be " +- " `> `" +- " cannot be " - "omitted in the " - "second line of\n\n" - "``` markdown\n" @@ -6118,8 +6228,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "Similarly, if we" -- " omit the `> ` " -- "in the second " +- " omit the `> `" +- " in the second " - "line of\n\n" - "``` markdown\n" - "> - foo\n> - bar\n" @@ -6144,9 +6254,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "For the same " -- "reason, we can't" -- " omit the `> ` " -- "in front of\n" +- "reason, we can'" +- "t omit the `> `" +- " in front of\n" - subsequent lines - " of an indented " - "or fenced code " @@ -6212,8 +6322,9 @@ input_file: 
tests/inputs/markdown/commonmark_spec.md - "cannot\n" - "interrupt " - "paragraphs, so " -- "it is [paragraph" -- " continuation " +- "it is [" +- "paragraph " +- "continuation " - "text].\n\n" - "A block quote " - "can be empty:\n\n" @@ -6270,11 +6381,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "implementations," - " including John " - "Gruber's\n" -- "original `" -- "Markdown.pl`, " -- "will parse this " -- "example as a " -- "single block " +- "original " +- "`Markdown.pl`" +- ", will parse " +- "this example as " +- "a single block " - "quote\n" - "with two " - "paragraphs. " @@ -6395,9 +6506,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the Laziness " - "rule that any " - "number\n" -- "of initial `>`s " -- "may be omitted " -- "on a " +- "of initial `>`" +- s may be omitted +- " on a " - "continuation " - "line of a\n" - "nested block " @@ -6437,12 +6548,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " block in a " - "block quote,\n" - "remember that " -- "the [block quote" -- " marker] " -- "includes\n" -- "both the `>` and" -- " a following " -- "space of " +- "the [" +- "block quote " +- "marker] includes" +- "\nboth the `>`" +- " and a following" +- " space of " - "indentation. 
" - So *five spaces* - " are needed\n" @@ -6463,22 +6574,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n\n" - "## List items\n\n" -- "A [list marker](" -- "@) is a\n[" +- "A " +- "[list marker](@)" +- " is a\n[" - "bullet list " - "marker] or an [" - "ordered list " - "marker].\n\n" -- "A [bullet list " +- "A " +- "[bullet list " - "marker](@)\nis a " - "`-`, `+`, or `*`" - " character.\n\n" -- "An [ordered list" -- " marker](@)\n" +- "An " +- "[ordered list " +- "marker](@)\n" - is a sequence of -- " 1--9 arabic " -- "digits (`0-9`), " -- "followed by " +- " 1--" +- "9 arabic digits " +- "(`0-9`" +- "), followed by " - "either a\n`.`" - " character or a " - "`)`" @@ -6494,39 +6609,43 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ".)\n\n" - "The following " - "rules define [" -- "list items]:" -- "\n\n1. " +- "list items]:\n\n" +- "1. " - "**Basic case.**" - " If a sequence " -- "of lines *Ls* " -- "constitute a " +- of lines *Ls* +- " constitute a " - "sequence of\n " -- "blocks *Bs* " -- "starting with a " -- "character other " -- "than a space or " -- "tab, and *M*" -- " is\n " +- blocks *Bs* +- " starting with a" +- " character other" +- " than a space or" +- " tab, and *M* is" +- "\n " - a list marker of -- " width *W* " -- "followed by 1 ≤ " -- "*N* ≤ 4 spaces " -- "of indentation,\n" -- " then the " -- "result of " -- "prepending *M* " -- "and the " +- " width *W*" +- " followed by 1 ≤" +- " *N*" +- " ≤ 4 spaces of " +- "indentation,\n" +- " " +- "then the result " +- "of prepending " +- "*M*" +- " and the " - following spaces - " to the first " - "line\n of *Ls*" - ", and indenting " - subsequent lines -- " of *Ls* by *W +" -- " N* spaces, is a" +- " of *Ls* by " +- "*W + N*" +- " spaces, is a" - "\n " -- list item with * -- "Bs* as its " -- "contents. " +- "list item with " +- "*Bs*" +- " as its contents" +- ". 
" - "The type of the " - "list item\n " - "(bullet or " @@ -6534,11 +6653,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "determined by " - "the type of its " - "list marker.\n" -- " If the list " -- "item is ordered," -- " then it is also" -- " assigned a " -- "start\n " +- " " +- If the list item +- " is ordered, " +- "then it is also " +- assigned a start +- "\n " - "number, based on" - " the ordered " - "list marker.\n\n" @@ -6552,32 +6672,33 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that is, when it" - " starts on a " - "line that would\n" -- " otherwise" -- " count as [" +- " " +- "otherwise count " +- "as [" - "paragraph " - "continuation " - "text]---then (a)" - "\n " -- "the lines *Ls* " -- "must not begin " +- the lines *Ls* +- " must not begin " - "with a blank " - "line, and (b) if" - "\n " - the list item is - " ordered, the " - "start number " -- "must be 1.\n" -- " 2. " +- "must be 1.\n " +- "2. " - If any line is a - " [thematic break" -- "][thematic " -- "breaks] then" -- "\n " +- "][" +- "thematic breaks]" +- " then\n " - that line is not - " a list item.\n\n" - "For example, let" -- " *Ls* be the " -- "lines\n\n" +- " *Ls*" +- " be the lines\n\n" - "````````````````" - "````````````````" - " example\n" @@ -6601,9 +6722,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\nAnd let *M*" -- " be the marker `" -- "1.`, and *N* = 2" -- ". " +- " be the marker " +- "`1.`, and *N*" +- " = 2. " - "Then rule #1 " - "says\n" - "that the " @@ -6777,8 +6898,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " occurs in the " - "same column as " - "the list marker " -- "`1.`" -- ",\n" +- "`1.`,\n" - "but is actually " - contained in the - " list item, " @@ -6794,14 +6914,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "also possible. 
" - In the following - " example, the " -- "word `two`" -- "\n" +- "word `two`\n" - "occurs far to " - the right of the - " initial text of" - " the list item, " -- "`one`" -- ", but\n" +- "`one`, but\n" - "it is not " - "considered part " - of the list item @@ -6979,36 +7097,39 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n " - "constitute a " - "sequence of " -- "blocks *Bs* " -- starting with an -- " indented code\n" -- " block, and *" -- "M* is a list " +- blocks *Bs* +- " starting with " +- an indented code +- "\n block, and " +- "*M*" +- " is a list " - "marker of width " -- "*W*" -- " followed by" +- "*W* followed by" - "\n " - "one space of " - "indentation, " - "then the result " -- of prepending *M -- "* and the\n " +- "of prepending " +- "*M* and the\n " - "following space " - "to the first " -- "line of *Ls*, " -- "and indenting " +- line of *Ls* +- ", and indenting " - subsequent lines - "\n of *Ls* by " -- "*W + 1* spaces, " -- "is a list item " -- with *Bs* as its -- " contents.\n " +- "*W + 1*" +- " spaces, is a " +- "list item with " +- "*Bs*" +- " as its contents" +- ".\n " - "If a line is " - "empty, then it " - "need not be " - "indented. 
" - "The type of the\n" -- " list item (" +- " " +- list item ( - "bullet or " - "ordered) is " - "determined by " @@ -7066,15 +7187,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\nIf the " -- "*first* block in" -- " the list item " -- "is an indented " -- "code block,\n" +- "*first*" +- " block in the " +- "list item is an " +- "indented code " +- "block,\n" - "then by rule #2," - " the contents " - must be preceded -- " by *one* space " -- "of indentation\n" +- " by *one*" +- " space of " +- "indentation\n" - "after the list " - "marker:\n\n" - "````````````````" @@ -7231,36 +7354,41 @@ input_file: tests/inputs/markdown/commonmark_spec.md - of lines *Ls* - "\n " - "starting with a " -- "single [blank " -- "line] constitute" -- " a (possibly " -- "empty)\n " +- "single [" +- "blank line]" +- " constitute a (" +- "possibly empty)\n" +- " " - "sequence of " -- "blocks *Bs*, and" -- " *M* is a list " +- blocks *Bs* +- ", and *M*" +- " is a list " - "marker of width " -- "*W*" -- ",\n " +- "*W*,\n " - "then the result " -- of prepending *M -- "* to the first " -- "line of *Ls*, " -- "and\n " +- "of prepending " +- "*M*" +- " to the first " +- line of *Ls* +- ", and\n " - "preceding " - subsequent lines -- " of *Ls* by *W +" -- " 1* spaces of " +- " of *Ls* by " +- "*W + 1*" +- " spaces of " - "indentation, is " - "a\n " -- list item with * -- "Bs* as its " -- "contents.\n " +- "list item with " +- "*Bs*" +- " as its contents" +- ".\n " - "If a line is " - "empty, then it " - "need not be " - "indented. 
" - "The type of the\n" -- " list item (" +- " " +- list item ( - "bullet or " - "ordered) is " - "determined by " @@ -7305,8 +7433,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "spaces\n" - "following the " - "list marker " -- "doesn't change " -- "the required " +- "doesn'" +- "t change the " +- "required " - "indentation:\n\n" - "````````````````" - "````````````````" @@ -7323,8 +7452,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "most one blank " - "line.\n" - In the following -- " example, `foo` " -- "is not part of " +- " example, `foo`" +- " is not part of " - "the list\nitem:\n\n" - "````````````````" - "````````````````" @@ -7410,8 +7539,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n4. " - "**Indentation.**" - " If a sequence " -- "of lines *Ls* " -- "constitutes a " +- of lines *Ls* +- " constitutes a " - "list item\n " - "according to " - "rule #1, #2, or " @@ -7430,8 +7559,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "contents and " - "attributes. " - "If a line is\n" -- " empty, then " -- "it need not be " +- " " +- "empty, then it " +- "need not be " - "indented.\n\n" - "Indented one " - "space:\n\n" @@ -7545,12 +7675,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n\n5. " - "**Laziness.**" - " If a string of" -- " lines *Ls* " -- "constitute a [" -- "list\n item" -- "](#list-items) " -- with contents * -- "Bs*, then the " +- " lines *Ls*" +- " constitute a " +- "[list\n item" +- "](#list-items)" +- " with contents " +- "*Bs*" +- ", then the " - "result of " - "deleting\n " - "some or all of " @@ -7563,7 +7694,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "space or tab " - "after the " - "indentation is\n" -- " [paragraph " +- " [" +- "paragraph " - "continuation " - "text] is a\n " - "list item with " @@ -7571,8 +7703,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "contents and " - "attributes. 
" - "The unindented\n" -- " lines are " -- "called\n " +- " " +- lines are called +- "\n " - "[lazy " - "continuation " - "line](@)s.\n\n" @@ -7670,9 +7803,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " not counted as " - "a list item by " - "rules\n #1--" -- "5 counts as a [" -- "list item](#list" -- "-items).\n\n" +- "5 counts as a " +- "[list item](#" +- "list-items).\n\n" - "The rules for " - "sublists follow " - from the general @@ -7805,12 +7938,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "### Motivation\n\n" -- "John Gruber's " -- "Markdown spec " +- "John Gruber'" +- "s Markdown spec " - "says the " - "following about " - "list items:\n\n" -- "1. \"" +- "1. " +- "\"" - "List markers " - "typically start " - "at the left " @@ -7821,29 +7955,34 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "List markers " - must be followed - " by one or more\n" -- " spaces or a " -- "tab.\"\n\n" -- "2. \"" +- " " +- spaces or a tab. +- "\"\n\n" +- "2. " +- "\"" - "To make lists " - "look nice, you " - "can wrap items " - "with hanging " - "indents....\n " -- "But if you don't" -- " want to, you " +- "But if you don'" +- "t want to, you " - "don't have to.\"" -- "\n\n3. \"" +- "\n\n3. " +- "\"" - "List items may " - "consist of " - "multiple " - "paragraphs. " - "Each subsequent\n" -- " paragraph in " -- a list item must -- " be indented by " +- " " +- "paragraph in a " +- "list item must " +- "be indented by " - "either 4 spaces " - "or one\n tab.\"" -- "\n\n4. \"" +- "\n\n4. " +- "\"" - It looks nice if - " you indent " - "every line of " @@ -7852,8 +7991,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "but here again, " - "Markdown will " - "allow you to be " -- "lazy.\"" -- "\n\n5. \"" +- "lazy.\"\n\n" +- "5. 
" +- "\"" - "To put a " - "blockquote " - "within a list " @@ -7862,7 +8002,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n " - "delimiters need " - "to be indented.\"" -- "\n\n6. \"" +- "\n\n6. " +- "\"" - "To put a code " - "block within a " - "list item, the " @@ -7870,8 +8011,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " to be\n " - indented twice — - " 8 spaces or two" -- " tabs.\"" -- "\n\n" +- " tabs.\"\n\n" - "These rules " - "specify that a " - "paragraph under " @@ -7919,23 +8059,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "has been called " - "the\n" - "*four-space rule" -- "*." -- "\n\n" +- "*.\n\n" - "The four-space " - "rule is clear " - "and principled, " - "and if the " - "reference\n" -- "implementation `" -- "Markdown.pl` had" -- " followed it, it" -- " probably would " -- "have\n" +- "implementation " +- "`Markdown.pl`" +- " had followed it" +- ", it probably " +- "would have\n" - "become the " - "standard. " -- "However, `" -- "Markdown.pl` " -- "allowed " +- "However, " +- "`Markdown.pl`" +- " allowed " - "paragraphs and\n" - "sublists to " - "start with only " @@ -7972,7 +8111,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "for example, " - "stuck with " -- "Gruber's syntax " +- "Gruber'" +- "s syntax " - "description and " - "the four-space\n" - "rule, while " @@ -7981,8 +8121,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "marked, PHP " - "Markdown, and " - "others\nfollowed " -- "`Markdown.pl`'s " -- "behavior more " +- "`Markdown.pl`'" +- "s behavior more " - "closely.)\n\n" - "Unfortunately, " - "given the " @@ -8007,9 +8147,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " four-space rule" - " or\n" - "the more " -- "forgiving `" -- "Markdown.pl` " -- "behavior, " +- "forgiving " +- "`Markdown.pl`" +- " behavior, " - "provided they " - "are laid out\n" - in a way that is @@ -8070,7 +8210,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md 
- "with an " - "intervening " - "paragraph,\n\n" -- "``` html\n
      \n" +- "``` html\n" +- "
        \n" - "
      • foo
      • \n" - "
      \n" - "

      bar

      \n
        \n" @@ -8081,9 +8222,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "demands, rather " - "than a single " - "list,\n\n" -- "``` html\n
          \n" -- "
        • \n

          foo

          \n" -- "

          bar

          \n
            \n" +- "``` html\n" +- "
              \n
            • \n" +- "

              foo

              \n" +- "

              bar

              \n
                \n" - "
              • baz
              • \n" - "
              \n
            • \n" - "
            \n```\n\n" @@ -8111,24 +8253,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the initial list - " marker, allows " - "text that is " -- "indented *less " -- "than* the\n" +- "indented " +- "*less than* the\n" - "original list " - "marker to be " - "included in the " - "list item. " - "For example,\n" -- "`Markdown.pl` " -- "parses\n\n" +- "`Markdown.pl`" +- " parses\n\n" - "``` markdown\n" - " - one\n\n two\n" - "```\n\n" - as a single list -- " item, with `two" -- "` a continuation" -- " paragraph:\n\n" -- "``` html\n
              \n" -- "
            • \n

              one

              \n" +- " item, with " +- "`two`" +- " a continuation " +- "paragraph:\n\n" +- "``` html\n" +- "
                \n
              • \n" +- "

                one

                \n" - "

                two

                \n" - "
              • \n
              \n```\n" - "\nand similarly\n" @@ -8172,8 +8316,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "with a " - "subparagraph, " - "even though the " -- "paragraph `bar`" -- "\n" +- "paragraph `bar`\n" - "is not indented " - "as far as the " - "first paragraph " @@ -8184,8 +8327,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Arguably this " - "text does read " - like a list item -- " with `bar` as a" -- " subparagraph,\n" +- " with `bar`" +- " as a " +- "subparagraph,\n" - "which may count " - "in favor of the " - "proposal. " @@ -8214,19 +8358,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "parse this text " - "as expected, " - "since the code " -- "block's " -- "indentation is " -- "measured\n" +- "block'" +- s indentation is +- " measured\n" - "from the " -- "beginning of `" -- "foo`." -- "\n\n" +- "beginning of " +- "`foo`.\n\n" - "The one case " - "that needs " - "special " - "treatment is a " -- list item that * -- "starts*\n" +- "list item that " +- "*starts*\n" - "with indented " - "code. " - "How much " @@ -8235,8 +8378,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " case, since\n" - "we don't have a " - "\"first paragraph" -- "\" to measure " -- "from? " +- "\"" +- " to measure from" +- "? " - "Rule #2 simply " - "stipulates\n" - "that in such " @@ -8263,24 +8407,27 @@ input_file: tests/inputs/markdown/commonmark_spec.md - diverge in other - " cases.\n\n" - "## Lists\n\n" -- "A [list](@) is a" -- " sequence of one" -- " or more\n" -- "list items [of " -- "the same type]." -- " The list items" -- "\n" +- "A [list](@)" +- " is a sequence " +- "of one or more\n" +- "list items [" +- of the same type +- "]" +- ". 
" +- "The list items\n" - may be separated - " by any number " - "of blank lines.\n" - "\n" - "Two list items " -- "are [of the same" -- " type](@)\n" +- "are " +- "[of the same " +- "type](@)\n" - "if they begin " -- "with a [list " -- "marker] of the " -- "same type.\n" +- "with a [" +- "list marker]" +- " of the same " +- "type.\n" - Two list markers - " are of the\n" - same type if (a) @@ -8288,15 +8435,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " list markers " - "using the same " - "character\n(`-`, " -- "`+`, or `*`) or " -- "(b) they are " -- "ordered list " -- numbers with the -- " same\n" +- "`+`, or `*`" +- ") or (b) they " +- are ordered list +- " numbers with " +- "the same\n" - delimiter ( -- "either `.` or `)" -- "`)." -- "\n\nA list is an " +- "either `.` or " +- "`)`).\n\n" +- "A list is an " - "[ordered list](@" - ")\n" - "if its " @@ -8310,24 +8457,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - constituent list - "\n" - items begin with -- " [bullet list " +- " [" +- "bullet list " - "markers].\n\n" -- "The [start " -- "number](@)\n" -- "of an [ordered " -- "list] is " -- "determined by " -- "the list number " -- "of\n" +- "The " +- "[start number](@" +- ")\nof an [" +- "ordered list]" +- " is determined " +- "by the list " +- "number of\n" - its initial list - " item. " - "The numbers of " - "subsequent list " - "items are\n" -- disregarded. -- "\n\nA list is " -- "[loose](@) if " -- "any of its " +- "disregarded.\n\n" +- "A list is " +- "[loose](@)" +- " if any of its " - "constituent\n" - "list items are " - "separated by " @@ -8341,8 +8489,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " blank line\n" - "between them. 
" - Otherwise a list -- " is [tight](@)" -- ".\n" +- " is [tight](@).\n" - "(The difference " - "in HTML output " - "is that " @@ -8423,8 +8570,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The number of " - "doors is 6.\n```\n" - "\nOddly, though, " -- "`Markdown.pl` *" -- "does* allow a " +- "`Markdown.pl` " +- "*does*" +- " allow a " - "blockquote to\n" - "interrupt a " - "paragraph, even " @@ -8452,7 +8600,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```\n\n" - "Second, we are " - "attracted to a\n\n" -- "> [principle of " +- "> " +- "[principle of " - "uniformity](@):" - "\n> " - "if a chunk of " @@ -8468,10 +8617,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "item or " - "blockquote).\n\n" - "(Indeed, the " -- "spec for [list " -- "items] and [" -- "block quotes] " -- "presupposes\n" +- "spec for [" +- "list items] and " +- "[block quotes]" +- " presupposes\n" - this principle.) - " This principle " - "implies that if\n" @@ -8493,10 +8642,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "though the " - "paragraph\n" - "may be rendered " -- "without `

              ` " -- "tags, since the " -- "list is \"tight\")" -- ",\nthen\n\n" +- "without `

              `" +- " tags, since the" +- " list is \"tight\"" +- "),\nthen\n\n" - "``` markdown\n" - "I need to buy\n" - "- new shoes\n" @@ -8518,12 +8667,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "inside list " - "items, the [" - "principle of\n" -- "uniformity] " -- "requires us to " +- "uniformity]" +- " requires us to " - "allow this " - "outside list " - "items as\n" -- "well. ([" +- well. ( +- "[" - reStructuredText - "](https://" - docutils.sourcef @@ -8547,8 +8697,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "hard-wrapped " - "numerals, we " - allow only lists -- " starting with `" -- "1` to\n" +- " starting with " +- "`1` to\n" - "interrupt " - "paragraphs. " - "Thus,\n\n" @@ -8732,9 +8882,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " than\n" - "three spaces of " - "indentation. " -- "Here `- e` is " -- "treated as a " -- "paragraph " +- "Here `- e`" +- " is treated as a" +- " paragraph " - "continuation\n" - "line, because it" - " is indented " @@ -8754,10 +8904,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\nAnd here, " -- "`3. c` is " -- "treated as in " -- "indented code " -- "block,\n" +- "`3. c`" +- " is treated as " +- in indented code +- " block,\n" - "because it is " - "indented four " - "spaces and " @@ -9051,13 +9201,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "literal\n" - "backtick.\n\n\n\n" - "## Code spans\n\n" -- "A [backtick " -- "string](@)\n" +- "A " +- "[backtick string" +- "](@)\n" - "is a string of " - "one or more " - "backtick " -- "characters (`` `" -- " ``) that is " +- characters ( +- "`` ` ``" +- ") that is " - "neither\n" - "preceded nor " - "followed by a " @@ -9079,24 +9231,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "normalized in " - "the\n" - "following ways:" -- "\n\n" -- "- First, [line " -- "endings] are " -- "converted to [" -- "spaces]." 
-- "\n- " +- "\n\n- First, [" +- "line endings]" +- " are converted " +- "to [spaces].\n" +- "- " - If the resulting - " string both " -- "begins *and* " -- "ends with a [" +- begins *and* +- " ends with a [" - "space]\n " - "character, but " - does not consist - " entirely of [" - "space]\n " - "characters, a " -- "single [space] " -- "character is " +- "single [space]" +- " character is " - removed from the - "\n " - front and back. @@ -9163,8 +9314,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n" -- Note that only * -- "one* space is " +- "Note that only " +- "*one*" +- " space is " - "stripped:\n\n" - "````````````````" - "````````````````" @@ -9192,8 +9344,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\nOnly [spaces]" - ", and not [" - "unicode " -- "whitespace] in " -- "general, are\n" +- "whitespace]" +- " in general, are" +- "\n" - stripped in this - " way:\n\n" - "````````````````" @@ -9220,10 +9373,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "/p>\n" - "````````````````" - "````````````````" -- "\n\n\n" -- "[Line endings] " -- are treated like -- " spaces:\n\n" +- "\n\n\n[Line endings" +- "]" +- " are treated " +- "like spaces:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -9264,16 +9417,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "collapse " - "consecutive " - "spaces\n" -- "when rendering `" -- "` elements" -- ", so it is " -- recommended that -- "\n" +- "when rendering " +- "``" +- " elements, so it" +- " is recommended " +- "that\n" - "the following " - "CSS be used:\n\n" -- " code{white-" -- "space: pre-wrap;" -- "}\n\n\n" +- " " +- "code{white-space" +- ": pre-wrap;}\n\n\n" - "Note that " - "backslash " - "escapes do not " @@ -9296,16 +9449,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "never needed, " - "because one can " - "always choose a\n" -- "string of *n* " -- "backtick " +- string of *n* +- " 
backtick " - "characters as " - "delimiters, " - "where the code " - "does\n" - "not contain any " - "strings of " -- "exactly *n* " -- "backtick " +- exactly *n* +- " backtick " - "characters.\n\n" - "````````````````" - "````````````````" @@ -9341,9 +9494,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "not parsed as " - "emphasized text," - " since the " -- "second `*` is " -- "part of a code\n" -- "span:\n\n" +- "second `*`" +- " is part of a " +- "code\nspan:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -9470,35 +9623,38 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "## Emphasis and " +- "## " +- "Emphasis and " - "strong emphasis\n" - "\nJohn Gruber'" -- "s original [" -- "Markdown syntax\n" -- "description](" -- "https://" +- "s original " +- "[Markdown syntax" +- "\ndescription" +- "](https://" - daringfireball.n - et/projects/ - "markdown/syntax#" -- "em) says:" -- "\n\n> " +- "em) says:\n\n" +- "> " - "Markdown treats " -- "asterisks (`*`) " -- "and underscores " -- "(`_`) as " -- "indicators of\n> " +- "asterisks (`*`" +- ") and " +- "underscores (`_`" +- ") as indicators " +- "of\n> " - "emphasis. " - "Text wrapped " - "with one `*` or " -- "`_` will be " -- "wrapped with an " -- "HTML\n> ``" +- "`_`" +- " will be wrapped" +- " with an HTML\n> " +- "``" - " tag; double `*`" -- "'s or `_`'s will" -- " be wrapped with" -- " an HTML `<" -- "strong>`\n> tag." -- "\n\n" +- "'s or `_`'" +- "s will be " +- "wrapped with an " +- "HTML ``" +- "\n> tag.\n\n" - "This is enough " - "for most users, " - "but these rules " @@ -9508,11 +9664,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "it comes to " - nested emphasis. 
- " The original\n" -- "`Markdown.pl` " -- test suite makes -- " it clear that " -- "triple `***` and" -- "\n`___`" +- "`Markdown.pl`" +- " test suite " +- "makes it clear " +- "that triple " +- "`***` and\n`___`" - " delimiters can " - "be used for " - "strong emphasis," @@ -9556,8 +9712,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "restricted " - "intraword " - "emphasis to\nthe " -- "`*` forms, to " -- "avoid unwanted " +- "`*`" +- " forms, to avoid" +- " unwanted " - "emphasis in " - words containing - "\n" @@ -9590,44 +9747,48 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[delimiter run](" - "@) is either\n" - "a sequence of " -- "one or more `*` " -- "characters that " -- "is not preceded " -- "or\n" +- "one or more `*`" +- " characters that" +- " is not preceded" +- " or\n" - "followed by a " - non-backslash- -- "escaped `*` " -- "character, or a " -- "sequence\n" -- "of one or more `" -- "_` characters " -- "that is not " -- "preceded or " -- "followed by\n" +- "escaped `*`" +- " character, or a" +- " sequence\n" +- "of one or more " +- "`_`" +- " characters that" +- " is not preceded" +- " or followed by\n" - a non-backslash- -- "escaped `_` " -- "character.\n\n" -- "A [left-flanking" -- " delimiter run](" -- "@) is\na [" -- "delimiter run] " -- "that is (1) not " -- "followed by [" +- "escaped `_`" +- " character.\n\n" +- "A " +- "[left-flanking " +- "delimiter run](@" +- ") is\na [" +- "delimiter run]" +- " that is (1) not" +- " followed by [" - "Unicode " - "whitespace],\n" - "and either (2a) " - "not followed by " -- "a [Unicode " +- "a [" +- "Unicode " - "punctuation " - "character], or\n" - (2b) followed by -- " a [Unicode " +- " a [" +- "Unicode " - "punctuation " - "character] and\n" - "preceded by [" - "Unicode " -- "whitespace] or a" -- " [Unicode " +- "whitespace]" +- " or a [" +- "Unicode " - "punctuation " - "character].\n" - "For purposes of " @@ -9637,28 +9798,31 @@ input_file: 
tests/inputs/markdown/commonmark_spec.md - "the line count " - "as Unicode " - "whitespace.\n\n" -- "A [right-" -- "flanking " +- "A " +- "[right-flanking " - "delimiter run](@" - ") is\na [" -- "delimiter run] " -- "that is (1) not " -- "preceded by [" +- "delimiter run]" +- " that is (1) not" +- " preceded by [" - "Unicode " - "whitespace],\n" - "and either (2a) " - "not preceded by " -- "a [Unicode " +- "a [" +- "Unicode " - "punctuation " - "character], or\n" - (2b) preceded by -- " a [Unicode " +- " a [" +- "Unicode " - "punctuation " - "character] and\n" - "followed by [" - "Unicode " -- "whitespace] or a" -- " [Unicode " +- "whitespace]" +- " or a [" +- "Unicode " - "punctuation " - "character].\n" - "For purposes of " @@ -9671,38 +9835,36 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Here are some " - "examples of " - "delimiter runs.\n" -- "\n" -- " - left-" -- flanking but not -- " right-flanking:" -- "\n\n ```\n" -- " ***abc\n " -- " _abc\n " +- "\n - " +- "left-flanking " +- but not right- +- "flanking:\n\n " +- "```\n ***abc\n" +- " _abc\n " - "**\"abc\"\n " - " _\"abc\"\n" - " ```\n\n" - " - right-" - flanking but not - " left-flanking:\n" -- "\n ```\n" -- " abc***\n " +- "\n ```\n " +- " abc***\n " - " abc_\n " - "\"abc\"**\n " - "\"abc\"_\n ```\n\n" - " - Both left " - and right- - "flanking:\n\n " -- "```\n" -- " abc***def\n" -- " \"abc\"_\"def\"\n" +- "```\n " +- " abc***def\n " +- "\"abc\"_\"def\"\n" - " ```\n\n" - " - Neither left" - " nor right-" - "flanking:\n\n " -- "```\n" -- " abc *** def\n" -- " a _ b\n" -- " ```\n\n" +- "```\n " +- "abc *** def\n " +- "a _ b\n ```\n\n" - "(The idea of " - "distinguishing " - "left-flanking " @@ -9714,8 +9876,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " and the " - "character\n" - after comes from -- " Roopesh " -- "Chander's\n" +- " Roopesh Chander" +- "'s\n" - "[vfmd](https://" - web.archive.org/ - web/ @@ -9747,22 +9909,25 @@ input_file: 
tests/inputs/markdown/commonmark_spec.md - "emphasis and " - "strong emphasis:" - "\n\n1. " -- "A single `*` " -- "character [can " -- "open emphasis](@" -- ")\n " +- "A single `*`" +- " character " +- "[can open " +- "emphasis](@)\n" +- " " - iff (if and only - " if) it is part " -- "of a [left-" -- "flanking " +- "of a [" +- "left-flanking " - "delimiter run]." - "\n\n2. " -- "A single `_` " -- "character [can " -- "open emphasis] " -- "iff\n " +- "A single `_`" +- " character [" +- "can open " +- "emphasis] iff" +- "\n " - "it is part of a " -- "[left-flanking " +- "[" +- "left-flanking " - "delimiter run]" - "\n " - "and either (a) " @@ -9771,29 +9936,33 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "delimiter run]" - "\n " - or (b) part of a -- " [right-flanking" -- " delimiter run]" +- " [" +- "right-flanking " +- "delimiter run]" - "\n " - "preceded by a [" - "Unicode " - "punctuation " -- "character]." -- "\n\n3. " -- "A single `*` " -- "character [can " -- "close emphasis](" -- "@)\n " +- "character].\n\n" +- "3. " +- "A single `*`" +- " character " +- "[can close " +- "emphasis](@)\n" +- " " - "iff it is part " -- "of a [right-" -- "flanking " +- "of a [" +- "right-flanking " - "delimiter run]." - "\n\n4. " -- "A single `_` " -- "character [can " -- "close emphasis] " -- "iff\n " +- "A single `_`" +- " character [" +- "can close " +- "emphasis] iff" +- "\n " - "it is part of a " -- "[right-flanking " +- "[" +- "right-flanking " - "delimiter run]" - "\n " - "and either (a) " @@ -9802,20 +9971,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "delimiter run]" - "\n " - or (b) part of a -- " [left-flanking " +- " [" +- "left-flanking " - "delimiter run]" - "\n " - "followed by a [" - "Unicode " - "punctuation " -- "character]." -- "\n\n5. " -- "A double `**` [" -- "can open strong " -- "emphasis](@)\n" -- " iff it is " -- "part of a [left-" -- "flanking " +- "character].\n\n" +- "5. 
" +- "A double `**` " +- "[can open strong" +- " emphasis](@)\n" +- " " +- "iff it is part " +- "of a [" +- "left-flanking " - "delimiter run]." - "\n\n6. " - "A double `__` [" @@ -9823,7 +9994,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "emphasis] iff" - "\n " - "it is part of a " -- "[left-flanking " +- "[" +- "left-flanking " - "delimiter run]" - "\n " - "and either (a) " @@ -9832,20 +10004,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "delimiter run]" - "\n " - or (b) part of a -- " [right-flanking" -- " delimiter run]" +- " [" +- "right-flanking " +- "delimiter run]" - "\n " - "preceded by a [" - "Unicode " - "punctuation " -- "character]." -- "\n\n7. " -- "A double `**` [" -- can close strong -- " emphasis](@)\n" -- " iff it is " -- "part of a [right" -- "-flanking " +- "character].\n\n" +- "7. " +- "A double `**` " +- "[can close " +- "strong emphasis]" +- "(@)\n " +- "iff it is part " +- "of a [" +- "right-flanking " - "delimiter run]." - "\n\n8. " - "A double `__` [" @@ -9853,7 +10027,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " emphasis] iff" - "\n " - "it is part of a " -- "[right-flanking " +- "[" +- "right-flanking " - "delimiter run]" - "\n " - "and either (a) " @@ -9862,44 +10037,50 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "delimiter run]" - "\n " - or (b) part of a -- " [left-flanking " +- " [" +- "left-flanking " - "delimiter run]" - "\n " - "followed by a [" - "Unicode " - "punctuation " -- "character]." -- "\n\n9. " +- "character].\n\n" +- "9. " - "Emphasis begins " - with a delimiter -- " that [can open " -- "emphasis] and " -- "ends\n " +- " that [" +- "can open " +- "emphasis]" +- " and ends\n " - with a delimiter -- " that [can close" -- " emphasis], and " -- "that uses the " -- "same\n " -- "character (`_` " -- "or `*`) as the " -- "opening " -- "delimiter. 
The\n" -- " opening and " +- " that [" +- "can close " +- "emphasis]" +- ", and that uses " +- "the same\n " +- "character (`_`" +- " or `*`" +- ) as the opening +- " delimiter. The" +- "\n " +- "opening and " - "closing " - "delimiters must " - "belong to " - "separate\n [" -- "delimiter runs]." -- " If one of the " +- "delimiter runs]" +- ". " +- "If one of the " - "delimiters can " - "both\n " - "open and close " - "emphasis, then " - "the sum of the " - "lengths of the\n" -- " delimiter " -- "runs containing " -- "the opening and " +- " " +- "delimiter runs " +- "containing the " +- "opening and " - "closing " - "delimiters\n " - "must not be a " @@ -9911,25 +10092,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Strong emphasis " - "begins with a " - "delimiter that\n" -- " [can open " -- "strong emphasis]" +- " [" +- "can open strong " +- "emphasis]" - " and ends with a" - " delimiter that\n" -- " [can close " -- "strong emphasis]" +- " [" +- can close strong +- " emphasis]" - ", and that uses " - "the same " - "character\n (" -- "`_` or `*`) as " -- "the opening " -- "delimiter. The\n" -- " opening and " +- "`_` or `*`" +- ) as the opening +- " delimiter. The" +- "\n " +- "opening and " - "closing " - "delimiters must " - "belong to " - "separate\n [" -- "delimiter runs]." -- " If one of the " +- "delimiter runs]" +- ". " +- "If one of the " - "delimiters can " - "both open\n " - and close strong @@ -9946,24 +10131,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "unless both " - "lengths\n " - are multiples of -- " 3.\n\n11. " -- "A literal `*` " -- character cannot -- " occur at the " -- beginning or end -- " of\n `*`" +- " 3.\n\n" +- "11. " +- "A literal `*`" +- " character " +- "cannot occur at " +- the beginning or +- " end of\n `*`" - "-delimited " - "emphasis or `**`" - "-delimited " - "strong emphasis," - " unless it\n " - is backslash- -- "escaped.\n\n12. 
" -- "A literal `_` " -- character cannot -- " occur at the " -- beginning or end -- " of\n `_`" +- "escaped.\n\n" +- "12. " +- "A literal `_`" +- " character " +- "cannot occur at " +- the beginning or +- " end of\n `_`" - "-delimited " - "emphasis or `__`" - "-delimited " @@ -9979,36 +10166,39 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the following " - "principles " - "resolve " -- "ambiguity:\n\n13. " +- "ambiguity:\n\n" +- "13. " - "The number of " - "nestings should " - "be minimized. " - "Thus, for " - "example,\n " - "an " -- "interpretation `" -- "...` is " -- always preferred -- " to\n " +- "interpretation " +- "`...`" +- " is always " +- "preferred to\n" +- " " - "`...`." -- "\n\n14. " +- ">`.\n\n" +- "14. " - "An " -- "interpretation `" -- "...<" -- "/strong>` " -- "is always\n " -- "preferred to `<" -- strong>...`." +- "interpretation " +- "`..." +- "`" +- " is always\n " +- "preferred to " +- "`..." +- "`." - "\n\n15. " - "When two " - "potential " - "emphasis or " - "strong emphasis " - "spans overlap,\n" -- " so that the " +- " " +- "so that the " - "second begins " - before the first - " ends and ends " @@ -10019,13 +10209,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Thus, for " - "example,\n " - "`*foo _bar* baz_" -- "` is parsed as `" -- "foo _bar baz_` rather" +- "` is parsed as " +- "`foo _bar baz_` rather" - "\n than " - "`*foo bar* " -- "baz`." -- "\n\n16. " +- "baz`.\n\n" +- "16. " - "When there are " - "two potential " - "emphasis or " @@ -10042,14 +10232,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Thus, for " - "example,\n " - "`**foo **bar baz" -- "**` is parsed as" -- " `**foo " +- "**`" +- " is parsed as " +- "`**foo " - bar baz - "`\n " -- "rather than `<" -- strong>foo **bar -- " baz`." -- "\n\n17. " +- "rather than " +- "`foo **" +- bar baz +- "`.\n\n" +- "17. 
" - "Inline code " - "spans, links, " - "images, and HTML" @@ -10060,20 +10252,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "is a choice " - "between an " - "interpretation\n" -- " that " -- "contains one of " -- "these elements " -- "and one that " -- "does not, the\n" -- " former " -- "always wins. " +- " " +- "that contains " +- "one of these " +- elements and one +- " that does not, " +- "the\n " +- "former always " +- "wins. " - "Thus, for " -- "example, `*[foo*" -- "](bar)` is\n " -- "parsed as `*foo*<" -- "/a>` rather than" -- " as\n " +- "example, " +- "`*[foo*](bar)`" +- " is\n " +- "parsed as " +- "`*" +- "foo*`" +- " rather than as" +- "\n " - "`[foo](" - "bar)`.\n\n" - "These rules can " @@ -10094,12 +10289,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "opening `*` is " -- "followed by\n" +- "opening `*`" +- " is followed by\n" - "whitespace, and " - "hence not part " -- "of a [left-" -- "flanking " +- "of a [" +- "left-flanking " - "delimiter run]:" - "\n\n" - "````````````````" @@ -10114,8 +10309,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "opening `*` is " -- "preceded\n" +- "opening `*`" +- " is preceded\n" - "by an " - alphanumeric and - " followed by " @@ -10167,9 +10362,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "Intraword " -- "emphasis with `*" -- "` is permitted:" -- "\n\n" +- "emphasis with " +- "`*`" +- " is permitted:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -10201,8 +10396,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "opening `_` is " -- "followed by\n" +- "opening `_`" +- " is followed by\n" - "whitespace:\n\n" - "````````````````" - "````````````````" @@ -10216,8 +10411,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " 
- "because the " -- "opening `_` is " -- "preceded\n" +- "opening `_`" +- " is preceded\n" - "by an " - alphanumeric and - " followed by " @@ -10231,9 +10426,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "Emphasis with `_" -- "` is not allowed" -- " inside words:\n\n" +- "Emphasis with " +- "`_`" +- " is not allowed " +- "inside words:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -10318,8 +10514,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "closing `*` is " -- "preceded by\n" +- "closing `*`" +- " is preceded by\n" - "whitespace:\n\n" - "````````````````" - "````````````````" @@ -10345,8 +10541,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "second `*`" -- " is\n" +- "second `*` is\n" - "preceded by " - "punctuation and " - "followed by an " @@ -10380,8 +10575,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "Intraword " -- "emphasis with `*" -- "` is allowed:\n\n" +- "emphasis with " +- "`*` is allowed:" +- "\n\n" - "````````````````" - "````````````````" - " example\n" @@ -10395,8 +10591,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "closing `_` is " -- "preceded by\n" +- "closing `_`" +- " is preceded by\n" - "whitespace:\n\n" - "````````````````" - "````````````````" @@ -10410,8 +10606,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "second `_`" -- " is\n" +- "second `_` is\n" - "preceded by " - "punctuation and " - "followed by an " @@ -10438,8 +10633,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "Intraword " - "emphasis is " -- "disallowed for `" -- "_`:\n\n" +- "disallowed for " +- "`_`:\n\n" - "````````````````" - "````````````````" - " example\n" @@ 
-10517,8 +10712,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "strong emphasis," - " because the " -- "opening `**` is " -- "preceded\n" +- "opening `**`" +- " is preceded\n" - "by an " - alphanumeric and - " followed by " @@ -10538,9 +10733,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - Intraword strong -- " emphasis with `" -- "**` is permitted" -- ":\n\n" +- " emphasis with " +- "`**`" +- " is permitted:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -10593,8 +10788,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "strong emphasis," - " because the " -- "opening `__` is " -- "preceded\n" +- "opening `__`" +- " is preceded\n" - "by an " - alphanumeric and - " followed by " @@ -10610,8 +10805,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - Intraword strong - " emphasis is " -- "forbidden with `" -- "__`:\n\n" +- "forbidden with " +- "`__`:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -10691,16 +10886,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "(Nor can it be " - "interpreted as " -- "an emphasized `*" -- "foo bar *`, " -- "because of\n" -- Rule 11.) 
-- "\n\n" +- "an emphasized " +- "`*foo bar *`" +- ", because of\n" +- "Rule 11.)\n\n" - "This is not " - "strong emphasis," - " because the " -- "second `**`" -- " is\n" +- "second `**` is\n" - "preceded by " - "punctuation and " - "followed by an " @@ -10794,8 +10987,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "strong emphasis," - " because the " -- "second `__`" -- " is\n" +- "second `__` is\n" - "preceded by " - "punctuation and " - "followed by an " @@ -10827,8 +11019,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - Intraword strong - " emphasis is " -- "forbidden with `" -- "__`:\n\n" +- "forbidden with " +- "`__`:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -10995,8 +11187,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " that\n" - "can both open " - "and close (like " -- "the `*` after `" -- "foo`)\n" +- "the `*` after " +- "`foo`)\n" - "cannot form " - "emphasis if the " - "sum of the " @@ -11012,8 +11204,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " multiples of 3." 
- "\n\n\n" - "For the same " -- "reason, we don't" -- " get two " +- "reason, we don'" +- "t get two " - "consecutive\n" - "emphasis " - sections in this @@ -11076,9 +11268,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " closing and " - "opening\n" - "delimiter runs " -- "are *both* " -- "multiples of 3, " -- "though,\n" +- are *both* +- " multiples of 3," +- " though,\n" - "they can match " - "to create " - "emphasis:\n\n" @@ -11407,9 +11599,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ", Rule 11 " - "determines\n" - "that the excess " -- "literal `*` " -- "characters will " -- "appear outside " +- "literal `*`" +- " characters will" +- " appear outside " - "of the\n" - "emphasis, rather" - " than inside it:" @@ -11537,9 +11729,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ", Rule 12 " - "determines\n" - "that the excess " -- "literal `_` " -- "characters will " -- "appear outside " +- "literal `_`" +- " characters will" +- " appear outside " - "of the\n" - "emphasis, rather" - " than inside it:" @@ -11853,10 +12045,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n\n" - "## Links\n\n" - "A link contains " -- "[link text] (the" -- " visible text), " -- "a [link " -- "destination]\n" +- "[link text]" +- " (the visible " +- "text), a [" +- link destination +- "]\n" - (the URI that is - " the link " - "destination), " @@ -11866,8 +12059,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "basic kinds of " - "links in " - "Markdown. In [" -- "inline links] " -- "the\n" +- "inline links]" +- " the\n" - "destination and " - "title are given " - "immediately " @@ -11878,10 +12071,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " and title are " - "defined " - "elsewhere in\n" -- the document. 
-- "\n\nA " -- "[link text](@) " -- "consists of a " +- "the document.\n\n" +- "A [link text](@)" +- " consists of a " - sequence of zero - " or more\n" - "inline elements " @@ -11891,7 +12083,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "). The\n" - "following rules " - "apply:\n\n" -- "- Links may not " +- "- " +- "Links may not " - "contain other " - "links, at any " - level of nesting @@ -11903,41 +12096,46 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "inside each\n " - "other, the inner" - "-most definition" -- " is used.\n\n- " +- " is used.\n\n" +- "- " - "Brackets are " - "allowed in the [" -- "link text] only " -- "if (a) they\n " +- "link text]" +- " only if (a) " +- "they\n " - are backslash- - "escaped or (b) " - they appear as a - " matched pair of" - " brackets,\n " - "with an open " -- "bracket `[`, a " -- sequence of zero -- " or more inlines" -- ", and\n " +- "bracket `[`" +- ", a sequence of " +- "zero or more " +- "inlines, and\n " - "a close bracket " - "`]`.\n\n" -- "- Backtick [code" -- " spans], [" -- "autolinks], and " -- "raw [HTML tags] " -- "bind more " +- "- " +- "Backtick [" +- "code spans], [" +- "autolinks]" +- ", and raw [" +- "HTML tags]" +- " bind more " - "tightly\n " - "than the " - brackets in link - " text. " - "Thus, for " - "example,\n " -- "`` [foo`]` `` " -- "could not be a " +- "`` [foo`]` ``" +- " could not be a " - "link text, since" - " the second `]`" - "\n " - "is part of a " -- "code span.\n\n- " +- "code span.\n\n" +- "- " - "The brackets in " - "link text bind " - "more tightly " @@ -11947,37 +12145,41 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "strong emphasis]" - ". 
" - "Thus, for " -- "example, `*[foo*" -- "](url)` is a " -- "link.\n\n" -- "A [link " -- "destination](@) " -- "consists of " -- "either\n\n- " +- "example, " +- "`*[foo*](url)`" +- " is a link.\n\n" +- "A " +- "[link " +- "destination](@)" +- " consists of " +- "either\n\n" +- "- " - "a sequence of " - "zero or more " - "characters " - "between an " -- "opening `<` and " -- "a\n closing `>`" +- "opening `<`" +- " and a\n " +- "closing `>`" - " that contains " - "no line endings " - "or unescaped\n " -- "`<` or `>` " -- "characters, or\n\n" -- "- a nonempty " +- "`<` or `>`" +- " characters, or" +- "\n\n- " +- "a nonempty " - "sequence of " - "characters that " - "does not start " -- "with `<`" -- ",\n " +- "with `<`,\n " - does not include -- " [ASCII control " +- " [" +- "ASCII control " - "characters][" - "ASCII control " - "character]\n or " -- "[space] " -- "character, and " +- "[space]" +- " character, and " - "includes " - parentheses only - " if (a) they are" @@ -12002,61 +12204,69 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "nesting\n " - "should be " - "supported.)\n\n" -- "A [link title](@" -- ") consists of " -- "either\n\n- " +- "A " +- "[link title](@)" +- " consists of " +- "either\n\n" +- "- " - "a sequence of " - "zero or more " - "characters " - between straight - " double-quote\n " -- "characters (`\"`)" -- ", including a `\"" -- "` character only" -- " if it is\n " +- "characters (`\"`" +- "), including a " +- "`\"`" +- " character only " +- "if it is\n " - backslash- -- "escaped, or\n\n- " +- "escaped, or\n\n" +- "- " - "a sequence of " - "zero or more " - "characters " - between straight - " single-quote\n " -- "characters (`'`)" -- ", including a `'" -- "` character only" -- " if it is\n " +- "characters (`'`" +- "), including a " +- "`'`" +- " character only " +- "if it is\n " - backslash- -- "escaped, or\n\n- " +- "escaped, or\n\n" +- "- " - "a sequence of " - "zero or more " - "characters " - between matching - " 
parentheses\n (" -- "`(...)`), " -- "including a `(` " -- "or `)` character" -- " only if it is\n" -- " backslash-" +- "`(...)`" +- "), including a " +- "`(` or `)`" +- " character only " +- "if it is\n " +- backslash- - "escaped.\n\n" -- "Although [link " -- "titles] may span" -- " multiple lines," -- " they may not " +- "Although [" +- "link titles]" +- " may span " +- "multiple lines, " +- "they may not " - "contain\na [" -- "blank line]." -- "\n\nAn " +- "blank line].\n\n" +- "An " - "[inline link](@)" - " consists of a [" -- "link text] " -- "followed " +- "link text]" +- " followed " - "immediately\n" - "by a left " -- "parenthesis `(`," -- " an optional [" +- "parenthesis `(`" +- ", an optional [" - link destination - "], an optional\n[" -- "link title], and" -- " a right " +- "link title]" +- ", and a right " - "parenthesis `)`." - "\n" - "These four " @@ -12065,41 +12275,43 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "spaces, tabs, " - "and up to one " - "line\nending.\n" -- "If both [link " -- "destination] and" -- " [link title] " -- "are present, " -- they *must* -- " be\n" +- "If both [" +- link destination +- "] and [" +- "link title]" +- " are present, " +- "they *must* be\n" - "separated by " - "spaces, tabs, " - "and up to one " - "line ending.\n\n" -- "The link's text " -- "consists of the " -- "inlines " +- "The link'" +- "s text consists " +- "of the inlines " - "contained\n" -- "in the [link " -- "text] (excluding" -- " the enclosing " -- square brackets) -- ".\nThe link'" +- "in the [" +- "link text]" +- " (excluding the " +- enclosing square +- " brackets).\n" +- "The link'" - "s URI consists " - "of the link " - "destination, " - "excluding " - "enclosing\n" -- "`<...>` if " -- "present, with " -- backslash- +- "`<...>`" +- " if present, " +- with backslash- - "escapes in " - "effect as " - "described\n" - above. 
The link -- "'s title " -- "consists of the " -- "link title, " -- "excluding its\n" +- "'" +- s title consists +- " of the link " +- "title, excluding" +- " its\n" - "enclosing " - "delimiters, with" - " backslash-" @@ -12227,8 +12439,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n" - "The destination " -- "can contain `)` " -- "if it is " +- "can contain `)`" +- " if it is " - "enclosed\n" - "in pointy " - "brackets:\n\n" @@ -12531,11 +12743,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "spaces, tabs, " - "and up to one " - "line\nending.\n" -- "Other [Unicode " -- "whitespace] like" -- " non-breaking " -- "space doesn't " -- "work.\n\n" +- "Other [" +- "Unicode " +- "whitespace]" +- " like non-" +- "breaking space " +- "doesn't work.\n\n" - "````````````````" - "````````````````" - " example\n" @@ -12584,10 +12797,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n(Note: " -- "`Markdown.pl` " -- did allow double -- " quotes inside a" -- " double-quoted\n" +- "`Markdown.pl`" +- " did allow " +- "double quotes " +- inside a double- +- "quoted\n" - "title, and its " - "test suite " - "included a test " @@ -12601,8 +12815,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "brings, since " - "there are " - "already many " -- ways---backslash -- " escaping,\n" +- ways--- +- "backslash " +- "escaping,\n" - "entity and " - "numeric " - "character " @@ -12611,12 +12826,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "different\n" - "quote type for " - "the enclosing " -- title---to write -- " titles " +- title--- +- "to write titles " - "containing\n" - "double quotes. 
" -- "`Markdown.pl`'s " -- "handling of " +- "`Markdown.pl`'" +- "s handling of " - "titles has a " - "number\n" - of other strange @@ -12633,10 +12848,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "links, it allows" - " a title to " - "begin\nwith `\"`" -- " and end with `)" -- "`. " -- "`Markdown.pl` " -- "1.0.1 even " +- " and end with " +- "`)`. " +- "`Markdown.pl`" +- " 1.0.1 even " - "allows\n" - "titles with no " - "closing " @@ -12838,11 +13053,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "Note that " -- brackets that * -- "aren't* part of " -- "links do not " -- "take\nprecedence:" -- "\n\n" +- "brackets that " +- "*aren't*" +- " part of links " +- "do not take\n" +- "precedence:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -12900,12 +13115,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "There are three " -- "kinds of [" -- "reference link](" -- "@)s:\n" +- "kinds of " +- "[reference link]" +- "(@)s:\n" - "[full](#full-" -- "reference-link)," -- " [collapsed](#" +- reference-link) +- ", " +- "[collapsed](#" - collapsed- - "reference-link)," - "\nand " @@ -12916,22 +13132,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[full reference " - "link](@)\n" - "consists of a [" -- "link text] " -- "immediately " +- "link text]" +- " immediately " - "followed by a [" - "link label]\n" -- "that [matches] a" -- " [link reference" -- " definition] " -- elsewhere in the -- " document.\n\n" -- "A [link label](@" -- ) begins with a -- " left bracket (`" -- "[`) and ends\n" +- "that [matches]" +- " a [" +- "link reference " +- "definition]" +- " elsewhere in " +- "the document.\n\n" +- "A " +- "[link label](@)" +- " begins with a " +- left bracket ( +- "`[`) and ends\n" - "with the first " -- "right bracket (`" -- "]`) that is not " +- right bracket ( +- "`]`" +- ") that is not " - backslash- - "escaped.\n" - "Between these " @@ 
-12958,8 +13177,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "inside the " - "square\nbrackets." - "\n\nOne label " -- "[matches](@)" -- "\n" +- "[matches](@)\n" - "another just in " - "case their " - normalized forms @@ -12969,11 +13187,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " the opening and" - " closing " - "brackets,\n" -- perform the * -- "Unicode case " -- "fold*, strip " -- "leading and " -- "trailing\n" +- "perform the " +- "*Unicode case " +- fold* +- ", strip leading " +- "and trailing\n" - "spaces, tabs, " - and line endings - ", and collapse " @@ -12996,10 +13214,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " in such cases " - "to emit a " - "warning.)\n\n" -- "The link's URI " -- "and title are " -- "provided by the " -- "matching [link\n" +- "The link'" +- "s URI and title " +- "are provided by " +- "the matching [" +- "link\n" - "reference " - "definition].\n\n" - Here is a simple @@ -13017,9 +13236,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "The rules for " -- "the [link text] " -- "are the same as " -- "with\n[" +- "the [link text]" +- " are the same as" +- " with\n[" - "inline links]" - ". 
Thus:\n\n" - "The link text " @@ -13122,10 +13341,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - (In the examples - " above, we have " -- "two [shortcut " +- "two [" +- "shortcut " - "reference links]" - "\ninstead of one " -- "[full reference " +- "[" +- "full reference " - "link].)\n\n" - "The following " - cases illustrate @@ -13255,9 +13476,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " or line endings" - " are allowed " - "between the [" -- "link text] and " -- "the\n[link label]" -- ":\n\n" +- "link text]" +- " and the\n[" +- "link label]:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -13286,8 +13507,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "This is a " - "departure from " -- "John Gruber's " -- "original " +- "John Gruber'" +- "s original " - "Markdown syntax\n" - "description, " - which explicitly @@ -13300,10 +13521,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "It brings " - "reference links " - "in line with\n[" -- "inline links], " -- which (according -- " to both " -- "original " +- "inline links]" +- ", which (" +- "according to " +- "both original " - "Markdown and\n" - "this spec) " - "cannot have " @@ -13335,15 +13556,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[foo]\n[bar]\n\n" - "[foo]: /url1\n" - "[bar]: /url2\n" -- "```\n\n(Note that " -- "[shortcut " +- "```\n\n" +- "(Note that [" +- "shortcut " - "reference links]" - " were introduced" - " by Gruber\n" - "himself in a " - "beta version of " -- "`Markdown.pl`, " -- "but never " +- "`Markdown.pl`" +- ", but never " - "included\n" - "in the official " - "syntax " @@ -13368,8 +13590,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "results.)\n\n" - "When there are " - "multiple " -- "matching [link " -- "reference " +- "matching [" +- "link reference " - "definitions],\n" - "the first is " - "used:\n\n" @@ -13479,8 +13701,9 @@ input_file: 
tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\nA [link label" -- "] must contain " -- "at least one " +- "]" +- " must contain at" +- " least one " - "character that " - "is not a space, " - "tab, or\n" @@ -13508,23 +13731,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "reference link](" - "@)\n" - "consists of a [" -- "link label] that" -- " [matches] a\n[" +- "link label]" +- " that [matches]" +- " a\n[" - "link reference " -- "definition] " -- elsewhere in the -- "\n" +- "definition]" +- " elsewhere in " +- "the\n" - "document, " - "followed by the " -- "string `[]`" -- ".\n" +- "string `[]`.\n" - "The contents of " - "the link label " - "are parsed as " - "inlines,\n" - "which are used " -- "as the link's " -- "text. The link'" +- "as the link'" +- "s text. " +- "The link'" - "s URI and title " - "are\n" - "provided by the " @@ -13599,31 +13823,34 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "reference link](" - "@)\n" - "consists of a [" -- "link label] that" -- " [matches] a\n[" +- "link label]" +- " that [matches]" +- " a\n[" - "link reference " -- "definition] " -- elsewhere in the -- "\n" +- "definition]" +- " elsewhere in " +- "the\n" - "document and is " - "not followed by " -- "`[]` or a link " -- "label.\n" +- "`[]`" +- " or a link label" +- ".\n" - "The contents of " - "the link label " - "are parsed as " - "inlines,\n" - "which are used " -- "as the link's " -- "text. The link'" +- "as the link'" +- "s text. 
" +- "The link'" - "s URI and title\n" - "are provided by " - "the matching " - "link reference " - "definition.\n" -- "Thus, `[foo]` is" -- " equivalent to `" -- "[foo][]`.\n\n" +- "Thus, `[foo]`" +- " is equivalent " +- "to `[foo][]`.\n\n" - "````````````````" - "````````````````" - " example\n" @@ -13796,11 +14023,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n" - In the following -- " case `[bar][baz" -- "]` is parsed as " -- "a reference,\n" -- "`[foo]` as " -- "normal text:\n\n" +- " case " +- "`[bar][baz]`" +- " is parsed as a " +- "reference,\n" +- "`[foo]`" +- " as normal text:" +- "\n\n" - "````````````````" - "````````````````" - " example\n" @@ -13812,9 +14041,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "Here, though, `[" -- "foo][bar]` is " -- "parsed as a " +- "Here, though, " +- "`[foo][bar]`" +- " is parsed as a " - "reference, since" - "\n`[bar]`" - " is defined:\n\n" @@ -13837,9 +14066,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "because it\n" - is followed by a - " link label (" -- "even though `[" -- "bar]` is not " -- "defined):\n\n" +- "even though " +- "`[bar]`" +- " is not defined)" +- ":\n\n" - "````````````````" - "````````````````" - " example\n" @@ -13858,17 +14088,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the syntax for " - "links, with one\n" - "difference. " -- "Instead of [link" -- " text], we have " -- "an\n" +- "Instead of [" +- "link text]" +- ", we have an\n" - "[image " -- "description](@)." -- " The rules for " +- "description](@)" +- ". 
" +- "The rules for " - "this are the\n" - "same as for [" -- "link text], " -- "except that (a) " -- "an\n" +- "link text]" +- ", except that (a" +- ") an\n" - "image " - "description " - "starts with `![`" @@ -13886,8 +14117,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "HTML,\n" - "this is " - "standardly used " -- "as the image's `" -- "alt` attribute." +- "as the image's " +- "`alt` attribute." - "\n\n" - "````````````````" - "````````````````" @@ -13952,14 +14183,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "plain string " - "content\nof the [" - "image " -- "description] be " -- "used. " +- "description]" +- " be used. " - "Note that in\n" - "the above " - "example, the alt" -- " attribute's " -- "value is `foo " -- "bar`, not " +- " attribute'" +- "s value is " +- "`foo bar`, not " - "`foo\n" - "[bar](/url)` or " - "`foo `" @@ -14249,51 +14480,51 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " address\n" - "as the link " - "label.\n\n" -- "A [URI autolink]" -- "(@) consists of " -- "`<`, followed by" -- " an\n[" -- "absolute URI] " -- "followed by `>`." -- " It is parsed " -- "as\n" +- "A " +- "[URI autolink](@" +- ") consists of " +- "`<`" +- ", followed by an" +- "\n[absolute URI]" +- " followed by `>`" +- ". " +- "It is parsed as\n" - "a link to the " - "URI, with the " -- "URI as the " -- "link's label." -- "\n\nAn " +- "URI as the link'" +- "s label.\n\n" +- "An " - "[absolute URI](@" - "),\n" - "for these " - "purposes, " - "consists of a [" -- "scheme] followed" -- " by a colon (`:`" -- ")\n" +- "scheme]" +- " followed by a " +- "colon (`:`)\n" - followed by zero - " or more " - characters other -- " than [ASCII " -- "control\n" +- " than [" +- "ASCII control\n" - "characters][" - "ASCII control " - "character], [" -- "space], `<`, and" -- " `>`" -- ".\n" +- "space], `<`" +- ", and `>`.\n" - "If the URI " - "includes these " - "characters, they" - " must be percent" - "-encoded\n(e.g. 
" -- "`%20` for a " -- "space).\n\n" +- "`%20`" +- " for a space).\n\n" - "For purposes of " -- "this spec, a [" -- "scheme](@) is " -- "any sequence\n" -- "of 2--32 " -- "characters " +- "this spec, a " +- "[scheme](@)" +- " is any sequence" +- "\nof 2--" +- "32 characters " - "beginning with " - "an ASCII letter " - "and followed\n" @@ -14302,9 +14533,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "ASCII letters, " - "digits, or the " - "symbols plus\n(\"+" -- "\"), period (\".\")" -- ", or hyphen (\"-\"" -- ").\n\n" +- "\"), period (\".\"" +- "), or hyphen (\"-" +- "\").\n\n" - "Here are some " - "valid autolinks:" - "\n\n" @@ -14375,8 +14606,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Note that many " - "strings that " - "count as [" -- "absolute URIs] " -- "for\n" +- "absolute URIs]" +- " for\n" - purposes of this - " spec are not " - "valid URIs, " @@ -14469,24 +14700,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\nAn " - "[email autolink]" - "(@)\nconsists of " -- "`<`, followed by" -- " an [email " -- "address],\n" -- "followed by `>`." -- " The link's " -- "label is the " +- "`<`" +- ", followed by an" +- " [email address]" +- ",\nfollowed by " +- "`>`. 
The link'" +- "s label is the " - "email address,\n" -- "and the URL is `" -- "mailto:` " -- "followed by the " -- "email address.\n\n" -- "An [email " -- "address](@),\n" +- "and the URL is " +- "`mailto:`" +- " followed by the" +- " email address.\n" +- "\nAn " +- "[email address](" +- "@),\n" - "for these " - "purposes, is " - "anything that " - "matches\nthe " -- "[non-normative " +- "[" +- "non-normative " - "regex from the " - "HTML5\nspec" - "](https://" @@ -14620,9 +14853,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "## Raw HTML\n\n" - "Text between `<`" -- " and `>` that " -- "looks like an " -- "HTML tag is " +- " and `>`" +- " that looks like" +- " an HTML tag is " - "parsed as a\n" - raw HTML tag and - " will be " @@ -14641,16 +14874,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Here is the " - grammar for tags - ":\n\n" -- "A [tag name](@) " -- "consists of an " +- "A [tag name](@)" +- " consists of an " - "ASCII letter\n" - followed by zero - " or more ASCII " - "letters, digits," - " or\nhyphens (`-`" - ").\n\n" -- "An [attribute](@" -- ") consists of " +- "An " +- "[attribute](@)" +- " consists of " - "spaces, tabs, " - "and up to one " - "line ending,\nan " @@ -14663,13 +14897,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[attribute name]" - "(@)\n" - "consists of an " -- "ASCII letter, `_" -- "`, or `:`, " -- followed by zero -- " or more ASCII\n" +- "ASCII letter, " +- "`_`, or `:`" +- ", followed by " +- "zero or more " +- "ASCII\n" - "letters, digits," -- " `_`, `.`, `:`, " -- "or `-`" +- " `_`, `.`, `:`" +- ", or `-`" - ". 
" - "(Note: This is " - "the XML\n" @@ -14693,18 +14928,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ",\nand an [" - "attribute value]" - ".\n\n" -- "An [attribute " -- "value](@)\n" +- "An " +- "[attribute value" +- "](@)\n" - "consists of an [" - "unquoted " - "attribute value]" - ",\na [" - "single-quoted " - "attribute value]" -- ", or a [double-" -- quoted attribute -- " value].\n\n" -- "An [unquoted " +- ", or a [" +- "double-quoted " +- "attribute value]" +- ".\n\n" +- "An " +- "[unquoted " - "attribute value]" - "(@)\n" - "is a nonempty " @@ -14712,60 +14950,66 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "characters not\n" - including spaces - ", tabs, line " -- "endings, `\"`, `'" -- "`, `=`, `<`, `>`" -- ", or `` ` ``.\n\n" -- "A [single-quoted" -- " attribute value" -- "](@)\n" -- "consists of `'`," -- " zero or more\n" -- "characters not " -- "including `'`, " -- "and a final `'`." +- "endings, `\"`, " +- "`'`, `=`, `<`, " +- "`>`, or `` ` ``." - "\n\nA " +- "[single-quoted " +- "attribute value]" +- "(@)\nconsists of " +- "`'`" +- ", zero or more\n" +- "characters not " +- "including `'`" +- ", and a final " +- "`'`.\n\n" +- "A " - "[double-quoted " - "attribute value]" - "(@)\nconsists of " -- "`\"`, zero or " -- "more\n" +- "`\"`" +- ", zero or more\n" - "characters not " -- "including `\"`, " -- "and a final `\"`." -- "\n\nAn " -- "[open tag](@) " -- "consists of a `<" -- "` character, a [" +- "including `\"`" +- ", and a final " +- "`\"`.\n\n" +- "An [open tag](@)" +- " consists of a " +- "`<`" +- " character, a [" - "tag name],\n" - "zero or more [" -- "attributes], " -- "optional spaces," -- " tabs, and up to" -- " one line ending" -- ",\nan optional " -- "`/` character, " -- "and a `>` " -- "character.\n\n" -- "A [closing tag](" -- "@) consists of " -- "the string `` character." 
+- "\n\nA " +- "[closing tag](@)" +- " consists of the" +- " string ``.\n\n" -- "An [HTML comment" -- "](@) consists of" -- " ``, ``, " +- "``, or " +- "``, " -- "and `-->` (see " -- "the\n" +- "string `-->`" +- ", and `-->`" +- " (see the\n" - "[HTML spec](" - "https://" - html.spec.whatwg @@ -14774,7 +15018,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - markup- - declaration-open - "-state)).\n\n" -- "A [processing " +- "A " +- "[processing " - "instruction](@)\n" - "consists of the " - "string ``" - ", and the string" - "\n`?>`.\n\n" -- "A [declaration](" -- "@) consists of " -- "the string ``, " -- "and the " +- "character `>`" +- ", and the " - "character `>`.\n\n" -- "A [CDATA section" -- "](@) consists of" -- "\nthe string " +- "A " +- "[CDATA section](" +- "@) consists of\n" +- "the string " - "`\n" - "````````````````" - "````````````````" -- "\n\n\n" -- "Closing tags:\n" -- "\n" +- "\n\n\nClosing tags:" +- "\n\n" - "````````````````" - "````````````````" - " example\n" @@ -15025,9 +15273,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">

              \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "Declarations:\n" -- "\n" +- "\n\n\nDeclarations:" +- "\n\n" - "````````````````" - "````````````````" - " example\n" @@ -15093,8 +15340,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "## Hard line " -- "breaks\n\n" +- "## " +- Hard line breaks +- "\n\n" - A line ending ( - "not in a code " - span or HTML tag @@ -15104,11 +15352,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "spaces and does " - not occur at the - " end of a block\n" -- "is parsed as a [" -- "hard line break]" -- "(@) (rendered\n" -- "in HTML as a `<" -- "br />` tag):\n\n" +- "is parsed as a " +- "[hard line break" +- "](@) (rendered\n" +- "in HTML as a " +- "`
              ` tag):\n\n" - "````````````````" - "````````````````" - " example\n" @@ -15123,10 +15371,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "alternative, a " - backslash before - " the\n[" -- "line ending] may" -- " be used instead" -- " of two or more " -- "spaces:\n\n" +- "line ending]" +- " may be used " +- "instead of two " +- "or more spaces:\n" +- "\n" - "````````````````" - "````````````````" - " example\n" @@ -15219,9 +15468,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "span

              \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "or HTML tags:\n" -- "\n" +- "\n\n\nor HTML tags:" +- "\n\n" - "````````````````" - "````````````````" - " example\n" @@ -15286,8 +15534,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "## Soft line " -- "breaks\n\n" +- "## " +- Soft line breaks +- "\n\n" - "A regular line " - ending (not in a - " code span or " @@ -15297,20 +15546,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "or more spaces " - "or a backslash " - "is parsed as a\n" -- "[softbreak](@)." -- " (A soft line " +- "[softbreak](@)" +- ". " +- "(A soft line " - "break may be " - rendered in HTML - " either as a\n[" -- "line ending] or " -- "as a space. " +- "line ending]" +- " or as a space. " - "The result will " - "be the same in\n" - "browsers. " - "In the examples " -- "here, a [line " -- "ending] will be " -- "used.)\n\n" +- "here, a [" +- "line ending]" +- " will be used.)" +- "\n\n" - "````````````````" - "````````````````" - " example\n" @@ -15347,8 +15598,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "breaks\n" - "as hard line " - "breaks.\n\n" -- "## Textual " -- "content\n\n" +- "## " +- Textual content +- "\n\n" - "Any characters " - "not given an " - "interpretation " @@ -15390,7 +15642,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "`." 
@@ -317,21 +317,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n\n*bar*\n\n.\n\n*bar*\n\n````````````````````````````````\n\n\nIn type 7 blocks, the [tag name] can be anything:" - "```````````````````````````````` example\n\n*bar*\n\n.\n\n*bar*\n\n````````````````````````````````" - "```````````````````````````````` example\n\n*bar*\n\n.\n\n*bar*\n\n````````````````````````````````\n\n\n```````````````````````````````` example\n\n*bar*\n.\n\n*bar*\n````````````````````````````````" -- "These rules are designed to allow us to work with tags that\ncan function as either block-level or inline-level tags.\nThe `` tag is a nice example. We can surround content with\n``" -- " tags in three different ways. In this case, we get a raw\nHTML block, because the `` tag is on a line by itself:\n\n```````````````````````````````` example\n\n*foo*\n\n.\n\n*foo*\n\n````````````````````````````````" +- "These rules are designed to allow us to work with tags that\ncan function as either block-level or inline-level tags.\nThe `` tag is a nice example. We can surround content with\n`` tags in three different ways. In this case, we get a raw" +- "HTML block, because the `` tag is on a line by itself:\n\n```````````````````````````````` example\n\n*foo*\n\n.\n\n*foo*\n\n````````````````````````````````" - "In this case, we get a raw HTML block that just includes\nthe `` tag (because it ends with the following blank\nline). So the contents get interpreted as CommonMark:" - "```````````````````````````````` example\n\n\n*foo*\n\n\n.\n\n

              foo

              \n
              \n````````````````````````````````" - "Finally, in this case, the `` tags are interpreted\nas [raw HTML] *inside* the CommonMark paragraph. (Because\nthe tag is not on a line by itself, we get inline HTML\nrather than an [HTML block].)" - "```````````````````````````````` example\n*foo*\n.\n

              foo

              \n````````````````````````````````" -- "HTML tags designed to contain literal content\n(`pre`, `script`, `style`, `textarea`" -- "), comments, processing instructions,\nand declarations are treated somewhat differently.\nInstead of ending at the first blank line, these blocks\nend at the first line containing a corresponding end tag.\nAs a result, these blocks can contain blank lines:" -- "A pre tag (type 1):" -- "```````````````````````````````` example\n
              \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
              \nokay\n.\n
              \nimport Text.HTML.TagSoup\n\nmain :: IO ()"
              -- "main = print $ parseTags tags\n
              \n

              okay

              \n````````````````````````````````\n\n\nA script tag (type 1):" -- "```````````````````````````````` example\n\nokay\n.\n\n

              okay

              \n````````````````````````````````\n\n\nA textarea tag (type 1):" -- "```````````````````````````````` example\n\n.\n\n````````````````````````````````\n\nA style tag (type 1):" -- "```````````````````````````````` example\n\nh1 {color:red;}\n\np {color:blue;}\n\nokay\n.\n\nh1 {color:red;}\n\np {color:blue;}\n\n

              okay

              \n````````````````````````````````" +- "HTML tags designed to contain literal content\n(`pre`, `script`, `style`, `textarea`), comments, processing instructions,\nand declarations are treated somewhat differently.\nInstead of ending at the first blank line, these blocks" +- "end at the first line containing a corresponding end tag.\nAs a result, these blocks can contain blank lines:\n\nA pre tag (type 1):" +- "```````````````````````````````` example" +- "
              \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
              \nokay\n.\n
              \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
              \n

              okay

              " +- "````````````````````````````````\n\n\nA script tag (type 1):" +- "```````````````````````````````` example" +- "\nokay\n.\n\n

              okay

              \n````````````````````````````````\n\n\nA textarea tag (type 1):\n\n```````````````````````````````` example\n\n.\n\n````````````````````````````````" +- "A style tag (type 1):\n\n```````````````````````````````` example\n\nh1 {color:red;}\n\np {color:blue;}\n\nokay\n.\n\nh1 {color:red;}\n\np {color:blue;}\n\n

              okay

              \n````````````````````````````````" - "If there is no matching end tag, the block will end at the\nend of the document (or the enclosing [block quote][block quotes]\nor [list item][list items]):" - "```````````````````````````````` example\n\n\nfoo\n.\n\n\nfoo\n````````````````````````````````" - "```````````````````````````````` example\n>
              \n> foo\n\nbar\n.\n
              \n
              \nfoo\n
              \n

              bar

              \n````````````````````````````````" @@ -342,8 +342,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n\nokay\n.\n\n

              okay

              \n````````````````````````````````\n\n\n\nA processing instruction (type 3):" - "```````````````````````````````` example\n';\n\n?>\nokay\n.\n';\n\n?>\n

              okay

              \n````````````````````````````````\n\n\nA declaration (type 4):" - "```````````````````````````````` example\n\n.\n\n````````````````````````````````\n\n\nCDATA (type 5):" -- "```````````````````````````````` example\n\nokay\n.\n\n

              okay

              \n````````````````````````````````\n\n\nThe opening tag can be preceded by up to three spaces of indentation, but not\nfour:" +- "```````````````````````````````` example" +- "\nokay\n.\n\n

              okay

              " +- "````````````````````````````````\n\n\nThe opening tag can be preceded by up to three spaces of indentation, but not\nfour:" - "```````````````````````````````` example\n \n\n \n.\n \n
              <!-- foo -->\n
              \n````````````````````````````````" - "```````````````````````````````` example\n
              \n\n
              \n.\n
              \n
              <div>\n
              \n````````````````````````````````\n\n\nAn HTML block of types 1--6 can interrupt a paragraph, and need not be\npreceded by a blank line." - "```````````````````````````````` example\nFoo\n
              \nbar\n
              \n.\n

              Foo

              \n
              \nbar\n
              \n````````````````````````````````" @@ -360,17 +361,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "and flexible way of including Markdown content inside HTML tags:\nsimply separate the Markdown from the HTML using blank lines:\n\nCompare:" - "```````````````````````````````` example\n
              \n\n*Emphasized* text.\n\n
              \n.\n
              \n

              Emphasized text.

              \n
              \n````````````````````````````````" - "```````````````````````````````` example\n
              \n*Emphasized* text.\n
              \n.\n
              \n*Emphasized* text.\n
              \n````````````````````````````````" -- "Some Markdown implementations have adopted a convention of\ninterpreting content inside tags as text if the open tag has\nthe attribute `markdown=1`" -- ". The rule given above seems a simpler and\nmore elegant way of achieving the same expressive power, which is also\nmuch simpler to parse." +- "Some Markdown implementations have adopted a convention of\ninterpreting content inside tags as text if the open tag has\nthe attribute `markdown=1`. The rule given above seems a simpler and" +- "more elegant way of achieving the same expressive power, which is also\nmuch simpler to parse." - "The main potential drawback is that one can no longer paste HTML\nblocks into Markdown documents with 100% reliability. However,\n*in most cases* this will work fine, because the blank lines in\nHTML are usually followed by HTML block tags. For example:" - "```````````````````````````````` example\n
    \n\n\n\n\n\n\n\n
    \nHi\n
    \n.\n\n\n\n\n
    \nHi\n
    \n````````````````````````````````" - "There are problems, however, if the inner tags are indented\n*and* separated by spaces, as then they will be interpreted as\nan indented code block:" - "```````````````````````````````` example\n\n\n \n\n \n\n \n\n
    \n Hi\n
    \n.\n\n \n
    <td>\n  Hi\n</td>\n
    \n \n
    \n````````````````````````````````" - "Fortunately, blank lines are usually not necessary and can be\ndeleted. The exception is inside `
    ` tags, but as described\n[above][HTML blocks], raw HTML blocks starting with `
    `\n*can* contain blank lines."
     - "## Link reference definitions"
    -- "A [link reference definition](@)\nconsists of a [link label], optionally preceded by up to three spaces of\nindentation, followed\nby a colon (`:`"
    -- "), optional spaces or tabs (including up to one\n[line ending]), a [link destination],\noptional spaces or tabs (including up to one\n[line ending]), and an optional [link\ntitle], which if it is present must be separated\nfrom the [link destination]"
    -- " by spaces or tabs.\nNo further character may occur."
    +- "A [link reference definition](@)\nconsists of a [link label], optionally preceded by up to three spaces of\nindentation, followed\nby a colon (`:`), optional spaces or tabs (including up to one\n[line ending]), a [link destination],"
    +- "optional spaces or tabs (including up to one\n[line ending]), and an optional [link\ntitle], which if it is present must be separated\nfrom the [link destination] by spaces or tabs.\nNo further character may occur."
     - "A [link reference definition]\ndoes not correspond to a structural element of a document.  Instead, it\ndefines a label which can be used in [reference links]\nand reference-style [images] elsewhere in the document.  [Link\nreference definitions]"
     - " can come either before or after the links that use\nthem.\n\n```````````````````````````````` example\n[foo]: /url \"title\"\n\n[foo]\n.\n

    foo

    \n````````````````````````````````" - "```````````````````````````````` example\n [foo]: \n /url \n 'the title' \n\n[foo]\n.\n

    foo

    \n````````````````````````````````" @@ -402,10 +402,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n\n\n[Link reference definitions] can occur\ninside block containers, like lists and block quotations. They\naffect the entire document, not just the container in which they\nare defined:" - "```````````````````````````````` example\n[foo]\n\n> [foo]: /url\n.\n

    foo

    \n
    \n
    \n````````````````````````````````" - "## Paragraphs" -- "A sequence of non-blank lines that cannot be interpreted as other\nkinds of blocks forms a [paragraph](@)" -- ".\nThe contents of the paragraph are the result of parsing the\nparagraph's raw content as inlines. The paragraph's raw content\nis formed by concatenating the lines and removing initial and final\nspaces or tabs.\n\nA simple example with two paragraphs:" -- "```````````````````````````````` example\naaa\n\nbbb\n.\n

    aaa

    \n

    bbb

    \n````````````````````````````````\n\n\nParagraphs can contain multiple lines, but no blank lines:" -- "```````````````````````````````` example\naaa\nbbb\n\nccc\nddd\n.\n

    aaa\nbbb

    \n

    ccc\nddd

    \n````````````````````````````````\n\n\nMultiple blank lines between paragraphs have no effect:" +- "A sequence of non-blank lines that cannot be interpreted as other\nkinds of blocks forms a [paragraph](@).\nThe contents of the paragraph are the result of parsing the\nparagraph's raw content as inlines. The paragraph's raw content" +- "is formed by concatenating the lines and removing initial and final\nspaces or tabs.\n\nA simple example with two paragraphs:\n\n```````````````````````````````` example\naaa\n\nbbb\n.\n

    aaa

    \n

    bbb

    \n````````````````````````````````" +- "Paragraphs can contain multiple lines, but no blank lines:\n\n```````````````````````````````` example\naaa\nbbb\n\nccc\nddd\n.\n

    aaa\nbbb

    \n

    ccc\nddd

    \n````````````````````````````````\n\n\nMultiple blank lines between paragraphs have no effect:" - "```````````````````````````````` example\naaa\n\n\nbbb\n.\n

    aaa

    \n

    bbb

    \n````````````````````````````````\n\n\nLeading spaces or tabs are skipped:\n\n```````````````````````````````` example\n aaa\n bbb\n.\n

    aaa\nbbb

    \n````````````````````````````````" - "Lines after the first may be indented any amount, since indented\ncode blocks cannot interrupt paragraphs." - "```````````````````````````````` example\naaa\n bbb\n ccc\n.\n

    aaa\nbbb\nccc

    \n````````````````````````````````" @@ -423,10 +422,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The following rules define [block quotes]:" - "1. **Basic case.** If a string of lines *Ls* constitute a sequence\n of blocks *Bs*, then the result of prepending a [block quote\n marker] to the beginning of each line in *Ls*\n is a [block quote](#block-quotes) containing *Bs*." - "2." -- "**Laziness.** If a string of lines *Ls* constitute a [block\n quote](#block-quotes) with contents *Bs*" -- ", then the result of deleting\n the initial [block quote marker] from one or\n more lines in which the next character other than a space or tab after the\n [block quote marker] is [paragraph continuation\n text] is a block quote with *Bs*" -- " as its content.\n [Paragraph continuation text](@) is text\n that will be parsed as part of the content of a paragraph, but does\n not occur at the beginning of the paragraph." -- "3. **Consecutiveness.** A document cannot contain two [block\n quotes] in a row unless there is a [blank line] between them." +- "**Laziness.** If a string of lines *Ls* constitute a [block\n quote](#block-quotes) with contents *Bs*, then the result of deleting\n the initial [block quote marker] from one or" +- "more lines in which the next character other than a space or tab after the\n [block quote marker] is [paragraph continuation\n text] is a block quote with *Bs* as its content.\n [Paragraph continuation text](@) is text" +- "that will be parsed as part of the content of a paragraph, but does\n not occur at the beginning of the paragraph.\n\n3. **Consecutiveness.** A document cannot contain two [block\n quotes] in a row unless there is a [blank line] between them." - "Nothing else counts as a [block quote](#block-quotes).\n\nHere is a simple example:\n\n```````````````````````````````` example\n> # Foo\n> bar\n> baz\n.\n
    \n

    Foo

    \n

    bar\nbaz

    \n
    \n````````````````````````````````" - "The space or tab after the `>` characters can be omitted:\n\n```````````````````````````````` example\n># Foo\n>bar\n> baz\n.\n
    \n

    Foo

    \n

    bar\nbaz

    \n
    \n````````````````````````````````" - "The `>` characters can be preceded by up to three spaces of indentation:\n\n```````````````````````````````` example\n > # Foo\n > bar\n > baz\n.\n
    \n

    Foo

    \n

    bar\nbaz

    \n
    \n````````````````````````````````" @@ -461,16 +459,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "An [ordered list marker](@)\nis a sequence of 1--9 arabic digits (`0-9`), followed by either a\n`.` character or a `)` character. (The reason for the length\nlimit is that with 10 digits we start seeing integer overflows\nin some browsers.)" - "The following rules define [list items]:" - "1." -- "**Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N*" -- " ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs*" -- " as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:" -- "1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls*" -- " must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item." +- "**Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation," +- "then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. 
The type of the list item" +- "(bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:" +- "1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if" +- "the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item." - "For example, let *Ls* be the lines" - "```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    " - "````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:" -- "```````````````````````````````` example\n1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      " -- "
    2. \n
    \n````````````````````````````````" +- "```````````````````````````````` example" +- "1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    " +- "````````````````````````````````" - "The most important thing to notice is that the position of\nthe text after the list marker determines how much indentation\nis needed in subsequent blocks in the list item. If the list" - "marker takes up two spaces of indentation, and there are three spaces between\nthe list marker and the next character other than a space or tab, then blocks\nmust be indented five spaces in order to fall under the list\nitem." - "Here are some examples showing how far content must be indented to be\nput under the list item:\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • one
    • \n
    \n

    two

    \n````````````````````````````````" @@ -493,9 +492,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A start number may begin with 0s:\n\n```````````````````````````````` example\n0. ok\n.\n
      \n
    1. ok
    2. \n
    \n````````````````````````````````" - "```````````````````````````````` example\n003. ok\n.\n
      \n
    1. ok
    2. \n
    \n````````````````````````````````\n\n\nA start number may not be negative:" - "```````````````````````````````` example\n-1. not ok\n.\n

    -1. not ok

    \n````````````````````````````````" -- "2. **Item starting with indented code.** If a sequence of lines *Ls*\n constitute a sequence of blocks *Bs* starting with an indented code\n block, and *M* is a list marker of width *W*" -- " followed by\n one space of indentation, then the result of prepending *M* and the\n following space to the first line of *Ls*, and indenting subsequent lines\n of *Ls* by *W + 1* spaces, is a list item with *Bs*" -- " as its contents.\n If a line is empty, then it need not be indented. The type of the\n list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a" +- "2. **Item starting with indented code.** If a sequence of lines *Ls*\n constitute a sequence of blocks *Bs* starting with an indented code\n block, and *M* is a list marker of width *W* followed by" +- "one space of indentation, then the result of prepending *M* and the\n following space to the first line of *Ls*, and indenting subsequent lines\n of *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents." +- "If a line is empty, then it need not be indented. The type of the\n list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a" - "start number, based on the ordered list marker.\n\nAn indented code block will have to be preceded by four spaces of indentation\nbeyond the edge of the region where text will be included in the list item.\nIn the following case that is 6 spaces:" - "```````````````````````````````` example\n- foo\n\n bar\n.\n
      \n
    • \n

      foo

      \n
      bar\n
      \n
    • \n
    \n````````````````````````````````\n\n\nAnd in this case it is 11 spaces:" - "```````````````````````````````` example\n 10. foo\n\n bar\n.\n
      \n
    1. \n

      foo

      \n
      bar\n
      \n
    2. \n
    \n````````````````````````````````" @@ -510,9 +509,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not a significant restriction, because when a block is preceded by up to\nthree spaces of indentation, the indentation can always be removed without\na change in interpretation, allowing rule #1 to be applied. So, in\nthe above case:" - "```````````````````````````````` example\n- foo\n\n bar\n.\n
      \n
    • \n

      foo

      \n

      bar

      \n
    • \n
    \n````````````````````````````````" - "3. **Item starting with a blank line.** If a sequence of lines *Ls*\n starting with a single [blank line] constitute a (possibly empty)\n sequence of blocks *Bs*, and *M* is a list marker of width *W*,\n then the result of prepending *M*" -- " to the first line of *Ls*, and\n preceding subsequent lines of *Ls* by *W + 1* spaces of indentation, is a\n list item with *Bs*" -- " as its contents.\n If a line is empty, then it need not be indented. The type of the\n list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a" -- "start number, based on the ordered list marker.\n\nHere are some list items that start with a blank line but are not empty:" +- " to the first line of *Ls*, and\n preceding subsequent lines of *Ls* by *W + 1* spaces of indentation, is a\n list item with *Bs* as its contents.\n If a line is empty, then it need not be indented. The type of the" +- "list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a\n start number, based on the ordered list marker." +- "Here are some list items that start with a blank line but are not empty:" - "```````````````````````````````` example\n-\n foo\n-\n ```\n bar\n ```\n-\n baz\n.\n
      \n
    • foo
    • \n
    • \n
      bar\n
      \n
    • \n
    • \n
      baz\n
      \n
    • \n
    \n````````````````````````````````" - "When the list item starts with a blank line, the number of spaces\nfollowing the list marker doesn't change the required indentation:\n\n```````````````````````````````` example\n- \n foo\n.\n
      \n
    • foo
    • \n
    \n````````````````````````````````" - "A list item can begin with at most one blank line.\nIn the following example, `foo` is not part of the list\nitem:\n\n```````````````````````````````` example\n-\n\n foo\n.\n
      \n
    • \n
    \n

    foo

    \n````````````````````````````````" @@ -521,22 +520,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n1. foo\n2.\n3. bar\n.\n
      \n
    1. foo
    2. \n
    3. \n
    4. bar
    5. \n
    \n````````````````````````````````\n\n\nA list may start or end with an empty list item:" - "```````````````````````````````` example\n*\n.\n
      \n
    • \n
    \n````````````````````````````````\n\nHowever, an empty list item cannot interrupt a paragraph:" - "```````````````````````````````` example\nfoo\n*\n\nfoo\n1.\n.\n

    foo\n*

    \n

    foo\n1.

    \n````````````````````````````````" -- "4. **Indentation.** If a sequence of lines *Ls* constitutes a list item\n according to rule #1, #2, or #3, then the result of preceding each line\n of *Ls*" -- " by up to three spaces of indentation (the same for each line) also\n constitutes a list item with the same contents and attributes. If a line is\n empty, then it need not be indented.\n\nIndented one space:" -- "```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      " -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nIndented two spaces:" -- "```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      " -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nIndented three spaces:" -- "```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      " -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nFour spaces indent gives a code block:" +- "4. **Indentation.** If a sequence of lines *Ls* constitutes a list item\n according to rule #1, #2, or #3, then the result of preceding each line\n of *Ls* by up to three spaces of indentation (the same for each line) also" +- "constitutes a list item with the same contents and attributes. If a line is\n empty, then it need not be indented.\n\nIndented one space:" +- "```````````````````````````````` example" +- " 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    " +- "````````````````````````````````\n\n\nIndented two spaces:" +- "```````````````````````````````` example" +- " 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    " +- "````````````````````````````````\n\n\nIndented three spaces:" +- "```````````````````````````````` example" +- " 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    " +- "````````````````````````````````\n\n\nFour spaces indent gives a code block:" - "```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
    1.  A paragraph\n    with two lines.\n\n        indented code\n\n    > A block quote.\n
    " - "````````````````````````````````" -- "5. **Laziness.** If a string of lines *Ls* constitute a [list\n item](#list-items) with contents *Bs*" -- ", then the result of deleting\n some or all of the indentation from one or more lines in which the\n next character other than a space or tab after the indentation is\n [paragraph continuation text] is a" -- "list item with the same contents and attributes. The unindented\n lines are called\n [lazy continuation line](@)s.\n\nHere is an example with [lazy continuation lines]:" -- "```````````````````````````````` example\n 1. A paragraph\nwith two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      " -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nIndentation can be partially deleted:" -- "```````````````````````````````` example\n 1. A paragraph\n with two lines.\n.\n
      \n
    1. A paragraph\nwith two lines.
    2. \n
    \n````````````````````````````````\n\n\nThese examples show how laziness can work in nested structures:" +- "5. **Laziness.** If a string of lines *Ls* constitute a [list\n item](#list-items) with contents *Bs*, then the result of deleting\n some or all of the indentation from one or more lines in which the" +- "next character other than a space or tab after the indentation is\n [paragraph continuation text] is a\n list item with the same contents and attributes. The unindented\n lines are called\n [lazy continuation line](@)s." +- "Here is an example with [lazy continuation lines]:" +- "```````````````````````````````` example" +- " 1. A paragraph\nwith two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    " +- "````````````````````````````````\n\n\nIndentation can be partially deleted:\n\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n.\n
      \n
    1. A paragraph\nwith two lines.
    2. \n
    \n````````````````````````````````" +- "These examples show how laziness can work in nested structures:" - "```````````````````````````````` example\n> 1. > Blockquote\ncontinued here.\n.\n
    \n
      \n
    1. \n
      \n

      Blockquote\ncontinued here.

      \n
      \n
    2. \n
    \n
    \n````````````````````````````````" - "```````````````````````````````` example\n> 1. > Blockquote\n> continued here.\n.\n
    \n
      \n
    1. \n
      \n

      Blockquote\ncontinued here.

      \n
      \n
    2. \n
    \n
    \n````````````````````````````````" - "6. **That's all.** Nothing that is not counted as a list item by rules\n #1--5 counts as a [list item](#list-items)." @@ -557,8 +560,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "These rules specify that a paragraph under a list item must be indented\nfour spaces (presumably, from the left margin, rather than the start of\nthe list marker, but this is not said), and that code under a list item" - "must be indented eight spaces instead of the usual four. They also say\nthat a block quote must be indented, but not by how much; however, the\nexample given has four spaces indentation. Although nothing is said" - "about other kinds of block-level content, it is certainly reasonable to\ninfer that *all* block elements under a list item, including other\nlists, must be indented four spaces. This principle has been called the\n*four-space rule*." -- "The four-space rule is clear and principled, and if the reference\nimplementation `Markdown.pl` had followed it, it probably would have\nbecome the standard. However, `Markdown.pl`" -- " allowed paragraphs and\nsublists to start with only two spaces indentation, at least on the\nouter level. Worse, its behavior was inconsistent: a sublist of an\nouter-level list needed two spaces indentation, but a sublist of this" +- "The four-space rule is clear and principled, and if the reference\nimplementation `Markdown.pl` had followed it, it probably would have\nbecome the standard. However, `Markdown.pl` allowed paragraphs and" +- "sublists to start with only two spaces indentation, at least on the\nouter level. Worse, its behavior was inconsistent: a sublist of an\nouter-level list needed two spaces indentation, but a sublist of this" - "sublist needed three spaces. It is not surprising, then, that different\nimplementations of Markdown have developed very different rules for\ndetermining what comes under a list item. 
(Pandoc and python-Markdown,\nfor example, stuck with Gruber'" - "s syntax description and the four-space\nrule, while discount, redcarpet, marked, PHP Markdown, and others\nfollowed `Markdown.pl`'s behavior more closely.)" - "Unfortunately, given the divergences between implementations, there\nis no way to give a spec for list items that will be guaranteed not\nto break any existing documents. However, the spec given here should" @@ -568,24 +571,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This rule is superior, we claim, to any rule requiring a fixed level of\nindentation from the margin. The four-space rule is clear but\nunnatural. It is quite unintuitive that\n\n``` markdown\n- foo\n\n bar\n\n - baz\n```" - "should be parsed as two lists with an intervening paragraph,\n\n``` html\n
      \n
    • foo
    • \n
    \n

    bar

    \n
      \n
    • baz
    • \n
    \n```\n\nas the four-space rule demands, rather than a single list," - "``` html\n
      \n
    • \n

      foo

      \n

      bar

      \n
        \n
      • baz
      • \n
      \n
    • \n
    \n```\n\nThe choice of four spaces is arbitrary. It can be learned, but it is\nnot likely to be guessed, and it trips up beginners regularly." -- "Would it help to adopt a two-space rule? The problem is that such\na rule, together with the rule allowing up to three spaces of indentation for\nthe initial list marker, allows text that is indented *less than*" -- " the\noriginal list marker to be included in the list item. For example,\n`Markdown.pl` parses\n\n``` markdown\n - one\n\n two\n```\n\nas a single list item, with `two` a continuation paragraph:\n\n``` html\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n```" -- "and similarly\n\n``` markdown\n> - one\n>\n> two\n```\n\nas\n\n``` html\n
    \n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n
    \n```\n\nThis is extremely unintuitive." +- "Would it help to adopt a two-space rule? The problem is that such\na rule, together with the rule allowing up to three spaces of indentation for\nthe initial list marker, allows text that is indented *less than* the" +- "original list marker to be included in the list item. For example,\n`Markdown.pl` parses\n\n``` markdown\n - one\n\n two\n```\n\nas a single list item, with `two` a continuation paragraph:\n\n``` html\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n```\n\nand similarly" +- "``` markdown\n> - one\n>\n> two\n```\n\nas\n\n``` html\n
    \n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n
    \n```\n\nThis is extremely unintuitive." - "Rather than requiring a fixed indent from the margin, we could require\na fixed indent (say, two spaces, or even one space) from the list marker (which\nmay itself be indented). This proposal would remove the last anomaly" - "discussed. Unlike the spec presented above, it would count the following\nas a list item with a subparagraph, even though the paragraph `bar`\nis not indented as far as the first paragraph `foo`:\n\n``` markdown\n 10. foo\n\n bar \n```" -- "Arguably this text does read like a list item with `bar`" -- " as a subparagraph,\nwhich may count in favor of the proposal. However, on this proposal indented\ncode would have to be indented six spaces after the list marker. And this\nwould break a lot of existing Markdown, which has the pattern:" -- "``` markdown\n1. foo\n\n indented code\n```\n\nwhere the code is indented eight spaces. The spec above, by contrast, will\nparse this text as expected, since the code block's indentation is measured\nfrom the beginning of `foo`." -- The one case that needs special treatment is a list item that *starts* -- "with indented code. How much indentation is required in that case, since\nwe don't have a \"first paragraph\" to measure from? Rule #2 simply stipulates\nthat in such cases, we require one space indentation from the list marker" -- "(and then the normal four spaces for the indented code). This will match the\nfour-space rule in cases where the list marker plus its initial indentation\ntakes four spaces (a common case), but diverge in other cases." +- "Arguably this text does read like a list item with `bar` as a subparagraph,\nwhich may count in favor of the proposal. However, on this proposal indented\ncode would have to be indented six spaces after the list marker. And this" +- "would break a lot of existing Markdown, which has the pattern:\n\n``` markdown\n1. foo\n\n indented code\n```" +- "where the code is indented eight spaces. 
The spec above, by contrast, will\nparse this text as expected, since the code block's indentation is measured\nfrom the beginning of `foo`." +- "The one case that needs special treatment is a list item that *starts*\nwith indented code. How much indentation is required in that case, since\nwe don't have a \"first paragraph\" to measure from? Rule #2 simply stipulates" +- "that in such cases, we require one space indentation from the list marker\n(and then the normal four spaces for the indented code). This will match the\nfour-space rule in cases where the list marker plus its initial indentation" +- "takes four spaces (a common case), but diverge in other cases." - "## Lists\n\nA [list](@) is a sequence of one or more\nlist items [of the same type]. The list items\nmay be separated by any number of blank lines." - "Two list items are [of the same type](@)\nif they begin with a [list marker] of the same type.\nTwo list markers are of the\nsame type if (a) they are bullet list markers using the same character\n(`-`, `+`, or `*`" - ") or (b) they are ordered list numbers with the same\ndelimiter (either `.` or `)`)." - "A list is an [ordered list](@)\nif its constituent list items begin with\n[ordered list markers], and a\n[bullet list](@) if its constituent list\nitems begin with [bullet list markers]." - "The [start number](@)\nof an [ordered list] is determined by the list number of\nits initial list item. The numbers of subsequent list items are\ndisregarded." -- "A list is [loose](@) if any of its constituent\nlist items are separated by blank lines, or if any of its constituent\nlist items directly contain two block-level elements with a blank line\nbetween them. Otherwise a list is [tight](@)" -- ".\n(The difference in HTML output is that paragraphs in a loose list are\nwrapped in `

    ` tags, while paragraphs in a tight list are not.)\n\nChanging the bullet or ordered list delimiter starts a new list:" +- "A list is [loose](@) if any of its constituent\nlist items are separated by blank lines, or if any of its constituent\nlist items directly contain two block-level elements with a blank line\nbetween them. Otherwise a list is [tight](@)." +- "(The difference in HTML output is that paragraphs in a loose list are\nwrapped in `

    ` tags, while paragraphs in a tight list are not.)\n\nChanging the bullet or ordered list delimiter starts a new list:" - "```````````````````````````````` example\n- foo\n- bar\n+ baz\n.\n

      \n
    • foo
    • \n
    • bar
    • \n
    \n
      \n
    • baz
    • \n
    \n````````````````````````````````" - "```````````````````````````````` example\n1. foo\n2. bar\n3) baz\n.\n
      \n
    1. foo
    2. \n
    3. bar
    4. \n
    \n
      \n
    1. baz
    2. \n
    \n````````````````````````````````" - "In CommonMark, a list can interrupt a paragraph. That is,\nno blank line is needed to separate a paragraph from a following\nlist:" @@ -635,8 +638,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Code spans\n\nA [backtick string](@)\nis a string of one or more backtick characters (`` ` ``) that is neither\npreceded nor followed by a backtick." - "A [code span](@) begins with a backtick string and ends with\na backtick string of equal length. The contents of the code span are\nthe characters between these two backtick strings, normalized in the\nfollowing ways:" - "- First, [line endings] are converted to [spaces]." -- "- If the resulting string both begins *and*" -- " ends with a [space]\n character, but does not consist entirely of [space]\n characters, a single [space] character is removed from the\n front and back. This allows you to include code that begins" +- "- If the resulting string both begins *and* ends with a [space]\n character, but does not consist entirely of [space]\n characters, a single [space] character is removed from the\n front and back. This allows you to include code that begins" - "or ends with backtick characters, which must be separated by\n whitespace from the opening or closing backtick strings.\n\nThis is a simple code span:\n\n```````````````````````````````` example\n`foo`\n.\n

    foo

    \n````````````````````````````````" - "Here two backticks are used, because the code contains a backtick.\nThis example also illustrates stripping of a single leading and\ntrailing space:" - "```````````````````````````````` example\n`` foo ` bar ``\n.\n

    foo ` bar

    \n````````````````````````````````\n\n\nThis example shows the motivation for stripping leading and trailing\nspaces:" @@ -663,49 +665,43 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n`foo``bar``\n.\n

    `foobar

    \n````````````````````````````````" - "## Emphasis and strong emphasis\n\nJohn Gruber's original [Markdown syntax\ndescription](https://daringfireball.net/projects/markdown/syntax#em) says:" - "> Markdown treats asterisks (`*`) and underscores (`_`) as indicators of\n> emphasis. Text wrapped with one `*` or `_` will be wrapped with an HTML\n> `` tag; double `*`'s or `_`'s will be wrapped with an HTML ``\n> tag." -- "This is enough for most users, but these rules leave much undecided,\nespecially when it comes to nested emphasis. The original\n`Markdown.pl` test suite makes it clear that triple `***` and\n`___`" -- " delimiters can be used for strong emphasis, and most\nimplementations have also allowed the following patterns:\n\n``` markdown\n***strong emph***\n***strong** in emph*\n***emph* in strong**\n**in strong *emph***\n*in emph **strong***\n```" +- "This is enough for most users, but these rules leave much undecided,\nespecially when it comes to nested emphasis. The original\n`Markdown.pl` test suite makes it clear that triple `***` and\n`___` delimiters can be used for strong emphasis, and most" +- "implementations have also allowed the following patterns:\n\n``` markdown\n***strong emph***\n***strong** in emph*\n***emph* in strong**\n**in strong *emph***\n*in emph **strong***\n```" - "The following patterns are less widely supported, but the intent\nis clear and they are useful (especially in contexts like bibliography\nentries):\n\n``` markdown\n*emph *with emph* in it*\n**strong **with strong** in it**\n```" - "Many implementations have also restricted intraword emphasis to\nthe `*` forms, to avoid unwanted emphasis in words containing\ninternal underscores. (It is best practice to put these in code\nspans, but users often do not.)" - "``` markdown\ninternal emphasis: foo*bar*baz\nno emphasis: foo_bar_baz\n```\n\nThe rules given below capture all of these patterns, while allowing\nfor efficient parsing strategies that do not backtrack." 
-- "First, some definitions. A [delimiter run](@) is either\na sequence of one or more `*` characters that is not preceded or\nfollowed by a non-backslash-escaped `*` character, or a sequence\nof one or more `_`" -- " characters that is not preceded or followed by\na non-backslash-escaped `_` character." -- "A [left-flanking delimiter run](@)" -- " is\na [delimiter run] that is (1) not followed by [Unicode whitespace],\nand either (2a) not followed by a [Unicode punctuation character], or\n(2b) followed by a [Unicode punctuation character] and\npreceded by [Unicode whitespace] or a [" -- "Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace." -- "A [right-flanking delimiter run](@)" -- " is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [" -- "Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs." -- " - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n - Both left and right-flanking:\n\n ```" -- " abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```" +- "First, some definitions. A [delimiter run](@) is either\na sequence of one or more `*` characters that is not preceded or\nfollowed by a non-backslash-escaped `*` character, or a sequence\nof one or more `_` characters that is not preceded or followed by" +- "a non-backslash-escaped `_` character." 
+- "A [left-flanking delimiter run](@) is\na [delimiter run] that is (1) not followed by [Unicode whitespace],\nand either (2a) not followed by a [Unicode punctuation character], or\n(2b) followed by a [Unicode punctuation character] and\npreceded by [" +- "Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace." +- "A [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [" +- "Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs." +- " - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```" +- " - Both left and right-flanking:\n\n ```\n abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```" - "(The idea of distinguishing left-flanking and right-flanking\ndelimiter runs based on the character before and the character\nafter comes from Roopesh Chander's" -- "[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags)" -- ".\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\" and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:" +- "[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).\nvfmd uses the terminology \"emphasis 
indicator string\" instead of \"delimiter\nrun,\"" +- " and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:" - "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run]." - "2." -- "A single `_`" -- " character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character]." -- "3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run]." +- "A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [" +- "Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run]." - "4." -- "A single `_`" -- " character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." -- "5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run]." +- "A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [" +- "Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run]." - "6." 
-- "A double `__`" -- " [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character]." -- "7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run]." +- "A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [" +- "Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run]." - "8." -- "A double `__`" -- " [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." +- "A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [" +- "Unicode punctuation character]." - "9." -- "Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`" -- ") as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the" -- "delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are\n multiples of 3." +- "Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. 
The\n opening and closing delimiters must belong to separate" +- "[delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are" +- multiples of 3. - "10." -- "Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`" -- ") as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of" -- "the delimiter runs containing the opening and closing\n delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3." -- "11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped." +- "Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The" +- "opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing" +- "delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3.\n\n11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped." - "12. A literal `_` character cannot occur at the beginning or end of\n `_`-delimited emphasis or `__`-delimited strong emphasis, unless it\n is backslash-escaped." 
- "Where rules 1--12 above are compatible with multiple parsings,\nthe following principles resolve ambiguity:" - "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`." @@ -776,9 +772,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n*foo *bar**\n.\n

    foo bar

    \n````````````````````````````````" - "```````````````````````````````` example\n*foo **bar** baz*\n.\n

    foo bar baz

    \n````````````````````````````````" - "```````````````````````````````` example\n*foo**bar**baz*\n.\n

    foobarbaz

    \n````````````````````````````````\n\nNote that in the preceding case, the interpretation\n\n``` markdown\n

    foobarbaz

    \n```" -- "is precluded by the condition that a delimiter that\ncan both open and close (like the `*` after `foo`" -- ")\ncannot form emphasis if the sum of the lengths of\nthe delimiter runs containing the opening and\nclosing delimiters is a multiple of 3 unless\nboth lengths are multiples of 3." -- "For the same reason, we don't get two consecutive\nemphasis sections in this example:\n\n```````````````````````````````` example\n*foo**bar*\n.\n

    foo**bar

    \n````````````````````````````````" +- "is precluded by the condition that a delimiter that\ncan both open and close (like the `*` after `foo`)\ncannot form emphasis if the sum of the lengths of\nthe delimiter runs containing the opening and\nclosing delimiters is a multiple of 3 unless" +- "both lengths are multiples of 3.\n\n\nFor the same reason, we don't get two consecutive\nemphasis sections in this example:\n\n```````````````````````````````` example\n*foo**bar*\n.\n

    foo**bar

    \n````````````````````````````````" - "The same condition ensures that the following\ncases are all strong emphasis nested inside\nemphasis, even when the interior whitespace is\nomitted:" - "```````````````````````````````` example\n***foo** bar*\n.\n

    foo bar

    \n````````````````````````````````" - "```````````````````````````````` example\n*foo **bar***\n.\n

    foo bar

    \n````````````````````````````````" @@ -846,17 +841,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- The brackets in link text bind more tightly than markers for\n [emphasis and strong emphasis]. Thus, for example, `*[foo*](url)` is a link.\n\nA [link destination](@) consists of either" - "- a sequence of zero or more characters between an opening `<` and a\n closing `>` that contains no line endings or unescaped\n `<` or `>` characters, or" - "-" -- "a nonempty sequence of characters that does not start with `<`" -- ",\n does not include [ASCII control characters][ASCII control character]\n or [space] character, and includes parentheses only if (a) they are\n backslash-escaped or (b) they are part of a balanced pair of\n unescaped parentheses." -- "(Implementations may impose limits on parentheses nesting to\n avoid performance issues, but at least three levels of nesting\n should be supported.)\n\nA [link title](@) consists of either" +- "a nonempty sequence of characters that does not start with `<`,\n does not include [ASCII control characters][ASCII control character]\n or [space] character, and includes parentheses only if (a) they are" +- "backslash-escaped or (b) they are part of a balanced pair of\n unescaped parentheses.\n (Implementations may impose limits on parentheses nesting to\n avoid performance issues, but at least three levels of nesting\n should be supported.)" +- "A [link title](@) consists of either" - "- a sequence of zero or more characters between straight double-quote\n characters (`\"`), including a `\"` character only if it is\n backslash-escaped, or" - "- a sequence of zero or more characters between straight single-quote\n characters (`'`), including a `'` character only if it is\n backslash-escaped, or" - "- a sequence of zero or more characters between matching parentheses\n (`(...)`), including a `(` or `)` character only if it is\n backslash-escaped." 
- "Although [link titles] may span multiple lines, they may not contain\na [blank line]." -- "An [inline link](@) consists of a [link text] followed immediately\nby a left parenthesis `(`, an optional [link destination], an optional\n[link title], and a right parenthesis `)`" -- ".\nThese four components may be separated by spaces, tabs, and up to one line\nending.\nIf both [link destination] and [link title] are present, they *must* be\nseparated by spaces, tabs, and up to one line ending." -- "The link's text consists of the inlines contained\nin the [link text] (excluding the enclosing square brackets).\nThe link's URI consists of the link destination, excluding enclosing\n`<...>`" -- " if present, with backslash-escapes in effect as described\nabove. The link's title consists of the link title, excluding its\nenclosing delimiters, with backslash-escapes in effect as described\nabove.\n\nHere is a simple inline link:" +- "An [inline link](@) consists of a [link text] followed immediately\nby a left parenthesis `(`, an optional [link destination], an optional\n[link title], and a right parenthesis `)`.\nThese four components may be separated by spaces, tabs, and up to one line" +- "ending.\nIf both [link destination] and [link title] are present, they *must* be\nseparated by spaces, tabs, and up to one line ending." +- "The link's text consists of the inlines contained\nin the [link text] (excluding the enclosing square brackets).\nThe link's URI consists of the link destination, excluding enclosing\n`<...>` if present, with backslash-escapes in effect as described" +- "above. The link's title consists of the link title, excluding its\nenclosing delimiters, with backslash-escapes in effect as described\nabove.\n\nHere is a simple inline link:" - "```````````````````````````````` example\n[link](/uri \"title\")\n.\n

    link

    \n````````````````````````````````\n\n\nThe title, the link text and even \nthe destination may be omitted:" - "```````````````````````````````` example\n[link](/uri)\n.\n

    link

    \n````````````````````````````````\n\n```````````````````````````````` example\n[](./target.md)\n.\n

    \n````````````````````````````````" - "```````````````````````````````` example\n[link]()\n.\n

    link

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n[link](<>)\n.\n

    link

    \n````````````````````````````````" @@ -873,9 +868,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n[link](foo\\(and\\(bar\\))\n.\n

    link

    \n````````````````````````````````" - "```````````````````````````````` example\n[link]()\n.\n

    link

    \n````````````````````````````````\n\n\nParentheses and other symbols can also be escaped, as usual\nin Markdown:" - "```````````````````````````````` example\n[link](foo\\)\\:)\n.\n

    link

    \n````````````````````````````````\n\n\nA link can contain fragment identifiers and queries:" -- "```````````````````````````````` example\n[link](#fragment)\n\n[link](https://example.com#fragment)\n\n[link](https://example.com?foo=3#frag)\n.\n

    link

    \n

    link

    " -- "

    link

    \n````````````````````````````````\n\n\nNote that a backslash before a non-escapable character is\njust a backslash:" -- "```````````````````````````````` example\n[link](foo\\bar)\n.\n

    link

    \n````````````````````````````````" +- "```````````````````````````````` example" +- "[link](#fragment)\n\n[link](https://example.com#fragment)\n\n[link](https://example.com?foo=3#frag)\n.\n

    link

    \n

    link

    \n

    link

    " +- "````````````````````````````````\n\n\nNote that a backslash before a non-escapable character is\njust a backslash:\n\n```````````````````````````````` example\n[link](foo\\bar)\n.\n

    link

    \n````````````````````````````````" - "URL-escaping should be left alone inside the destination, as all\nURL-escaped characters are also valid URL characters. Entity and\nnumerical character references in the destination will be parsed" - "into the corresponding Unicode code points, as usual. These may\nbe optionally URL-escaped when written as HTML, but this spec\ndoes not enforce any particular policy for rendering URLs in\nHTML or other formats. Renderers may make different decisions" - "about how to escape or normalize URLs in the output.\n\n```````````````````````````````` example\n[link](foo%20bä)\n.\n

    link

    \n````````````````````````````````" @@ -888,11 +883,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n[link](/url \"title\")\n.\n

    link

    \n````````````````````````````````\n\n\nNested balanced quotes are not allowed without escaping:" - "```````````````````````````````` example\n[link](/url \"title \"and\" title\")\n.\n

    [link](/url "title "and" title")

    \n````````````````````````````````\n\n\nBut it is easy to work around this by using a different quote type:" - "```````````````````````````````` example\n[link](/url 'title \"and\" title')\n.\n

    link

    \n````````````````````````````````" -- "(Note: `Markdown.pl`" -- " did allow double quotes inside a double-quoted\ntitle, and its test suite included a test demonstrating this.\nBut it is hard to see a good rationale for the extra complexity this\nbrings, since there are already many ways---backslash escaping," -- "entity and numeric character references, or using a different\nquote type for the enclosing title---to write titles containing\ndouble quotes. `Markdown.pl`" -- "'s handling of titles has a number\nof other strange features. For example, it allows single-quoted\ntitles in inline links, but not reference links. And, in\nreference links but not inline links, it allows a title to begin\nwith `\"` and end with `)`." -- "`Markdown.pl` 1.0.1 even allows\ntitles with no closing quotation mark, though 1.0.2b8 does not.\nIt seems preferable to adopt a simple, rational rule that works\nthe same way in inline links and link reference definitions.)" +- "(Note: `Markdown.pl` did allow double quotes inside a double-quoted\ntitle, and its test suite included a test demonstrating this.\nBut it is hard to see a good rationale for the extra complexity this\nbrings, since there are already many ways---" +- "backslash escaping,\nentity and numeric character references, or using a different\nquote type for the enclosing title---to write titles containing\ndouble quotes. `Markdown.pl`'s handling of titles has a number" +- "of other strange features. For example, it allows single-quoted\ntitles in inline links, but not reference links. And, in\nreference links but not inline links, it allows a title to begin\nwith `\"` and end with `)`. 
`Markdown.pl` 1.0.1 even allows" +- "titles with no closing quotation mark, though 1.0.2b8 does not.\nIt seems preferable to adopt a simple, rational rule that works\nthe same way in inline links and link reference definitions.)" - "Spaces, tabs, and up to one line ending is allowed around the destination and\ntitle:\n\n```````````````````````````````` example\n[link]( /uri\n \"title\" )\n.\n

    link

    \n````````````````````````````````" - "But it is not allowed between the link text and the\nfollowing parenthesis:\n\n```````````````````````````````` example\n[link] (/uri)\n.\n

    [link] (/uri)

    \n````````````````````````````````" - "The link text may contain balanced brackets, but not unbalanced ones,\nunless they are escaped:\n\n```````````````````````````````` example\n[link [foo [bar]]](/uri)\n.\n

    link [foo [bar]]

    \n````````````````````````````````" @@ -910,11 +904,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n[foo\n.\n

    [foohttps://example.com/?search=](uri)

    \n````````````````````````````````" - "There are three kinds of [reference link](@)s:\n[full](#full-reference-link), [collapsed](#collapsed-reference-link),\nand [shortcut](#shortcut-reference-link)." - "A [full reference link](@)\nconsists of a [link text] immediately followed by a [link label]\nthat [matches] a [link reference definition] elsewhere in the document." -- "A [link label](@) begins with a left bracket (`[`) and ends\nwith the first right bracket (`]`" -- ") that is not backslash-escaped.\nBetween these brackets there must be at least one character that is not a space,\ntab, or line ending.\nUnescaped square bracket characters are not allowed inside the\nopening and closing square brackets of [link labels]" -- ". A link\nlabel can have at most 999 characters inside the square\nbrackets." -- "One label [matches](@)\nanother just in case their normalized forms are equal. To normalize a\nlabel, strip off the opening and closing brackets,\nperform the *Unicode case fold*" -- ", strip leading and trailing\nspaces, tabs, and line endings, and collapse consecutive internal\nspaces, tabs, and line endings to a single space. If there are multiple\nmatching reference link definitions, the one that comes first in the" +- "A [link label](@) begins with a left bracket (`[`) and ends\nwith the first right bracket (`]`) that is not backslash-escaped.\nBetween these brackets there must be at least one character that is not a space,\ntab, or line ending." +- "Unescaped square bracket characters are not allowed inside the\nopening and closing square brackets of [link labels]. A link\nlabel can have at most 999 characters inside the square\nbrackets." +- "One label [matches](@)\nanother just in case their normalized forms are equal. 
To normalize a\nlabel, strip off the opening and closing brackets,\nperform the *Unicode case fold*, strip leading and trailing" +- "spaces, tabs, and line endings, and collapse consecutive internal\nspaces, tabs, and line endings to a single space. If there are multiple\nmatching reference link definitions, the one that comes first in the" - "document is used. (It is desirable in such cases to emit a warning.)\n\nThe link's URI and title are provided by the matching [link\nreference definition].\n\nHere is a simple example:" - "```````````````````````````````` example\n[foo][bar]\n\n[bar]: /url \"title\"\n.\n

    foo

    \n````````````````````````````````\n\n\nThe rules for the [link text] are the same as with\n[inline links]. Thus:" - "The link text may contain balanced brackets, but not unbalanced ones,\nunless they are escaped:\n\n```````````````````````````````` example\n[link [foo [bar]]][ref]\n\n[ref]: /uri\n.\n

    link [foo [bar]]

    \n````````````````````````````````" @@ -937,9 +930,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is a departure from John Gruber's original Markdown syntax\ndescription, which explicitly allows whitespace between the link\ntext and the link label. It brings reference links in line with\n[inline links], which (according to both original Markdown and" - "this spec) cannot have whitespace after the link text. More\nimportantly, it prevents inadvertent capture of consecutive\n[shortcut reference links]. If whitespace is allowed between the\nlink text and the link label, then in the following we will have" - "a single reference link, not two shortcut reference links, as\nintended:\n\n``` markdown\n[foo]\n[bar]\n\n[foo]: /url1\n[bar]: /url2\n```" -- "(Note that [shortcut reference links] were introduced by Gruber\nhimself in a beta version of `Markdown.pl`" -- ", but never included\nin the official syntax description. Without shortcut reference\nlinks, it is harmless to allow space between the link text and\nlink label; but once shortcut references are introduced, it is" -- "too dangerous to allow this, as it frequently leads to\nunintended results.)\n\nWhen there are multiple matching [link reference definitions],\nthe first is used:" +- "(Note that [shortcut reference links] were introduced by Gruber\nhimself in a beta version of `Markdown.pl`, but never included\nin the official syntax description. Without shortcut reference\nlinks, it is harmless to allow space between the link text and" +- "link label; but once shortcut references are introduced, it is\ntoo dangerous to allow this, as it frequently leads to\nunintended results.)\n\nWhen there are multiple matching [link reference definitions],\nthe first is used:" - "```````````````````````````````` example\n[foo]: /url1\n\n[foo]: /url2\n\n[bar][foo]\n.\n

    bar

    \n````````````````````````````````" - "Note that matching is performed on normalized strings, not parsed\ninline content. So the following does not match, even though the\nlabels define equivalent inline content:" - "```````````````````````````````` example\n[bar][foo\\!]\n\n[foo!]: /url\n.\n

    [bar][foo!]

    \n````````````````````````````````\n\n\n[Link labels] cannot contain brackets, unless they are\nbackslash-escaped:" @@ -949,15 +941,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n[foo][ref\\[]\n\n[ref\\[]: /uri\n.\n

    foo

    \n````````````````````````````````\n\n\nNote that in this example `]` is not backslash-escaped:" - "```````````````````````````````` example\n[bar\\\\]: /uri\n\n[bar\\\\]\n.\n

    bar\\

    \n````````````````````````````````\n\n\nA [link label] must contain at least one character that is not a space, tab, or\nline ending:" - "```````````````````````````````` example\n[]\n\n[]: /uri\n.\n

    []

    \n

    []: /uri

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n[\n ]\n\n[\n ]: /uri\n.\n

    [\n]

    \n

    [\n]: /uri

    \n````````````````````````````````" -- "A [collapsed reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument, followed by the string `[]`" -- ".\nThe contents of the link label are parsed as inlines,\nwhich are used as the link's text. The link's URI and title are\nprovided by the matching reference link definition. Thus,\n`[foo][]` is equivalent to `[foo][foo]`." +- "A [collapsed reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument, followed by the string `[]`.\nThe contents of the link label are parsed as inlines,\nwhich are used as the link'" +- "s text. The link's URI and title are\nprovided by the matching reference link definition. Thus,\n`[foo][]` is equivalent to `[foo][foo]`." - "```````````````````````````````` example\n[foo][]\n\n[foo]: /url \"title\"\n.\n

    foo

    \n````````````````````````````````" - "```````````````````````````````` example\n[*foo* bar][]\n\n[*foo* bar]: /url \"title\"\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThe link labels are case-insensitive:" - "```````````````````````````````` example\n[Foo][]\n\n[foo]: /url \"title\"\n.\n

    Foo

    \n````````````````````````````````" - "As with full reference links, spaces, tabs, or line endings are not\nallowed between the two sets of brackets:" - "```````````````````````````````` example\n[foo] \n[]\n\n[foo]: /url \"title\"\n.\n

    foo\n[]

    \n````````````````````````````````" -- "A [shortcut reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument and is not followed by `[]`" -- " or a link label.\nThe contents of the link label are parsed as inlines,\nwhich are used as the link's text. The link's URI and title\nare provided by the matching link reference definition.\nThus, `[foo]` is equivalent to `[foo][]`." +- "A [shortcut reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument and is not followed by `[]` or a link label.\nThe contents of the link label are parsed as inlines,\nwhich are used as the link'" +- "s text. The link's URI and title\nare provided by the matching link reference definition.\nThus, `[foo]` is equivalent to `[foo][]`." - "```````````````````````````````` example\n[foo]\n\n[foo]: /url \"title\"\n.\n

    foo

    \n````````````````````````````````" - "```````````````````````````````` example\n[*foo* bar]\n\n[*foo* bar]: /url \"title\"\n.\n

    foo bar

    \n````````````````````````````````" - "```````````````````````````````` example\n[[*foo* bar]]\n\n[*foo* bar]: /url \"title\"\n.\n

    [foo bar]

    \n````````````````````````````````" @@ -975,8 +967,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Here `[foo]` is not parsed as a shortcut reference, because it\nis followed by a link label (even though `[bar]` is not defined):" - "```````````````````````````````` example\n[foo][bar][baz]\n\n[baz]: /url1\n[foo]: /url2\n.\n

    [foo]bar

    \n````````````````````````````````" - "## Images" -- "Syntax for images is like the syntax for links, with one\ndifference. Instead of [link text], we have an\n[image description](@). The rules for this are the\nsame as for [link text], except that (a) an\nimage description starts with `![` rather than `[`" -- ", and\n(b) an image description may contain links.\nAn image description has inline elements\nas its contents. When an image is rendered to HTML,\nthis is standardly used as the image's `alt` attribute." +- "Syntax for images is like the syntax for links, with one\ndifference. Instead of [link text], we have an\n[image description](@). The rules for this are the\nsame as for [link text], except that (a) an\nimage description starts with `![` rather than `[`, and" +- "(b) an image description may contain links.\nAn image description has inline elements\nas its contents. When an image is rendered to HTML,\nthis is standardly used as the image's `alt` attribute." - "```````````````````````````````` example\n![foo](/url \"title\")\n.\n

    \"foo\"

    \n````````````````````````````````" - "```````````````````````````````` example\n![foo *bar*]\n\n[foo *bar*]: train.jpg \"train & tracks\"\n.\n

    \"foo

    \n````````````````````````````````" - "```````````````````````````````` example\n![foo ![bar](/url)](/url2)\n.\n

    \"foo

    \n````````````````````````````````" @@ -1003,8 +995,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n\\![foo]\n\n[foo]: /url \"title\"\n.\n

    !foo

    \n````````````````````````````````" - "## Autolinks\n\n[Autolink](@)s are absolute URIs and email addresses inside\n`<` and `>`. They are parsed as links, with the URL or email address\nas the link label." - "A [URI autolink](@) consists of `<`, followed by an\n[absolute URI] followed by `>`. It is parsed as\na link to the URI, with the URI as the link's label." -- "An [absolute URI](@),\nfor these purposes, consists of a [scheme] followed by a colon (`:`)\nfollowed by zero or more characters other than [ASCII control\ncharacters][ASCII control character], [space], `<`, and `>`" -- ".\nIf the URI includes these characters, they must be percent-encoded\n(e.g. `%20` for a space)." +- "An [absolute URI](@),\nfor these purposes, consists of a [scheme] followed by a colon (`:`)\nfollowed by zero or more characters other than [ASCII control\ncharacters][ASCII control character], [space], `<`, and `>`." +- "If the URI includes these characters, they must be percent-encoded\n(e.g. `%20` for a space)." - "For purposes of this spec, a [scheme](@) is any sequence\nof 2--32 characters beginning with an ASCII letter and followed\nby any combination of ASCII letters, digits, or the symbols plus\n(\"+\"), period (\".\"), or hyphen (\"-\").\n\nHere are some valid autolinks:" - "```````````````````````````````` example\n\n.\n

    http://foo.bar.baz

    \n````````````````````````````````" - "```````````````````````````````` example\n\n.\n

    https://foo.bar.baz/test?q=hello&id=22&boolean

    \n````````````````````````````````" @@ -1073,33 +1065,35 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\nfoo \n.\n

    foo

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n### foo\\\n.\n

    foo\\

    \n````````````````````````````````" - "```````````````````````````````` example\n### foo \n.\n

    foo

    \n````````````````````````````````" - "## Soft line breaks" -- "A regular line ending (not in a code span or HTML tag) that is not\npreceded by two or more spaces or a backslash is parsed as a\n[softbreak](@)" -- ". (A soft line break may be rendered in HTML either as a\n[line ending] or as a space. The result will be the same in\nbrowsers. In the examples here, a [line ending] will be used.)" -- "```````````````````````````````` example\nfoo\nbaz\n.\n

    foo\nbaz

    \n````````````````````````````````\n\n\nSpaces at the end of the line and beginning of the next line are\nremoved:" -- "```````````````````````````````` example\nfoo \n baz\n.\n

    foo\nbaz

    \n````````````````````````````````\n\n\nA conforming parser may render a soft line break in HTML either as a\nline ending or as a space." -- "A renderer may also provide an option to render soft line breaks\nas hard line breaks." +- "A regular line ending (not in a code span or HTML tag) that is not\npreceded by two or more spaces or a backslash is parsed as a\n[softbreak](@). (A soft line break may be rendered in HTML either as a\n[line ending]" +- " or as a space. The result will be the same in\nbrowsers. In the examples here, a [line ending] will be used.)\n\n```````````````````````````````` example\nfoo\nbaz\n.\n

    foo\nbaz

    \n````````````````````````````````" +- "Spaces at the end of the line and beginning of the next line are\nremoved:\n\n```````````````````````````````` example\nfoo \n baz\n.\n

    foo\nbaz

    \n````````````````````````````````" +- "A conforming parser may render a soft line break in HTML either as a\nline ending or as a space.\n\nA renderer may also provide an option to render soft line breaks\nas hard line breaks." - "## Textual content\n\nAny characters not given an interpretation by the above rules will\nbe parsed as plain textual content.\n\n```````````````````````````````` example\nhello $.;'there\n.\n

    hello $.;'there

    \n````````````````````````````````" - "```````````````````````````````` example\nFoo χρῆν\n.\n

    Foo χρῆν

    \n````````````````````````````````\n\n\nInternal spaces are preserved verbatim:" - "```````````````````````````````` example\nMultiple spaces\n.\n

    Multiple spaces

    \n````````````````````````````````\n\n\n" - "# Appendix: A parsing strategy\n\nIn this appendix we describe some features of the parsing strategy\nused in the CommonMark reference implementations." - "## Overview\n\nParsing has two phases:" -- "1. In the first phase, lines of input are consumed and the block\nstructure of the document---its division into paragraphs, block quotes,\nlist items, and so on---is constructed. Text is assigned to these" +- "1." +- "In the first phase, lines of input are consumed and the block\nstructure of the document---its division into paragraphs, block quotes,\nlist items, and so on---is constructed. Text is assigned to these" - "blocks but not parsed. Link reference definitions are parsed and a\nmap of links is constructed." - "2. In the second phase, the raw text contents of paragraphs and headings\nare parsed into sequences of Markdown inline elements (strings,\ncode spans, links, emphasis, and so on), using the map of link\nreferences constructed in phase 1." -- "At each point in processing, the document is represented as a tree of\n**blocks**. The root of the tree is a `document` block. The `document`\nmay have any number of other blocks as **children**" -- ". These children\nmay, in turn, have other blocks as children. The last child of a block\nis normally considered **open**, meaning that subsequent lines of input\ncan alter its contents. (Blocks that are not open are **closed**" -- ".)\nHere, for example, is a possible document tree, with the open blocks\nmarked by arrows:" -- "``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item" -- " -> paragraph\n \"aliquando id\"\n```" +- "At each point in processing, the document is represented as a tree of\n**blocks**. The root of the tree is a `document` block. The `document`\nmay have any number of other blocks as **children**. 
These children" +- "may, in turn, have other blocks as children. The last child of a block\nis normally considered **open**, meaning that subsequent lines of input\ncan alter its contents. (Blocks that are not open are **closed**.)" +- "Here, for example, is a possible document tree, with the open blocks\nmarked by arrows:" +- "``` tree" +- "-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item\n -> paragraph" +- " \"aliquando id\"\n```" - "## Phase 1: block structure\n\nEach line that is processed has an effect on this tree. The line is\nanalyzed and, depending on its contents, the document may be altered\nin one or more of the following ways:" - "1. One or more open blocks may be closed.\n2. One or more new blocks may be created as children of the\n last open block.\n3. Text may be added to the last (deepest) open block remaining\n on the tree." - "Once a line has been incorporated into the tree in this way,\nit can be discarded, so input can be read in a stream.\n\nFor each line, we follow this procedure:" -- "1. First we iterate through the open blocks, starting with the\nroot document, and descending through last children down to the last\nopen block. Each block imposes a condition that the line must satisfy" -- "if the block is to remain open. For example, a block quote requires a\n`>`" -- " character. A paragraph requires a non-blank line.\nIn this phase we may match all or just some of the open\nblocks. But we cannot close unmatched blocks yet, because we may have a\n[lazy continuation line]." +- "1." +- "First we iterate through the open blocks, starting with the\nroot document, and descending through last children down to the last\nopen block. Each block imposes a condition that the line must satisfy" +- "if the block is to remain open. For example, a block quote requires a\n`>` character. 
A paragraph requires a non-blank line.\nIn this phase we may match all or just some of the open\nblocks. But we cannot close unmatched blocks yet, because we may have a\n[" +- "lazy continuation line]." - "2." -- "Next, after consuming the continuation markers for existing\nblocks, we look for new block starts (e.g. `>`" -- " for a block quote).\nIf we encounter a new block start, we close any blocks unmatched\nin step 1 before creating the new block as a child of the last\nmatched container block." +- "Next, after consuming the continuation markers for existing\nblocks, we look for new block starts (e.g. `>` for a block quote).\nIf we encounter a new block start, we close any blocks unmatched\nin step 1 before creating the new block as a child of the last" +- matched container block. - "3. Finally, we look at the remainder of the line (after block\nmarkers like `>`, list markers, and indentation have been consumed).\nThis is text that can be incorporated into the last open\nblock (a paragraph, code block, heading, or raw HTML)." - "Setext headings are formed when we see a line of a paragraph\nthat is a [setext heading underline]." - "Reference link definitions are detected when a paragraph is closed;\nthe accumulated text lines are parsed to see if they begin with\none or more reference link definitions. Any remainder becomes a\nnormal paragraph." @@ -1112,11 +1106,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ":\n\n``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n -> list_item\n -> paragraph\n \"Qui *quodsi iracundia*\"\n```\n\nThe fourth line," - "``` markdown\n> - aliquando id\n```" - "causes the `list_item` (and its child the `paragraph`) to be closed,\nand a new `list_item` opened up as child of the `list`. 
A `paragraph`\nis added as a child of the new `list_item`, to contain the text.\nWe thus obtain the final tree:" -- "``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item" -- " -> paragraph\n \"aliquando id\"\n```" +- "``` tree" +- "-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item\n -> paragraph" +- " \"aliquando id\"\n```" - "## Phase 2: inline structure\n\nOnce all of the input has been parsed, all open blocks are closed." - "We then \"walk the tree,\" visiting every node, and parse raw\nstring contents of paragraphs and headings as inlines. At this\npoint we have seen all the link reference definitions, so we can\nresolve reference links as we go." -- "``` tree\ndocument\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"\n softbreak\n str \"sit amet.\"\n list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n str \"Qui \"\n emph" +- "``` tree" +- "document\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"\n softbreak\n str \"sit amet.\"\n list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n str \"Qui \"\n emph" - " str \"quodsi iracundia\"\n list_item\n paragraph\n str \"aliquando id\"\n```\n\nNotice how the [line ending] in the first paragraph has\nbeen parsed as a `softbreak`, and the asterisks in the first list item\nhave become an `emph`." - "### An algorithm for parsing nested emphasis and links\n\nBy far the trickiest part of inline parsing is handling emphasis,\nstrong emphasis, links, and images. 
This is done using the following\nalgorithm.\n\nWhen we're parsing inlines and we hit either" - "- a run of `*` or `_` characters, or\n- a `[` or `![`" @@ -1127,9 +1123,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "#### *look for link or image*\n\nStarting at the top of the delimiter stack, we look backwards\nthrough the stack for an opening `[` or `![` delimiter." - "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`." - "- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image." -- " + If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`." +- "+ If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`." - "+ If we do, then" -- " * We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`." +- "* We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`." - " * We remove the opening delimiter.\n\n * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)" - "#### *process emphasis*\n\nParameter `stack_bottom` sets a lower bound to how far we\ndescend in the [delimiter stack]. If it is NULL, we can\ngo all the way to the bottom. Otherwise, we stop before\nvisiting `stack_bottom`." 
- "Let `current_position` point to the element on the [delimiter stack]\njust above `stack_bottom` (or the first element if `stack_bottom`\nis NULL)." @@ -1139,7 +1135,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n- If one is found:" - "+ Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener." - " + Remove any delimiters between the opener and closer from\n the delimiter stack." -- " + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element" +- + +- "Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element" - "of the delimiter stack. If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:" - "+ Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)" - " + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack." 
diff --git a/tests/snapshots/text_splitter_snapshots__markdown_trim@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__markdown_trim@commonmark_spec.md.snap index bada4b5..345adb0 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown_trim@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown_trim@commonmark_spec.md.snap @@ -19,8 +19,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - org/licenses/by- - "sa/4.0/)'\n..." - "# Introduction" -- "## What is" -- Markdown? +- "##" +- What is Markdown +- "?" - Markdown is a - plain text - format for @@ -48,9 +49,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - et/projects/ - markdown/syntax) - and a Perl -- "script (`" -- "Markdown.pl`)" -- for converting +- script ( +- "`Markdown.pl`" +- ) for converting - Markdown to - HTML. - In the next @@ -101,11 +102,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "to write, is its" - readability. - As Gruber writes -- ":\n\n>" +- ":" +- ">" - The overriding - design goal for -- "Markdown's" -- formatting +- "Markdown'" +- s formatting - "syntax is\n>" - to make it as - readable as @@ -119,12 +121,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "as-is, as\n>" - "plain text," - without looking -- "like it's been" -- marked up with -- "tags\n>" +- "like it'" +- s been marked up +- "with tags\n>" - or formatting - "instructions.\n>" -- "() @@ -143,7 +146,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - AsciiDoc from - the AsciiDoc - "manual:" -- "```\n1." +- "```" +- "1." - "List item one.\n+" - List item one - continued with a @@ -185,7 +189,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - And here is the - equivalent in - "Markdown:" -- "```\n1." +- "```" +- "1." - List item one. - List item one - continued with a @@ -237,13 +242,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "source, not just" - in the processed - document. 
-- "## Why is a spec" +- "##" +- Why is a spec - needed? -- "John Gruber's [" +- "John Gruber's" +- "[" - canonical - description of - "Markdown's" -- "syntax](https://" +- syntax +- "](https://" - daringfireball.n - et/projects/ - markdown/syntax) @@ -273,8 +281,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - think that - "they, too, must" - be indented four -- "spaces, but `" -- "Markdown.pl`" +- "spaces, but" +- "`Markdown.pl`" - does - not require that - "." @@ -287,10 +295,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - often lead to - surprises for - users in real -- documents. -- "(See [this" -- comment by John -- "Gruber](https://" +- documents. (See +- "[" +- this comment by +- "John\n Gruber" +- "](https://" - web.archive.org/ - web/ - 20170611172104/ @@ -326,8 +335,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - while others do - not). - (John Gruber has -- "also spoken [in" -- favor of +- also spoken +- "[" +- in favor of - requiring the - "blank\n lines" - "](https://" @@ -357,7 +367,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` markdown" - paragraph - code? -- " ```\n\n4." +- "```" +- "4." - What is the - exact rule for - determining when @@ -492,7 +503,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - this - "- and it can" - "screw things up`" -- " ```\n\n11." +- "```" +- "11." - Can list items - include section - headings? ( @@ -510,7 +522,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Can list items - be empty? - "``` markdown" -- " * a\n *" +- "* a\n *" - "* b\n ```" - "13." - Can link @@ -538,13 +550,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - In the absence - "of a spec, early" - implementers -- "consulted `" -- "Markdown.pl`" +- consulted +- "`Markdown.pl`" - to resolve these - ambiguities. 
-- "But `Markdown.pl" -- "` was quite" -- "buggy, and" +- But +- "`Markdown.pl`" +- "was quite buggy," +- and - gave manifestly - bad results in - "many cases, so" @@ -577,12 +590,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "worse, because" - nothing in - Markdown counts -- "as a \"syntax" -- "error,\" the" -- divergence often -- "isn't discovered" +- "as a \"" +- "syntax error,\"" +- the divergence +- "often isn'" +- t discovered - right away. -- "## About this" +- "##" +- About this - document - This document - attempts to @@ -600,8 +615,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - conformance - tests. An - accompanying -- "script `" -- "spec_tests.py`" +- script +- "`spec_tests.py`" - can be used to - run the tests - against any @@ -653,9 +668,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - spec says what - counts as a link - "destination, but" -- "it doesn't" -- mandate that non -- "-ASCII" +- "it doesn'" +- t mandate that +- non-ASCII - characters in - the URL be - percent-encoded. @@ -685,20 +700,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - URLs. - This document is - generated from a -- "text file, `" -- "spec.txt`," -- written +- "text file," +- "`spec.txt`" +- ", written" - in Markdown with - a small - extension for - the side-by-side - tests. -- "The script `" -- tools/ -- "makespec.py` can" -- be used to -- "convert `" -- "spec.txt` into" +- The script +- "`tools/" +- "makespec.py`" +- can be used to +- convert +- "`spec.txt` into" - HTML or - CommonMark ( - which can then @@ -711,11 +726,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - used to - represent tabs. - "# Preliminaries" -- "## Characters" -- and lines +- "##" +- Characters and +- lines - Any sequence of -- "[characters] is" -- a valid +- "[characters]" +- is a valid - CommonMark - document. - "A [character](@)" @@ -747,22 +763,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - limited - to a certain - encoding. 
-- "A [line](@) is a" -- sequence of zero -- "or more [" +- "A [line](@)" +- is a sequence of +- "zero or more [" - "characters]" - other than line -- "feed (`U+000A`)" -- or carriage +- "feed (`U+000A`" +- ) or carriage - "return (`U+000D`" - "),\nfollowed by a" -- "[line ending] or" -- by the end of +- "[line ending]" +- or by the end of - file. -- "A [line ending](" -- "@) is a line" -- "feed (`U+000A`)," -- a carriage +- A +- "[line ending](@)" +- is a line feed ( +- "`U+000A`" +- "), a carriage" - "return\n(`U+000D`" - ) not followed - "by a line feed," @@ -775,28 +792,30 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "characters, or a" - line containing - "only spaces\n(" -- "`U+0020`) or" -- "tabs (`U+0009`)," -- "is called a [" -- "blank line](@)." +- "`U+0020`" +- ) or tabs ( +- "`U+0009`" +- "), is called a" +- "[blank line](@)." - The following - definitions of - character - classes will be - used in this - "spec:" -- "A [Unicode" +- A +- "[Unicode" - whitespace -- "character](@) is" -- a character in -- "the Unicode `Zs`" -- general +- "character](@)" +- is a character +- in the Unicode +- "`Zs` general" - "category, or a" -- "tab (`U+0009`)," -- "line feed (`U+" -- "000A`), form" -- "feed (`U+000C`)," -- or +- "tab (`U+0009`" +- "), line feed (" +- "`U+000A`" +- "), form feed (" +- "`U+000C`), or" - carriage return - "(`U+000D`)." - "[Unicode" @@ -806,44 +825,50 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Unicode - whitespace - "characters]." -- "A [tab](@) is `U" -- "+0009`." +- "A [tab](@) is" +- "`U+0009`." - "A [space](@) is" - "`U+0020`." -- "An [ASCII" -- control -- "character](@) is" -- a character -- "between `U+0000–" -- "1F` (both" -- "including) or `U" -- "+007F`." -- "An [ASCII" +- An +- "[ASCII control" +- "character](@)" +- is a character +- between +- "`U+0000–1F`" +- (both +- including) or +- "`U+007F`." 
+- An +- "[ASCII" - punctuation - "character](@)\nis" - "`!`, `\"`, `#`," -- "`$`, `%`, `&`, `" -- "'`, `(`, `)`," -- "`*`, `+`, `,`, `" -- "-`, `.`, `/`" +- "`$`, `%`, `&`," +- "`'`, `(`, `)`," +- "`*`, `+`, `,`," +- "`-`, `.`, `/`" - "(U+0021–2F)," -- "`:`, `;`, `<`, `" -- "=`, `>`, `?`," -- "`@` (U+003A–0040" -- "),\n`[`, `\\`, `]`" -- ", `^`, `_`, `` `" -- "`` (U+005B–0060)" -- ", \n`{`, `|`, `}`" -- ", or `~` (U+007B" -- –007E). -- "A [Unicode" +- "`:`, `;`, `<`," +- "`=`, `>`, `?`," +- "`@`" +- "(U+003A–0040)," +- "`[`, `\\`, `]`," +- "`^`, `_`," +- "`` ` ``" +- "(U+005B–0060)," +- "`{`, `|`, `}`" +- ", or `~`" +- (U+007B–007E). +- A +- "[Unicode" - punctuation -- "character](@) is" -- a character in -- "the Unicode `P`" +- "character](@)" +- is a character +- in the Unicode +- "`P`" - (puncuation) or -- "`S` (symbol)" -- general +- "`S`" +- (symbol) general - categories. - "## Tabs" - Tabs in lines @@ -945,8 +970,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - part of the - content. - In the following -- "case `>` is" -- followed by a +- "case `>`" +- is followed by a - "tab," - which is treated - as if it were @@ -1022,19 +1047,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*→*→*→\n.\n
    " - "````````````````" - "````````````````" -- "## Insecure" +- "##" +- Insecure - characters - For security - "reasons, the" - Unicode -- "character `U+" -- "0000` must be" -- replaced +- character +- "`U+0000`" +- must be replaced - with the - REPLACEMENT -- "CHARACTER (`U+" -- "FFFD`)." -- "## Backslash" +- CHARACTER ( +- "`U+FFFD`)." +- "##" +- Backslash - escapes - Any ASCII - punctuation @@ -1128,8 +1155,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - A backslash at - the end of the -- "line is a [hard" -- "line break]:" +- "line is a [" +- "hard line break]" +- ":" - "````````````````" - "````````````````" - example @@ -1197,8 +1225,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - including URLs - "and link titles," - "link references," -- "and [info" -- "strings] in [" +- "and [" +- "info strings] in" +- "[" - fenced code - "blocks]:" - "````````````````" @@ -1233,7 +1262,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    " - "````````````````" - "````````````````" -- "## Entity and" +- "##" +- Entity and - numeric - character - references @@ -1251,13 +1281,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - with the - following - "exceptions:" -- "- Entity and" +- "-" +- Entity and - character - references are - not recognized - in code - blocks and code -- "spans.\n\n-" +- spans. +- "-" - Entity and - character - references @@ -1274,9 +1306,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - can be used - in place of a - "literal `*`" -- "character, `*" -- ";` cannot" -- "replace\n `*`" +- "character," +- "`*`" +- cannot replace +- "`*`" - in emphasis - "delimiters," - bullet list @@ -1300,11 +1333,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "." - "[Entity" - "references](@)" -- "consist of `&` +" -- any of the valid +- "consist of `&`" +- + any of the +- valid - HTML5 entity -- "names + `;`" -- ". The\ndocument" +- "names + `;`. The" +- document - "" - "````````````````" - "````````````````" -- "[Decimal numeric" +- "[" +- Decimal numeric - character - "references](@)" - "consist of `&#`" -- + a string of 1- -- "-7 arabic digits" -- "+ `;`" -- ". A" +- + a string of 1 +- "--" +- 7 arabic digits +- "+ `;`. A" - numeric - character - reference is @@ -1353,15 +1388,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - code points will - be replaced by - the REPLACEMENT -- "CHARACTER (`U+" -- "FFFD`" +- CHARACTER ( +- "`U+FFFD`" - ). - For security - "reasons," -- "the code point `" -- "U+0000` will" -- also be replaced -- "by `U+FFFD`." +- the code point +- "`U+0000`" +- will also be +- replaced by +- "`U+FFFD`." - "````````````````" - "````````````````" - example @@ -1370,17 +1406,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    # Ӓ Ϡ �

    " - "````````````````" - "````````````````" -- "[Hexadecimal" +- "[" +- Hexadecimal - numeric - character - "references](@)" - "consist of `&#`" -- "+\neither `X` or" -- "`x` + a string" -- of 1-6 -- hexadecimal -- "digits + `;`" -- "." +- " +\neither `X` or" +- "`x`" +- + a string of 1- +- 6 hexadecimal +- "digits + `;`." - They too are - parsed as the - corresponding @@ -1428,8 +1464,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - without a - trailing - semicolon (such -- "as `©`)," -- these are not +- "as `©`" +- "), these are not" - "recognized here," - because it makes - the grammar too @@ -1468,10 +1504,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - code spans or - "code blocks," - "including\nURLs," -- "[link titles]," -- "and [fenced code" -- "block][] [info" -- "strings]:" +- "[link titles]" +- ", and [" +- fenced code +- "block][] [" +- "info strings]:" - "````````````````" - "````````````````" - example @@ -1599,7 +1636,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ;tit")

    - "````````````````" - "````````````````" -- "# Blocks and" +- "#" +- Blocks and - inlines - We can think of - a document as a @@ -1622,9 +1660,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - like - headings and - paragraphs) -- "contain [inline]" -- (@) content--- -- "text," +- contain +- "[inline](@)" +- "content---text," - "links," - "emphasized text," - "images, code" @@ -1698,9 +1736,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - affect the - inline parsing - of any other. -- "## Container" -- blocks and leaf -- blocks +- "##" +- Container blocks +- and leaf blocks - We can divide - blocks into two - "types:" @@ -1710,8 +1748,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ")," - which can - contain other -- "blocks, and [" -- "leaf blocks](#" +- "blocks, and" +- "[leaf blocks](#" - "leaf-blocks)," - which cannot. - "# Leaf blocks" @@ -1722,8 +1760,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - that make up a - Markdown - document. 
-- "## Thematic" -- breaks +- "##" +- Thematic breaks - A line - consisting of - optionally up to @@ -1732,8 +1770,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - followed by a - sequence of - three or more -- "matching `-`, `_" -- "`, or `*`" +- "matching `-`," +- "`_`, or `*`" - "characters, each" - followed - optionally by @@ -1918,7 +1956,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - interpreted as - the underline of - "a [setext" -- "heading], the" +- "heading]" +- ", the" - interpretation - "as a\n[" - "setext heading]" @@ -1976,8 +2015,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## ATX headings" -- "An [ATX heading]" -- (@) +- An +- "[ATX heading](@)" - consists of a - string of - "characters," @@ -1985,8 +2024,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "content, between" - an - opening sequence -- of 1--6 -- "unescaped `#`" +- of 1-- +- "6 unescaped `#`" - characters and - an optional - closing sequence @@ -2003,7 +2042,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - line. - The optional - closing sequence -- "of `#`s must be" +- "of `#`" +- s must be - preceded by - spaces or tabs - and may be @@ -2028,8 +2068,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - The heading - level is equal - "to the number\nof" -- "`#` characters" -- in the opening +- "`#`" +- characters in +- the opening - sequence. - "Simple headings:" - "````````````````" @@ -2047,8 +2088,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    foo
    " - "````````````````" - "````````````````" -- "More than six `#" -- "` characters is" +- More than six +- "`#`" +- characters is - "not a heading:" - "````````````````" - "````````````````" @@ -2099,8 +2141,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - This is not a - "heading, because" -- "the first `#` is" -- "escaped:" +- "the first `#`" +- "is escaped:" - "````````````````" - "````````````````" - example @@ -2207,12 +2249,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ".\n

    foo

    " - "````````````````" - "````````````````" -- "A sequence of `#" -- "` characters" -- with anything -- but spaces or -- tabs following -- it +- A sequence of +- "`#`" +- characters with +- anything but +- spaces or tabs +- following it - is not a closing - "sequence, but" - counts as part @@ -2293,10 +2335,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    " - "````````````````" - "````````````````" -- "## Setext" -- headings -- "A [setext" -- "heading](@)" +- "##" +- Setext headings +- A +- "[setext heading]" +- (@) - consists of one - or more - "lines of text," @@ -2336,15 +2379,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "list items], or" - "[HTML block][" - "HTML blocks]." -- "A [setext" -- heading -- "underline](@) is" -- a sequence of -- "`=` characters" -- or a sequence of -- "`-` characters," -- with no more -- than 3 +- A +- "[setext heading" +- "underline](@)" +- is a sequence of +- "`=`" +- characters or a +- "sequence of `-`" +- "characters, with" +- no more than 3 - spaces of - indentation and - any number of @@ -2356,8 +2399,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - characters are - "used in\nthe [" - setext heading -- "underline], and" -- a level 2 +- "underline]" +- ", and a level 2" - "heading if `-`" - characters are - used. @@ -2418,8 +2461,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "headings's raw" - content as - inlines. 
-- "The heading's" -- raw content is +- "The heading'" +- s raw content is - formed by - concatenating - the lines and @@ -2572,11 +2615,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - The setext - heading - underline cannot -- "be a [lazy" +- "be a [" +- lazy - continuation -- "line] in a list" -- item or block -- "quote:" +- "line]" +- in a list item +- "or block quote:" - "````````````````" - "````````````````" - example @@ -2613,8 +2657,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - since otherwise - the paragraph - becomes part -- "of the heading's" -- "content:" +- "of the heading'" +- "s content:" - "````````````````" - "````````````````" - example @@ -2695,10 +2739,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - If you want a -- "heading with `>" -- "foo` as its" -- "literal text," -- you can +- heading with +- "`> foo`" +- as its literal +- "text, you can" - use backslash - "escapes:" - "````````````````" @@ -2710,8 +2754,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "**Compatibility" -- "note:** Most" -- existing +- "note:**" +- Most existing - Markdown - implementations - do not allow the @@ -2721,8 +2765,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - But there is no - consensus about - how to interpret -- "``` markdown\nFoo" -- "bar\n---\nbaz\n```" +- "``` markdown" +- "Foo\nbar\n---\nbaz" +- "```" - One can find - four different - "interpretations:" @@ -2731,9 +2776,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "bar\", paragraph" - "\"baz\"" - "2. paragraph \"" -- "Foo bar\"," -- "thematic break," -- "paragraph \"baz\"" +- "Foo bar\"" +- ", thematic break" +- ", paragraph \"baz" +- "\"" - "3. paragraph \"" - "Foo bar --- baz\"" - "4. 
heading \"Foo" @@ -2785,8 +2831,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - that cannot - "count as a [" - setext heading -- "underline], such" -- as +- "underline]" +- ", such as" - "````````````````" - "````````````````" - example @@ -2809,19 +2855,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - baz

    - "````````````````" - "````````````````" -- "## Indented code" +- "##" +- Indented code - blocks -- "An [indented" -- "code block](@)" +- An +- "[indented code" +- "block](@)" - is composed of - "one or more\n[" - "indented chunks]" - separated by - "blank lines.\nAn" - "[indented chunk]" -- (@) is a -- sequence of non- -- "blank lines," +- (@) +- is a sequence of +- "non-blank lines," - each preceded by - four or more - spaces of @@ -2833,8 +2881,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "of the lines," - including - "trailing\n[" -- "line endings]," -- minus four +- "line endings]" +- ", minus four" - spaces of - indentation. - An indented code @@ -2880,8 +2928,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - indicating that - material belongs - "to a [list\nitem]" -- "[list items]," -- the list item +- "[list items]" +- ", the list item" - interpretation - takes precedence - ":" @@ -3052,8 +3100,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Trailing spaces - or tabs are - included in the -- "code block's" -- "content:" +- "code block'" +- "s content:" - "````````````````" - "````````````````" - example @@ -3062,17 +3110,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    " - "````````````````" - "````````````````" -- "## Fenced code" +- "##" +- Fenced code - blocks -- "A [code fence](@" -- ) is a sequence +- A +- "[code fence](@)" +- is a sequence - of at least - three - consecutive - backtick -- "characters (`` `" -- "``) or\ntildes (" -- "`~`" +- characters ( +- "`` ` ``) or" +- "tildes (`~`" - ). - (Tildes and - backticks cannot @@ -3096,8 +3146,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - leading and - trailing - spaces or tabs -- "and called the [" -- "info string](@)" +- and called the +- "[info string](@)" - ". If the [" - "info string]" - comes @@ -3123,10 +3173,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - consists of all - subsequent lines - ", until" -- "a closing [code" -- "fence] of the" -- same type as the -- code block +- "a closing [" +- "code fence]" +- of the same type +- as the code +- block - began with ( - backticks or - "tildes), and" @@ -3220,10 +3271,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - not parsed - as inlines. - The first word -- "of the [info" -- "string] is" -- typically used -- to +- "of the [" +- "info string]" +- is typically +- used to - specify the - language of the - "code sample, and" @@ -3325,11 +3376,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - end of the - document - (or the -- "enclosing [block" -- "quote][block" -- "quotes] or [list" -- "item][list items" -- "]):" +- "enclosing [" +- "block quote][" +- "block quotes] or" +- "[list item][" +- "list items]):" - "````````````````" - "````````````````" - example @@ -3543,8 +3594,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - opening code - fence. 
- Although this -- "spec doesn't" -- mandate any +- "spec doesn'" +- t mandate any - particular - treatment of - "the info string," @@ -3634,8 +3685,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - Closing code - fences cannot -- "have [info" -- "strings]:" +- "have [" +- "info strings]:" - "````````````````" - "````````````````" - example @@ -3647,8 +3698,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## HTML blocks" -- "An [HTML block](" -- "@) is a group of" +- An +- "[HTML block](@)" +- is a group of - lines that is - treated - as raw HTML (and @@ -3656,26 +3708,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - escaped in HTML - output). - There are seven -- "kinds of [HTML" -- "block], which" -- can be defined -- by their +- "kinds of [" +- "HTML block]" +- ", which can be" +- defined by their - start and end - conditions. - The block begins - with a line that - meets a - "[start condition" -- "](@) (after up" -- to three -- optional spaces -- of indentation). +- "](@)" +- (after up to +- three optional +- spaces of +- indentation). - It ends with the - first subsequent - line that meets - a matching - "[end condition](" -- "@), or the last" +- "@)" +- ", or the last" - line of the - "document, or the" - "last line of\nthe" @@ -3687,50 +3741,56 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "block, if no" - line is - encountered that -- "meets the [end" -- "condition]. If" +- "meets the [" +- "end condition]" +- ". If" - the first line - "meets both the [" - "start condition]" - "and the [end" -- "condition], the" -- block will +- "condition]" +- ", the block will" - contain just -- "that line.\n\n1." +- that line. +- "1." - "**Start" - "condition:**" - line begins with -- "the string ``, or" -- the end of the -- "line.\\" +- "string `>`" +- ", or the end of" +- "the line.\\" - "**End condition:" -- "** line" -- contains an end -- "tag\n`
    `," -- "``, ``, or `` (case" -- "-insensitive; it" +- "**" +- line contains an +- "end tag\n`
    `" +- ", ``," +- "``, or" +- "``" +- (case- +- insensitive; it - need not match - the start tag). - "2." - "**Start" - "condition:**" - line begins with -- "the string ``." +- "**" +- line contains +- "the string `-->`" +- "." - "3." - "**Start" - "condition:**" @@ -3738,7 +3798,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the string ``." - "4." - "**Start" @@ -3748,9 +3809,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - followed by an - "ASCII letter.\\" - "**End condition:" -- "** line contains" -- "the character `>" -- "`." +- "**" +- line contains +- the character +- "`>`." - "5." - "**Start" - "condition:**" @@ -3758,7 +3820,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the string - "``" - "." - "6." @@ -3771,65 +3834,71 @@ input_file: tests/inputs/markdown/commonmark_spec.md - of the strings ( - case-insensitive - ") `address`," -- "`article`, `" -- "aside`, `base`," -- "`basefont`, `" -- "blockquote`, `" -- "body`,\n`caption`" -- ", `center`, `col" -- "`, `colgroup`, `" -- "dd`, `details`," +- "`article`," +- "`aside`, `base`," +- "`basefont`," +- "`blockquote`," +- "`body`," +- "`caption`," +- "`center`, `col`," +- "`colgroup`, `dd`" +- ", `details`," - "`dialog`,\n`dir`," -- "`div`, `dl`, `dt" -- "`, `fieldset`, `" -- "figcaption`, `" -- "figure`," +- "`div`, `dl`," +- "`dt`, `fieldset`" +- ", `figcaption`," +- "`figure`," - "`footer`, `form`" -- ", `frame`, `" -- "frameset`,\n`h1`," -- "`h2`, `h3`, `h4`" -- ", `h5`, `h6`, `" -- "head`, `header`," -- "`hr`,\n`html`," -- "`iframe`, `" -- "legend`, `li`, `" -- "link`, `main`, `" -- "menu`, `menuitem" -- "`,\n`nav`," +- ", `frame`," +- "`frameset`,\n`h1`" +- ", `h2`, `h3`," +- "`h4`, `h5`, `h6`" +- ", `head`," +- "`header`, `hr`," +- "`html`, `iframe`" +- ", `legend`, `li`" +- ", `link`, `main`" +- ", `menu`," +- "`menuitem`," +- "`nav`," - "`noframes`, `ol`" -- ", `optgroup`, `" -- "option`, `p`, `" -- "param`,\n`search`" -- ", `section`, `" -- "summary`, 
`table" -- "`, `tbody`, `td`" -- ",\n`tfoot`, `th`," -- "`thead`, `title`" -- ", `tr`, `track`," -- "`ul`" +- ", `optgroup`," +- "`option`, `p`," +- "`param`," +- "`search`," +- "`section`," +- "`summary`," +- "`table`, `tbody`" +- ", `td`,\n`tfoot`," +- "`th`, `thead`," +- "`title`, `tr`," +- "`track`, `ul`" - ", followed" - "by a space, a" - "tab, the end of" - "the line, the" -- "string `>`" -- ", or\nthe string" -- "`/>`.\\" +- "string `>`, or" +- "the string `/>`." +- "\\" - "**End condition:" -- "** line is" -- "followed by a [" -- "blank line]." +- "**" +- line is followed +- "by a [blank line" +- "]." - "7." - "**Start" - "condition:**" - line begins with -- "a complete [open" -- "tag]\n(with any [" -- "tag name] other" -- "than `pre`, `" -- "script`,\n`style`" -- ", or `textarea`)" -- "or a complete [" -- "closing tag]," +- "a complete [" +- "open tag]" +- "(with any [" +- "tag name]" +- "other than `pre`" +- ", `script`," +- "`style`, or" +- "`textarea`" +- ) or a complete +- "[closing tag]," - followed by zero - or more spaces - "and tabs," @@ -3837,25 +3906,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - end of the line. - "\\" - "**End condition:" -- "** line is" -- "followed by a [" -- "blank line]." +- "**" +- line is followed +- "by a [blank line" +- "]." - HTML blocks - continue until - they are closed - by their - "appropriate\n[" -- "end condition]," -- or the last line -- of the document -- "or other [" -- "container\nblock" +- "end condition]" +- ", or the last" +- line of the +- document or +- other +- "[container\nblock" - "](#container-" - blocks) - "." - This means any -- HTML **within an -- "HTML\nblock**" +- HTML +- "**within an HTML" +- block** - that might - otherwise be - recognised as a @@ -3866,14 +3938,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - passed through - "as-is, without" - changing -- "the parser's" -- state. 
-- "For instance, `<" -- "pre>` within an" -- HTML block -- "started by `<" -- "table>` will not" -- affect +- "the parser'" +- s state. +- "For instance," +- "`
    `"
    +- within an HTML
    +- block started by
    +- "``"
    +- will not affect
     - the parser state
     - ; as the HTML
     - block was
    @@ -3910,10 +3982,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - regular parsing
     - "resumes, with a"
     - "paragraph,"
    -- "emphasised `"
    -- "world` and"
    -- inline and block
    -- HTML following.
    +- emphasised
    +- "`world`"
    +- and inline and
    +- block HTML
    +- following.
     - "All types of ["
     - "HTML blocks]"
     - except type 7
    @@ -4057,9 +4130,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "````````````````"
     - The initial tag
    -- "doesn't even"
    -- need to be a
    -- valid
    +- "doesn'"
    +- t even need to
    +- be a valid
     - "tag, as long as"
     - it starts like
     - "one:"
    @@ -4133,9 +4206,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "````````````````"
     - "To start an ["
    -- "HTML block] with"
    -- a tag that is *
    -- not* in the
    +- "HTML block]"
    +- with a tag that
    +- is *not* in the
     - list of block-
     - level tags in (6
     - "), you must put"
    @@ -4189,20 +4262,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - either block-
     - level or inline-
     - "level tags.\nThe"
    -- "`` tag is a"
    -- nice example.
    +- "``"
    +- tag is a nice
    +- example.
     - We can surround
     - content with
    -- "`` tags in"
    -- three different
    -- ways.
    +- "``"
    +- tags in three
    +- different ways.
     - "In this case, we"
     - get a raw
     - "HTML block,"
    -- "because the `<"
    -- "del>` tag is on"
    -- a line by itself
    -- ":"
    +- because the
    +- "``"
    +- tag is on a line
    +- "by itself:"
     - "````````````````"
     - "````````````````"
     - example
    @@ -4215,9 +4289,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - get a raw HTML
     - block that just
     - "includes\nthe"
    -- "`` tag ("
    -- because it ends
    -- with the
    +- "``"
    +- tag (because it
    +- ends with the
     - following blank
     - line).
     - So the contents
    @@ -4233,12 +4307,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "````````````````"
     - "Finally, in this"
    -- "case, the `"
    -- "` tags are"
    +- "case, the"
    +- "``"
    +- tags are
     - "interpreted\nas ["
    -- "raw HTML] *"
    -- inside* the
    -- CommonMark
    +- "raw HTML]"
    +- "*inside*"
    +- the CommonMark
     - paragraph.
     - (Because
     - the tag is not
    @@ -4260,9 +4335,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - designed to
     - contain literal
     - "content\n(`pre`,"
    -- "`script`, `style"
    -- "`, `textarea`),"
    -- "comments,"
    +- "`script`,"
    +- "`style`,"
    +- "`textarea`"
    +- "), comments,"
     - processing
     - "instructions,"
     - and declarations
    @@ -4372,11 +4448,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - end at the
     - end of the
     - document (or the
    -- "enclosing [block"
    -- "quote][block"
    -- "quotes]\nor ["
    -- "list item][list"
    -- "items]):"
    +- "enclosing ["
    +- "block quote]["
    +- "block quotes]\nor"
    +- "[list item]["
    +- "list items]):"
     - "````````````````"
     - "````````````````"
     - example
    @@ -4542,9 +4618,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "````````````````"
     - An HTML block of
    -- types 1--6 can
    -- interrupt a
    -- "paragraph, and"
    +- types 1--
    +- 6 can interrupt
    +- "a paragraph, and"
     - need not be
     - preceded by a
     - blank line.
    @@ -4591,19 +4667,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - This rule
     - differs from
    -- "John Gruber's"
    -- original
    +- "John Gruber'"
    +- s original
     - Markdown syntax
     - "specification,"
    -- "which says:\n\n>"
    +- "which says:"
    +- ">"
     - The only
     - restrictions are
     - that block-level
     - HTML elements —
     - "> e.g. `
    `," -- "`
    `, ``, `

    `, etc." -- — must be +- "`

    `," +- "`
    `, `

    `" +- ", etc. — must be" - "separated from\n>" - surrounding - content by blank @@ -4614,20 +4691,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - be indented with - spaces or tabs. - In some ways -- "Gruber's rule is" -- more restrictive -- than the one -- "given\nhere:" -- "- It requires" -- that an HTML -- block be +- "Gruber'" +- s rule is more +- restrictive than +- the one given +- "here:" +- "-" +- It requires that +- an HTML block be - preceded by a - blank line. -- "- It does not" +- "-" +- It does not - allow the start - tag to be - indented. -- "- It requires a" +- "-" +- It requires a - matching end tag - ", which it also" - does not allow @@ -4636,16 +4716,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Most Markdown - implementations - (including some -- "of Gruber's own)" -- do not +- "of Gruber'" +- s own) do not - respect all of - these - restrictions. - There is one - "respect, however" - ", in which" -- "Gruber's rule is" -- more liberal +- "Gruber'" +- s rule is more +- liberal - than the one - "given here," - since it allows @@ -4713,8 +4794,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - content inside - tags as text if - the open tag has -- "the attribute `" -- "markdown=1`" +- the attribute +- "`markdown=1`" - "." - The rule given - above seems a @@ -4795,51 +4876,56 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "inside `

    `"
     - "tags, but as"
     - "described\n[above"
    -- "][HTML blocks],"
    -- raw HTML blocks
    -- "starting with `<"
    -- "pre>`\n*can*"
    +- "][HTML blocks]"
    +- ", raw HTML"
    +- blocks starting
    +- "with `
    `"
    +- "*can*"
     - contain blank
     - lines.
    -- "## Link"
    -- reference
    +- "##"
    +- Link reference
     - definitions
    -- "A [link"
    -- reference
    +- A
    +- "[link reference"
     - "definition](@)"
     - "consists of a ["
    -- "link label],"
    -- optionally
    +- "link label]"
    +- ", optionally"
     - preceded by up
     - to three spaces
     - of
     - "indentation,"
     - followed
    -- "by a colon (`:`)"
    -- ", optional"
    +- "by a colon (`:`"
    +- "), optional"
     - spaces or tabs (
     - including up to
     - "one\n[line ending"
    -- "]), a [link"
    -- "destination],"
    +- "]), a ["
    +- link destination
    +- "],"
     - optional spaces
     - or tabs (
     - including up to
     - "one\n[line ending"
    -- "]), and an"
    +- "]"
    +- "), and an"
     - "optional [link"
    -- "title], which if"
    -- it is present
    -- must be
    +- "title]"
    +- ", which if it is"
    +- present must be
     - separated
    -- "from the [link"
    -- "destination] by"
    -- spaces or tabs.
    +- "from the ["
    +- link destination
    +- "]"
    +- by spaces or
    +- tabs.
     - No further
     - character may
     - occur.
    -- "A [link"
    -- reference
    +- "A ["
    +- link reference
     - "definition]"
     - does not
     - correspond to a
    @@ -4856,8 +4942,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - elsewhere in the
     - "document.  [Link"
     - reference
    -- "definitions] can"
    -- come either
    +- "definitions]"
    +- can come either
     - before or after
     - the links that
     - "use\nthem."
    @@ -5045,8 +5131,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - As noted in the
     - "section on ["
    -- "Links], matching"
    -- of labels is
    +- "Links]"
    +- ", matching of"
    +- labels is
     - case-insensitive
     - "(see [matches])."
     - "````````````````"
    @@ -5169,8 +5256,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "

    [foo]

    " - "````````````````" - "````````````````" -- "A [link" -- reference +- "A [" +- link reference - "definition]" - cannot interrupt - a paragraph. @@ -5225,8 +5312,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - foo

    - "````````````````" - "````````````````" -- "Several [link" -- reference +- "Several [" +- link reference - "definitions]" - can occur one - "after another," @@ -5253,9 +5340,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "url\">baz

    " - "````````````````" - "````````````````" -- "[Link reference" -- "definitions] can" -- occur +- "[" +- Link reference +- "definitions]" +- can occur - inside block - "containers, like" - lists and block @@ -5285,17 +5373,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - interpreted as - other - kinds of blocks -- "forms a [" -- "paragraph](@)." +- forms a +- "[paragraph](@)." - The contents of - the paragraph - are the result - of parsing the -- "paragraph's raw" -- content as +- "paragraph'" +- s raw content as - inlines. -- "The paragraph's" -- raw content +- "The paragraph'" +- s raw content - is formed by - concatenating - the lines and @@ -5398,8 +5486,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - that ends with - two or more - spaces will not -- "end with a [hard" -- "line\nbreak]:" +- "end with a [" +- "hard line\nbreak]" +- ":" - "````````````````" - "````````````````" - example @@ -5433,13 +5522,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    aaa

    " - "````````````````" - "````````````````" -- "# Container" -- blocks -- "A [container" -- "block](#" -- container-blocks -- ) is a block -- that has other +- "#" +- Container blocks +- A +- "[container block" +- "](#container-" +- blocks) +- is a block that +- has other - blocks as its - contents. - There are two @@ -5458,7 +5548,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - The general - form of the - "definition is:" -- "> If X is a" +- ">" +- If X is a - sequence of - "blocks, then the" - "result of\n>" @@ -5476,15 +5567,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - list item by - explaining - how these can be -- "*generated* from" -- their contents. +- "*generated*" +- from their +- contents. - This should - suffice - to define the - "syntax, although" - it does not give -- a recipe for * -- parsing* +- a recipe for +- "*parsing*" - these - constructions. - (A recipe is @@ -5497,17 +5589,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - parsing-strategy - ).) - "## Block quotes" -- "A [block quote" +- A +- "[block quote" - "marker](@)," - optionally - preceded by up - to three spaces - "of indentation," - consists of (a) -- "the character `>" -- "` together with" -- a following -- space of +- the character +- "`>`" +- together with a +- following space +- of - "indentation, or" - (b) a single - "character `>`" @@ -5523,14 +5617,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - lines *Ls* - constitute a - sequence -- "of blocks *Bs*," -- then the result -- of prepending a -- "[block quote" -- "marker] to the" -- beginning of -- each line in *Ls -- "*\n is a" +- of blocks *Bs* +- ", then the" +- result of +- "prepending a [" +- block quote +- "marker]" +- to the beginning +- of each line in +- "*Ls*\n is a" - "[block quote](#" - block-quotes) - containing *Bs*. 
@@ -5538,28 +5633,31 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "**Laziness.**" - If a string of - lines *Ls* -- "constitute a [" -- "block\n quote" +- constitute a +- "[block\n quote" - "](#block-quotes)" -- with contents * -- "Bs*, then the" +- with contents +- "*Bs*" +- ", then the" - result of - deleting - "the initial [" - block quote -- "marker] from one" -- or +- "marker]" +- from one or - more lines in - which the next - character other - than a space or - tab after the -- "[block quote" +- "[" +- block quote - "marker] is [" - paragraph - continuation -- "text] is a block" -- quote with *Bs* +- "text]" +- is a block quote +- with *Bs* - as its content. - "[Paragraph" - continuation @@ -5571,19 +5669,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - but does - not occur at the - beginning of the -- "paragraph.\n\n3." +- paragraph. +- "3." - "**" - Consecutiveness. -- "** A document" +- "**" +- A document - cannot contain - "two [block" -- "quotes] in a row" -- unless there is -- "a [blank line]" +- "quotes]" +- in a row unless +- "there is a [" +- "blank line]" - between them. - Nothing else -- "counts as a [" -- "block quote](#" +- counts as a +- "[block quote](#" - block-quotes). - Here is a simple - "example:" @@ -5648,7 +5749,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - The Laziness - clause allows us - "to omit the `>`" -- "before\n[" +- " before\n[" - paragraph - continuation - "text]:" @@ -5690,7 +5791,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - block quote - "markers]." 
- "For example, the" -- "`> ` cannot be" +- "`> `" +- cannot be - omitted in the - second line of - "``` markdown" @@ -5708,9 +5810,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "Similarly, if we" -- "omit the `> ` in" -- the second line -- of +- "omit the `> `" +- in the second +- line of - "``` markdown" - "> - foo\n> - bar" - "```" @@ -5732,9 +5834,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - For the same -- "reason, we can't" -- "omit the `> ` in" -- front of +- "reason, we can'" +- "t omit the `> `" +- in front of - subsequent lines - of an indented - or fenced code @@ -5784,10 +5886,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```markdown" - "> foo" - "> - bar\n```" -- "the `- bar` is" -- indented too far -- "to start a list," -- "and can't" +- "the `- bar`" +- is indented too +- far to start a +- "list, and can't" - be an indented - code block - because indented @@ -5795,7 +5897,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - cannot - interrupt - "paragraphs, so" -- "it is [paragraph" +- "it is [" +- paragraph - continuation - "text]." - A block quote @@ -5848,11 +5951,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "implementations," - including John - "Gruber's" -- "original `" -- "Markdown.pl`," -- will parse this -- example as a -- single block +- original +- "`Markdown.pl`" +- ", will parse" +- this example as +- a single block - quote - with two - paragraphs. 
@@ -5963,8 +6066,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the Laziness - rule that any - number -- "of initial `>`s" -- may be omitted +- "of initial `>`" +- s may be omitted - on a - continuation - line of a @@ -6003,10 +6106,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - block in a block - "quote," - remember that -- "the [block quote" +- "the [" +- block quote - "marker] includes" -- "both the `>` and" -- a following +- "both the `>`" +- and a following - space of - indentation. So - "*five spaces*" @@ -6027,25 +6131,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## List items" -- "A [list marker](" -- "@) is a\n[" +- A +- "[list marker](@)" +- " is a\n[" - bullet list - "marker] or an [" - ordered list - "marker]." -- "A [bullet list" +- A +- "[bullet list" - "marker](@)\nis a" - "`-`, `+`, or `*`" - character. -- "An [ordered list" +- An +- "[ordered list" - "marker](@)" - is a sequence of -- 1--9 arabic -- "digits (`0-9`)," -- followed by +- 1-- +- 9 arabic digits +- "(`0-9`" +- "), followed by" - "either a\n`.`" -- "character or a `" -- ")`" +- character or a +- "`)`" - character. - (The reason for - the length @@ -6069,26 +6177,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - starting with a - character other - than a space or -- "tab, and *M*" -- is +- "tab, and *M* is" - a list marker of - width *W* - followed by 1 ≤ -- "*N* ≤ 4 spaces" -- "of indentation," +- "*N*" +- ≤ 4 spaces of +- "indentation," - then the result -- of prepending *M -- "* and the" +- of prepending +- "*M*" +- and the - following spaces - to the first - "line\n of *Ls*" - ", and indenting" - subsequent lines -- of *Ls* by *W + -- "N* spaces, is a" -- list item with * -- Bs* as its -- contents. +- of *Ls* by +- "*W + N*" +- "spaces, is a" +- list item with +- "*Bs*" +- as its contents. 
- The type of the - list item - (bullet or @@ -6113,7 +6223,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - starts on a line - that would - otherwise count -- "as [paragraph" +- "as [" +- paragraph - continuation - "text]---then (a)" - the lines *Ls* @@ -6132,8 +6243,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - that line is not - a list item. - "For example, let" -- "*Ls* be the" -- lines +- "*Ls*" +- be the lines - "````````````````" - "````````````````" - example @@ -6153,9 +6264,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- And let *M* be -- "the marker `1.`," -- and *N* +- And let *M* +- be the marker +- "`1.`, and *N*" - "= 2." - "Then rule #1" - says @@ -6317,8 +6428,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - occurs in the - same column as - the list marker -- "`1.`" -- "," +- "`1.`," - but is actually - contained in the - "list item," @@ -6337,8 +6447,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - occurs far to - the right of the - initial text of -- "the list item, `" -- "one`, but" +- "the list item," +- "`one`, but" - it is not - considered part - of the list item @@ -6508,18 +6618,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - one space of - "indentation," - then the result -- of prepending *M -- "* and the" +- of prepending +- "*M* and the" - following space - to the first -- "line of *Ls*," -- and indenting +- line of *Ls* +- ", and indenting" - subsequent lines -- of *Ls* by *W + -- "1* spaces, is a" -- list item with * -- Bs* as its -- contents. +- of *Ls* by +- "*W + 1*" +- "spaces, is a" +- list item with +- "*Bs*" +- as its contents. 
- If a line is - "empty, then it" - need not be @@ -6586,8 +6697,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "then by rule #2," - the contents - must be preceded -- by *one* space -- of indentation +- by *one* +- space of +- indentation - after the list - "marker:" - "````````````````" @@ -6731,30 +6843,31 @@ input_file: tests/inputs/markdown/commonmark_spec.md - If a sequence of - lines *Ls* - starting with a -- "single [blank" -- "line] constitute" -- a (possibly -- empty) +- "single [" +- "blank line]" +- constitute a ( +- possibly empty) - sequence of - "blocks *Bs*, and" -- "*M* is a list" -- marker of width -- "*W*" -- "," +- "*M*" +- is a list marker +- "of width *W*," - then the result -- of prepending *M -- "* to the first" -- "line of *Ls*," -- and +- of prepending +- "*M*" +- to the first +- line of *Ls* +- ", and" - preceding - subsequent lines -- of *Ls* by *W + -- 1* spaces of +- of *Ls* by +- "*W + 1*" +- spaces of - "indentation, is" - a -- list item with * -- Bs* as its -- contents. +- list item with +- "*Bs*" +- as its contents. 
- If a line is - "empty, then it" - need not be @@ -6803,8 +6916,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - spaces - following the - list marker -- "doesn't change" -- the required +- "doesn'" +- t change the +- required - "indentation:" - "````````````````" - "````````````````" @@ -7016,11 +7130,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "**Laziness.**" - If a string of - lines *Ls* -- "constitute a [" -- "list\n item" +- constitute a +- "[list\n item" - "](#list-items)" -- with contents * -- "Bs*, then the" +- with contents +- "*Bs*" +- ", then the" - result of - deleting - some or all of @@ -7033,7 +7148,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - space or tab - after the - indentation is -- "[paragraph" +- "[" +- paragraph - continuation - "text] is a" - list item with @@ -7130,9 +7246,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - not counted as a - list item by - "rules\n #1--" -- "5 counts as a [" -- "list item](#list" -- "-items)." +- 5 counts as a +- "[list item](#" +- list-items). - The rules for - sublists follow - from the general @@ -7257,12 +7373,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "### Motivation" -- "John Gruber's" -- Markdown spec +- "John Gruber'" +- s Markdown spec - says the - following about - "list items:" -- "1. \"" +- "1." +- "\"" - List markers - typically start - at the left @@ -7275,16 +7392,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - by one or more - spaces or a tab. - "\"" -- "2. \"" +- "2." +- "\"" - To make lists - "look nice, you" - can wrap items - with hanging - indents.... -- "But if you don't" -- "want to, you" +- "But if you don'" +- "t want to, you" - "don't have to.\"" -- "3. \"" +- "3." +- "\"" - List items may - consist of - multiple @@ -7295,7 +7414,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - be indented by - either 4 spaces - "or one\n tab.\"" -- "4. \"" +- "4." 
+- "\"" - It looks nice if - you indent every - line of the @@ -7305,7 +7425,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Markdown will - allow you to be - "lazy.\"" -- "5. \"" +- "5." +- "\"" - To put a - blockquote - within a list @@ -7313,7 +7434,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "blockquote's `>`" - delimiters need - "to be indented.\"" -- "6. \"" +- "6." +- "\"" - To put a code - block within a - "list item, the" @@ -7375,15 +7497,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "and principled," - and if the - reference -- "implementation `" -- "Markdown.pl` had" -- "followed it, it" -- probably would -- have +- implementation +- "`Markdown.pl`" +- "had followed it," +- it probably +- would have - become the - standard. -- "However, `" -- "Markdown.pl`" +- "However," +- "`Markdown.pl`" - allowed - paragraphs and - sublists to @@ -7419,7 +7541,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "python-Markdown," - "for example," - stuck with -- "Gruber's syntax" +- "Gruber'" +- s syntax - description and - the four-space - "rule, while" @@ -7428,8 +7551,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "marked, PHP" - "Markdown, and" - "others\nfollowed" -- "`Markdown.pl`'s" -- behavior more +- "`Markdown.pl`'" +- s behavior more - closely.) - "Unfortunately," - given the @@ -7454,8 +7577,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - four-space rule - or - the more -- "forgiving `" -- "Markdown.pl`" +- forgiving +- "`Markdown.pl`" - "behavior," - provided they - are laid out @@ -7517,7 +7640,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - with an - intervening - "paragraph," -- "``` html\n
    ` will" -- " not affect\n" +- "For instance, `
    ` within an HTML block started by `
    `" +- " will not affect\n" - the parser state; as the HTML block was started in by start condition 6 - ", it\nwill end at any blank line. This can be surprising:\n\n" - "````````````````````````````````" @@ -1509,8 +1513,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n" - "````````````````````````````````\n" - "\n" -- "In this case, the HTML block is terminated by the blank line — the `" -- "**Hello**`\n" +- "In this case, the HTML block is terminated by the blank line — the " +- "`**Hello**`\n" - "text remains verbatim — and regular parsing resumes, with a paragraph,\n" - "emphasised `world` and inline and block HTML following.\n\n" - "All types of [HTML blocks] except type 7 may interrupt\n" @@ -1644,18 +1648,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````````````````````\n" - "\n\nIn this case, we get a raw HTML block that just includes\n" -- "the ``" -- " tag (because it ends with the following blank\n" +- "the `` tag (because it ends with the following blank\n" - "line). So the contents get interpreted as CommonMark:\n\n" - "````````````````````````````````" - " example\n" - "\n\n*foo*\n\n\n.\n\n" - "

    foo

    \n
    \n" - "````````````````````````````````\n" -- "\n\nFinally, in this case, the ``" -- " tags are interpreted\nas [raw HTML] *inside*" -- " the CommonMark paragraph. (Because\n" -- "the tag is not on a line by itself, we get inline HTML\n" +- "\n\nFinally, in this case, the `` tags are interpreted\n" +- "as [raw HTML] *inside* the CommonMark paragraph. (Because" +- "\nthe tag is not on a line by itself, we get inline HTML\n" - "rather than an [HTML block].)\n\n" - "````````````````````````````````" - " example\n" @@ -1663,8 +1665,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n" - "````````````````````````````````\n" - "\n\nHTML tags designed to contain literal content\n(`pre`, `script`, " -- "`style`, `textarea`" -- "), comments, processing instructions,\nand declarations are treated somewhat differently.\n" +- "`style`, `textarea`), comments, processing instructions,\n" +- "and declarations are treated somewhat differently.\n" - "Instead of ending at the first blank line, these blocks\n" - "end at the first line containing a corresponding end tag.\n" - "As a result, these blocks can contain blank lines:\n\n" @@ -1816,16 +1818,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \nbaz

    \n" - "````````````````````````````````\n" - "\n\nThis rule differs from John Gruber's original Markdown syntax\n" -- "specification, which says:" -- "\n\n> The only restrictions are that block-level HTML elements —\n> " -- "e.g. `
    `, ``, `
    `, `<"
    -- "p>`, etc. — must be separated from\n> "
    +- "specification, which says:\n\n"
    +- "> "
    +- "The only restrictions are that block-level HTML elements —\n> e.g. "
    +- "`
    `, `
    `, `
    `, `

    `" +- ", etc. — must be separated from\n> " - "surrounding content by blank lines, and the start and end tags of the" - "\n> block should not be indented with spaces or tabs.\n\n" - "In some ways Gruber's rule is more restrictive than the one given\n" - "here:\n\n" -- "- It requires that an HTML block be preceded by a blank line." -- "\n- It does not allow the start tag to be indented.\n" +- "- It requires that an HTML block be preceded by a blank line.\n" +- "- It does not allow the start tag to be indented.\n" - "- It requires a matching end tag, which it also does not allow to" - "\n be indented.\n\n" - "Most Markdown implementations (including some of Gruber's own) do not\n" @@ -1853,14 +1856,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\nSome Markdown implementations have adopted a convention of\n" - "interpreting content inside tags as text if the open tag has\nthe attribute " -- "`markdown=1`" -- ". The rule given above seems a simpler and\n" +- "`markdown=1`. The rule given above seems a simpler and\n" - "more elegant way of achieving the same expressive power, which is also\n" -- much simpler to parse. -- "\n\nThe main potential drawback is that one can no longer paste HTML\n" +- "much simpler to parse.\n\n" +- "The main potential drawback is that one can no longer paste HTML\n" - "blocks into Markdown documents with 100% reliability. However,\n" -- "*in most cases*" -- " this will work fine, because the blank lines in\n" +- "*in most cases* this will work fine, because the blank lines in\n" - "HTML are usually followed by HTML block tags. For example:\n\n" - "````````````````````````````````" - " example\n" @@ -1869,8 +1870,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n

    \n" - "````````````````````````````````\n" - "\n\nThere are problems, however, if the inner tags are indented\n" -- "*and*" -- " separated by spaces, as then they will be interpreted as\n" +- "*and* separated by spaces, as then they will be interpreted as\n" - "an indented code block:\n\n" - "````````````````````````````````" - " example\n" @@ -1881,16 +1881,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````````````````````\n" - "\n\nFortunately, blank lines are usually not necessary and can be\n" -- "deleted. The exception is inside `
    `"
    -- " tags, but as described\n[above][HTML blocks]"
    -- ", raw HTML blocks starting with `
    `\n*can* contain blank lines.\n\n"
    +- "deleted.  The exception is inside `
    ` tags, but as described\n"
    +- "[above][HTML blocks], raw HTML blocks starting with `
    `\n"
    +- "*can* contain blank lines.\n\n"
     - "## Link reference definitions\n\n"
    -- "A [link reference definition](@)"
    -- "\nconsists of a [link label]"
    +- "A [link reference definition](@)\nconsists of a [link label]"
     - ", optionally preceded by up to three spaces of\nindentation, followed\n"
    -- "by a colon (`:`"
    -- "), optional spaces or tabs (including up to one\n[line ending]), a ["
    -- "link destination],\noptional spaces or tabs (including up to one\n[line ending]"
    +- "by a colon (`:`), optional spaces or tabs (including up to one\n"
    +- "[line ending]), a [link destination],\n"
    +- "optional spaces or tabs (including up to one\n[line ending]"
     - "), and an optional [link\ntitle]"
     - ", which if it is present must be separated\nfrom the [link destination]"
     - " by spaces or tabs.\nNo further character may occur.\n\n"
    @@ -2106,8 +2105,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "\n\n"
     - "## Paragraphs\n\n"
     - "A sequence of non-blank lines that cannot be interpreted as other\n"
    -- "kinds of blocks forms a [paragraph](@)"
    -- ".\nThe contents of the paragraph are the result of parsing the\nparagraph'"
    +- "kinds of blocks forms a [paragraph](@).\n"
    +- "The contents of the paragraph are the result of parsing the\nparagraph'"
     - "s raw content as inlines.  The paragraph's raw content\n"
     - "is formed by concatenating the lines and removing initial and final\n"
     - "spaces or tabs.\n\nA simple example with two paragraphs:\n"
    @@ -2182,18 +2181,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - " are meta-containers for [list items].\n\n"
     - "We define the syntax for container blocks recursively.  The general\n"
     - "form of the definition is:\n\n"
    -- "> If X is a sequence of blocks, then the result of\n> "
    +- "> "
    +- "If X is a sequence of blocks, then the result of\n> "
     - transforming X in such-and-such a way is a container of type Y
    -- "\n> with these blocks as its content."
    -- "\n\nSo, we explain what counts as a block quote or list item by explaining"
    -- "\nhow these can be *generated*"
    -- " from their contents. This should suffice\n"
    -- "to define the syntax, although it does not give a recipe for *parsing"
    -- "*\nthese constructions.  (A recipe is provided below in the section entitled\n"
    +- "\n> with these blocks as its content.\n\n"
    +- "So, we explain what counts as a block quote or list item by explaining\n"
    +- "how these can be *generated* from their contents. This should suffice\n"
    +- "to define the syntax, although it does not give a recipe for "
    +- "*parsing*\n"
    +- "these constructions.  (A recipe is provided below in the section entitled\n"
     - "[A parsing strategy](#appendix-a-parsing-strategy).)\n\n"
     - "## Block quotes\n\n"
    -- "A [block quote marker](@)"
    -- ",\noptionally preceded by up to three spaces of indentation,\n"
    +- "A [block quote marker](@),\n"
    +- "optionally preceded by up to three spaces of indentation,\n"
     - "consists of (a) the character `>` together with a following space of"
     - "\nindentation, or (b) a single character `>`"
     - " not followed by a space of\nindentation.\n\n"
    @@ -2207,21 +2207,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "2.  "
     - "**Laziness.**  If a string of lines *Ls*"
     - " constitute a [block\n    quote](#block-quotes) with contents "
    -- "*Bs*"
    -- ", then the result of deleting\n    the initial [block quote marker]"
    -- " from one or\n    "
    +- "*Bs*, then the result of deleting\n    the initial [block quote marker"
    +- "] from one or\n    "
     - more lines in which the next character other than a space or tab after the
     - "\n    [block quote marker] is [paragraph continuation\n    text]"
     - " is a block quote with *Bs* as its content.\n    "
    -- "[Paragraph continuation text](@)"
    -- " is text\n    "
    +- "[Paragraph continuation text](@) is text\n    "
     - "that will be parsed as part of the content of a paragraph, but does"
     - "\n    not occur at the beginning of the paragraph.\n\n"
     - "3.  "
    -- "**Consecutiveness.**"
    -- "  A document cannot contain two [block\n    quotes]"
    -- " in a row unless there is a [blank line] between them."
    -- "\n\nNothing else counts as a [block quote](#block-quotes).\n"
    +- "**Consecutiveness.**  A document cannot contain two [block"
    +- "\n    quotes] in a row unless there is a [blank line]"
    +- " between them.\n\n"
    +- "Nothing else counts as a [block quote](#block-quotes).\n"
     - "\nHere is a simple example:\n"
     - "\n"
     - "````````````````````````````````"
    @@ -2254,8 +2252,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "
    > # Foo\n> bar\n"
     - "> baz\n
    \n" - "````````````````````````````````\n" -- "\n\nThe Laziness clause allows us to omit the `>`" -- " before\n[paragraph continuation text]:\n\n" +- "\n\nThe Laziness clause allows us to omit the `>` before\n[" +- "paragraph continuation text]:\n\n" - "````````````````````````````````" - " example\n" - "> # Foo\n> bar\nbaz\n.\n
    \n" @@ -2341,8 +2339,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n
    \n

    bar

    \n
    \n" - "````````````````````````````````\n" - "\n\n(Most current Markdown implementations, including John Gruber's\noriginal " -- "`Markdown.pl`" -- ", will parse this example as a single block quote\n" +- "`Markdown.pl`, will parse this example as a single block quote\n" - "with two paragraphs. But it seems better to allow the author to decide\n" - "whether two block quotes or one are wanted.)\n\n" - "Consecutiveness means that if we put these block quotes together,\n" @@ -2394,8 +2391,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n

    baz

    \n" - "````````````````````````````````\n" - "\n\nIt is a consequence of the Laziness rule that any number\n" -- "of initial `>`" -- "s may be omitted on a continuation line of a\nnested block quote:\n\n" +- "of initial `>`s may be omitted on a continuation line of a\n" +- "nested block quote:\n\n" - "````````````````````````````````" - " example\n" - "> > > foo\nbar\n.\n
    \n
    \n" @@ -2411,8 +2408,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\nWhen including an indented code block in a block quote,\n" - "remember that the [block quote marker] includes\nboth the `>`" -- " and a following space of indentation. So *five spaces*" -- " are needed\nafter the `>`:\n\n" +- " and a following space of indentation. So *five spaces* are needed\n" +- "after the `>`:\n\n" - "````````````````````````````````" - " example\n" - "> code\n\n> not code\n.\n
    \n" @@ -2421,15 +2418,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n\n" - "## List items\n\n" -- "A [list marker](@)" -- " is a\n[bullet list marker] or an [ordered list marker]." -- "\n\nA [bullet list marker](@)\nis a `-`, `+`" +- "A [list marker](@) is a\n[bullet list marker]" +- " or an [ordered list marker].\n\n" +- "A [bullet list marker](@)\nis a `-`, `+`" - ", or `*` character.\n\n" -- "An [ordered list marker](@)" -- "\nis a sequence of 1--9 arabic digits (" -- "`0-9`), followed by either a\n`.` character or a " -- "`)`" -- " character. (The reason for the length\n" +- "An [ordered list marker](@)\nis a sequence of 1--" +- "9 arabic digits (`0-9`), followed by either a\n`.`" +- " character or a `)` character. (The reason for the length\n" - "limit is that with 10 digits we start seeing integer overflows\n" - "in some browsers.)\n\nThe following rules define [list items]:\n\n" - "1. " @@ -2437,20 +2432,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " constitute a sequence of\n blocks *Bs*" - " starting with a character other than a space or tab, and *M* is" - "\n a list marker of width *W* followed by 1 ≤ " -- "*N*" -- " ≤ 4 spaces of indentation,\n then the result of prepending " -- "*M* and the following spaces to the first line\n of " -- "*Ls*, and indenting subsequent lines of *Ls* by *W" -- " + N* spaces, is a\n list item with *Bs*" -- " as its contents. The type of the list item\n " +- "*N* ≤ 4 spaces of indentation,\n " +- then the result of prepending *M* +- " and the following spaces to the first line\n of *Ls*" +- ", and indenting subsequent lines of *Ls* by " +- "*W + N* spaces, is a\n list item with " +- "*Bs* as its contents. The type of the list item\n " - "(bullet or ordered) is determined by the type of its list marker.\n " - "If the list item is ordered, then it is also assigned a start\n " - "number, based on the ordered list marker.\n\n Exceptions:\n\n " - "1. 
When the first list item in a [list] interrupts\n " - "a paragraph---that is, when it starts on a line that would\n " - "otherwise count as [paragraph continuation text]---then (a)\n " -- "the lines *Ls* must not begin with a blank line, and (" -- "b) if\n " +- the lines *Ls* +- " must not begin with a blank line, and (b) if\n " - "the list item is ordered, the start number must be 1.\n " - "2. If any line is a [thematic break][thematic breaks]" - " then\n that line is not a list item.\n\n" @@ -2464,8 +2459,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    A block quote.

    \n
    \n" - "````````````````````````````````\n" - "\n\nAnd let *M* be the marker `1.`, and " -- "*N*" -- " = 2. Then rule #1 says\n" +- "*N* = 2. Then rule #1 says\n" - "that the following is an ordered list item with start number 1,\n" - "and the same contents as *Ls*:\n\n" - "````````````````````````````````" @@ -2528,14 +2522,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n
    \n
    \n" - "````````````````````````````````\n" - "\n\nHere `two` occurs in the same column as the list marker " -- "`1.`" -- ",\nbut is actually contained in the list item, because there is\n" -- sufficient indentation after the last containing blockquote marker. -- "\n\nThe converse is also possible. In the following example, the word " -- "`two`" -- "\n" -- "occurs far to the right of the initial text of the list item, `" -- "one`, but\n" +- "`1.`,\nbut is actually contained in the list item, because there is\n" +- "sufficient indentation after the last containing blockquote marker.\n\n" +- "The converse is also possible. In the following example, the word " +- "`two`\n" +- "occurs far to the right of the initial text of the list item, " +- "`one`, but\n" - "it is not considered part of the list item, because it is not indented" - "\nfar enough past the blockquote marker:\n\n" - "````````````````````````````````" @@ -2610,15 +2602,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "-1. not ok\n.\n

    -1. not ok

    \n" - "````````````````````````````````\n" - "\n\n\n2. **Item starting with indented code.**" -- " If a sequence of lines *Ls*" -- "\n constitute a sequence of blocks *Bs*" -- " starting with an indented code\n block, and *M*" -- " is a list marker of width *W*" -- " followed by\n one space of indentation, then the result of prepending " -- "*M* and the\n following space to the first line of " -- "*Ls*, and indenting subsequent lines\n of *Ls*" -- " by *W + 1* spaces, is a list item with *Bs" -- "* as its contents.\n " +- " If a sequence of lines *Ls*\n " +- constitute a sequence of blocks *Bs* starting with an indented code +- "\n block, and *M* is a list marker of width " +- "*W* followed by\n " +- "one space of indentation, then the result of prepending *M* and the" +- "\n following space to the first line of *Ls*" +- ", and indenting subsequent lines\n of *Ls* by " +- "*W + 1* spaces, is a list item with *Bs*" +- " as its contents.\n " - "If a line is empty, then it need not be indented. " - "The type of the\n " - list item (bullet or ordered) is determined by the type of its list @@ -2645,8 +2637,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\nIf the *first*" - " block in the list item is an indented code block,\n" -- "then by rule #2, the contents must be preceded by *one* space" -- " of indentation\nafter the list marker:\n\n" +- "then by rule #2, the contents must be preceded by *one*" +- " space of indentation\nafter the list marker:\n\n" - "````````````````````````````````" - " example\n" - " indented code\n\nparagraph\n\n more code\n.\n" @@ -2706,16 +2698,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````````````````````\n" - "\n\n3. 
**Item starting with a blank line.**" -- " If a sequence of lines *Ls*" -- "\n starting with a single [blank line] constitute a (possibly empty)" -- "\n sequence of blocks *Bs*, and *M*" -- " is a list marker of width *W*" -- ",\n then the result of prepending *M*" -- " to the first line of *Ls*" -- ", and\n preceding subsequent lines of *Ls* by " -- "*W + 1*" -- " spaces of indentation, is a\n list item with *Bs*" -- " as its contents.\n " +- " If a sequence of lines *Ls*\n starting with a single [" +- "blank line] constitute a (possibly empty)\n sequence of blocks *Bs*" +- ", and *M* is a list marker of width *W*,\n " +- "then the result of prepending *M* to the first line of " +- "*Ls*, and\n preceding subsequent lines of *Ls* by " +- "*W + 1* spaces of indentation, is a\n " +- "list item with *Bs* as its contents.\n " - "If a line is empty, then it need not be indented. " - "The type of the\n " - list item (bullet or ordered) is determined by the type of its list @@ -2740,8 +2729,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````````````````````\n" - "\n\nA list item can begin with at most one blank line.\n" -- "In the following example, `foo`" -- " is not part of the list\nitem:\n\n" +- "In the following example, `foo` is not part of the list\n" +- "item:\n\n" - "````````````````````````````````" - " example\n" - "-\n\n foo\n.\n
      \n
    • \n
    \n" @@ -2754,9 +2743,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- foo\n-\n- bar\n.\n
      \n
    • foo
    • \n" - "
    • \n
    • bar
    • \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "It does not matter whether there are spaces or tabs following the [list marker]:\n" -- "\n" +- "\n\nIt does not matter whether there are spaces or tabs following the [list marker" +- "]:\n\n" - "````````````````````````````````" - " example\n" - "- foo\n- \n- bar\n.\n
    " - "````````````````````````````````" -- "Normally the `>`" -- that begins a block quote may be followed +- "Normally the `>` that begins a block quote may be followed" - "optionally by a space, which is not considered part of the" -- "content. In the following case `>`" -- "is followed by a tab," +- "content. In the following case `>` is followed by a tab," - which is treated as if it were expanded into three spaces. - "Since one of these spaces is considered part of the\ndelimiter, `foo`" - is considered to be indented six spaces @@ -313,8 +323,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "example\n*→*→*→\n.\n
    " - "````````````````````````````````" - "## Insecure characters" -- "For security reasons, the Unicode character `U+0000`" -- " must be replaced\nwith the REPLACEMENT CHARACTER (`U+FFFD`)." +- "For security reasons, the Unicode character `U+0000` must be replaced" +- "with the REPLACEMENT CHARACTER (`U+FFFD`)." - "## Backslash escapes\n\nAny ASCII punctuation character may be backslash-escaped:" - "````````````````````````````````" - example @@ -404,20 +414,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Entity and numeric character references" - Valid HTML entity references and numeric character references - "can be used in place of the corresponding Unicode character,\nwith the following exceptions:" -- "- Entity and character references are not recognized in code" -- blocks and code spans. -- "- Entity and character references cannot stand in place of" +- "-" +- "Entity and character references are not recognized in code\n blocks and code spans." +- "-" +- Entity and character references cannot stand in place of - special characters that define structural elements in -- "CommonMark. For example, although `*`" -- " can be used\n in place of a literal `*` character," -- "`*` cannot replace\n `*`" +- "CommonMark. For example, although `*` can be used" +- "in place of a literal `*` character, `*`" +- " cannot replace\n `*`" - " in emphasis delimiters, bullet list markers, or thematic\n breaks." - Conforming CommonMark parsers need not store information about - whether a particular character was represented in the source - using a Unicode character or an entity reference. -- "[Entity references](@) consist of `&`" -- " + any of the valid\nHTML5 entity names + `;`" -- ". The\ndocument " +- "[Entity references](@) consist of `&` + any of the valid" +- "HTML5 entity names + `;`. The\ndocument" +- "" - is used as an authoritative source for the valid entity - references and their corresponding code points. 
- "````````````````````````````````" @@ -430,21 +441,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "¾ ℋ ⅆ\n∲ ≧̸

    " - "````````````````````````````````" - "[Decimal numeric character\nreferences](@)\nconsist of `&#`" -- "+ a string of 1--7 arabic digits + `;`" -- ". A\nnumeric character reference is parsed as the corresponding" +- "+ a string of 1--7 arabic digits + `;`. A" +- numeric character reference is parsed as the corresponding - Unicode character. Invalid Unicode code points will be replaced by -- "the REPLACEMENT CHARACTER (`U+FFFD`" -- "). For security reasons,\nthe code point `U+0000`" -- "will also be replaced by `U+FFFD`." +- "the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons," +- "the code point `U+0000` will also be replaced by" +- "`U+FFFD`." - "````````````````````````````````" - example - "# Ӓ Ϡ �\n." - "

    # Ӓ Ϡ �

    " - "````````````````````````````````" -- "[Hexadecimal numeric character\nreferences](@) consist of `&#`" -- " +\neither `X` or `x`" -- "+ a string of 1-6 hexadecimal digits + `;`" -- ".\nThey too are parsed as the corresponding Unicode character (this" +- "[Hexadecimal numeric character\nreferences](@) consist of `&#` +" +- "either `X` or `x`" +- "+ a string of 1-6 hexadecimal digits + `;`." +- They too are parsed as the corresponding Unicode character (this - time specified with a hexadecimal numeral instead of decimal). - "````````````````````````````````" - example @@ -462,8 +473,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "&ThisIsNotDefined; &hi?;

    " - "````````````````````````````````" - Although HTML5 does accept some entity references -- "without a trailing semicolon (such as `©`" -- "), these are not\nrecognized here, because it makes the grammar too ambiguous:" +- "without a trailing semicolon (such as `©`), these are not" +- "recognized here, because it makes the grammar too ambiguous:" - "````````````````````````````````" - "example\n©\n.\n

    &copy

    " - "````````````````````````````````" @@ -547,8 +558,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "quotations, lists, headings, rules, and code blocks." - Some blocks (like - block quotes and list items) contain other blocks; others (like -- "headings and paragraphs) contain [inline](@)" -- "content---text," +- "headings and paragraphs) contain [inline](@) content---text," - "links, emphasized text, images, code spans, and so on." - "## Precedence" - Indicators of block structure always take precedence over indicators @@ -576,8 +586,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Markdown document. - "## Thematic breaks" - "A line consisting of optionally up to three spaces of indentation, followed by a" -- "sequence of three or more matching `-`, `_`, or `*` characters," -- "each followed\noptionally by any number of spaces or tabs, forms a" +- "sequence of three or more matching `-`, `_`, or `*`" +- "characters, each followed" +- "optionally by any number of spaces or tabs, forms a" - "[thematic break](@)." - "````````````````````````````````" - "example\n***\n---\n___\n.\n
    \n
    \n
    " @@ -673,9 +684,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## ATX headings" - "An [ATX heading](@)" - "consists of a string of characters, parsed as inline content, between an" -- "opening sequence of 1--6 unescaped `#`" -- " characters and an optional\nclosing sequence of any number of unescaped `#`" -- " characters.\nThe opening sequence of `#`" +- "opening sequence of 1--6 unescaped `#` characters and an optional" +- "closing sequence of any number of unescaped `#` characters." +- "The opening sequence of `#`" - "characters must be followed by spaces or tabs, or" - "by the end of line. The optional closing sequence of `#`" - s must be preceded by @@ -697,9 +708,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n####### foo\n.\n

    ####### foo

    " - "````````````````````````````````" -- "At least one space or tab is required between the `#`" -- " characters and the\nheading'" -- "s contents, unless the heading is empty. Note that many" +- "At least one space or tab is required between the `#` characters and the" +- "heading's contents, unless the heading is empty. Note that many" - "implementations currently do not require the space. However, the" - space was required by the - "[original ATX implementation](http://www.aaronsw.com/2002" @@ -757,8 +767,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n### foo ### \n.\n

    foo

    " - "````````````````````````````````" -- "A sequence of `#`" -- characters with anything but spaces or tabs following it +- "A sequence of `#` characters with anything but spaces or tabs following it" - "is not a closing sequence, but counts as part of the contents of the" - "heading:" - "````````````````````````````````" @@ -769,8 +778,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n# foo#\n.\n

    foo#

    " - "````````````````````````````````" -- "Backslash-escaped `#`" -- " characters do not count as part\nof the closing sequence:" +- "Backslash-escaped `#` characters do not count as part" +- "of the closing sequence:" - "````````````````````````````````" - example - "### foo \\###\n## foo #\\##\n# foo \\#\n." @@ -798,8 +807,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    \n

    " - "````````````````````````````````" - "## Setext headings" -- "A [setext heading](@)" -- consists of one or more +- "A [setext heading](@) consists of one or more" - "lines of text, not interrupted by a blank line, of which the first line" - "does not\nhave more than 3 spaces of indentation, followed by\na" - "[setext heading underline]. The lines of text must be such" @@ -809,12 +817,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "],\n[block quote][block quotes], [thematic break][thematic breaks]," - "[list item][list items], or [HTML block][HTML blocks]." - "A [setext heading underline](@) is a sequence of\n`=`" -- "characters or a sequence of `-`" -- "characters, with no more than 3" +- "characters or a sequence of `-` characters, with no more than 3" - spaces of indentation and any number of trailing spaces or tabs. -- "The heading is a level 1 heading if `=`" -- " characters are used in\nthe [setext heading underline]" -- ", and a level 2 heading if `-`" +- "The heading is a level 1 heading if `=` characters are used in" +- "the [setext heading underline], and a level 2 heading if `-`" - characters are used. The contents of the heading are the result - "of parsing the preceding lines of text as CommonMark inline\ncontent." - "In general, a setext heading need not be preceded or followed by a" @@ -951,14 +957,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "> foo\n-----\n.\n
    \n

    foo

    " - "
    \n
    " - "````````````````````````````````" -- "If you want a heading with `> foo` as its literal text, you" -- "can\nuse backslash escapes:" +- "If you want a heading with `> foo`" +- " as its literal text, you can\nuse backslash escapes:" - "````````````````````````````````" - example - "\\> foo\n------\n.\n

    > foo

    " - "````````````````````````````````" -- "**Compatibility note:**" -- Most existing Markdown implementations +- "**Compatibility note:** Most existing Markdown implementations" - do not allow the text of setext headings to span multiple lines. - But there is no consensus about how to interpret - "``` markdown\nFoo\nbar\n---\nbaz\n```" @@ -996,10 +1001,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - baz

    - "````````````````````````````````" - "## Indented code blocks" -- "An [indented code block](@)" -- " is composed of one or more\n[indented chunks] separated by blank lines." -- "An [indented chunk](@) is a sequence of non-blank lines" -- ",\neach preceded by four or more spaces of indentation. The contents of the code" +- "An [indented code block](@) is composed of one or more" +- "[indented chunks] separated by blank lines.\nAn" +- "[indented chunk](@) is a sequence of non-blank lines," +- each preceded by four or more spaces of indentation. The contents of the code - "block are the literal contents of the lines, including trailing\n[line endings]" - ", minus four spaces of indentation.\nAn indented code block has no [" - "info string]." @@ -1088,16 +1093,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " foo \n.\n
    foo  \n
    " - "````````````````````````````````" - "## Fenced code blocks" -- "A [code fence](@)" -- " is a sequence\nof at least three consecutive backtick characters (`` ` ``" -- ") or\ntildes (`~`" +- "A [code fence](@) is a sequence" +- "of at least three consecutive backtick characters (`` ` ``) or" +- "tildes (`~`" - "). (Tildes and backticks cannot be mixed.)\nA" - "[fenced code block](@)" - "begins with a code fence, preceded by up to three spaces of indentation." - The line with the opening code fence may optionally contain some text - following the code fence; this is trimmed of leading and trailing -- "spaces or tabs and called the [info string](@)" -- ". If the [info string] comes" +- "spaces or tabs and called the [info string](@). If the [" +- "info string] comes" - "after a backtick fence, it may not contain any backtick" - characters. (The reason for this restriction is that otherwise - some inline code would be incorrectly interpreted as the @@ -1293,8 +1298,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` aa ```\nfoo\n.\n

    aa" - foo

    - "````````````````````````````````" -- "[Info strings] for tilde code blocks can contain backticks and" -- "tildes:" +- "[Info strings]" +- "for tilde code blocks can contain backticks and tildes:" - "````````````````````````````````" - example - "~~~ aa ``` ~~~\nfoo\n~~~\n." @@ -1307,30 +1312,31 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    " - "````````````````````````````````" - "## HTML blocks" -- "An [HTML block](@)" -- is a group of lines that is treated +- "An [HTML block](@) is a group of lines that is treated" - as raw HTML (and will not be escaped in HTML output). - "There are seven kinds of [HTML block], which can be defined by their" - start and end conditions. The block begins with a line that meets a -- "[start condition](@)" -- (after up to three optional spaces of indentation). +- "[start condition](@) (after up to three optional spaces of indentation)." - It ends with the first subsequent line that meets a matching -- "[end condition](@), or the last line of the document, or the last" -- "line of\nthe [container block](#container-blocks)" -- " containing the current HTML\nblock, if no line is encountered that meets the [" -- "end condition]. If\nthe first line meets both the [start condition]" -- " and the [end\ncondition], the block will contain just that line." +- "[end condition](@)" +- ", or the last line of the document, or the last line of\nthe" +- "[container block](#container-blocks) containing the current HTML" +- "block, if no line is encountered that meets the [end condition]. If" +- "the first line meets both the [start condition] and the [end\ncondition" +- "], the block will contain just that line." - "1." - "**Start condition:** line begins with the string ``" -- ", or the end of the line.\\\n**End condition:**" +- "``, or the end of the line.\\\n**End condition:**" - " line contains an end tag\n`
    `, ``" -- ", ``, or `` (case-insensitive;" -- "it\nneed not match the start tag)." -- "2. **Start condition:** line begins with the string ``." -- "3. **Start condition:** line begins with the string ``." - "4." - "**Start condition:** line begins with the string ``, or\nthe string `/>`.\\\n**End condition:**" +- "`title`, `tr`, `track`, `ul`, followed" +- "by a space, a tab, the end of the line, the string" +- "`>`, or\nthe string `/>`.\\\n**End condition:**" - "line is followed by a [blank line]." - "7." -- "**Start condition:**" -- " line begins with a complete [open tag]\n(with any [tag name]" -- " other than `pre`, `script`,\n`style`, or `textarea`" -- ") or a complete [closing tag]," +- "**Start condition:** line begins with a complete [open tag]\n(with any" +- "[tag name] other than `pre`, `script`,\n`style`, or" +- "`textarea`) or a complete [closing tag]," - "followed by zero or more spaces and tabs, followed by the end of the" - "line.\\\n**End condition:** line is followed by a [blank line]." - "HTML blocks continue until they are closed by their appropriate\n[end condition]" @@ -1373,8 +1377,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - that might otherwise be recognised as a start condition will - "be ignored by the parser and passed through as-is, without changing\nthe parser" - "'s state." -- "For instance, `
    ` within an HTML block started by `` will"
    -- not affect
    +- "For instance, `
    ` within an HTML block started by `
    `" +- will not affect - the parser state; as the HTML block was started in by start condition 6 - ", it\nwill end at any blank line. This can be surprising:" - "````````````````````````````````" @@ -1385,8 +1389,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    world.\n

    " - "
    " - "````````````````````````````````" -- "In this case, the HTML block is terminated by the blank line — the `" -- "**Hello**`" +- "In this case, the HTML block is terminated by the blank line — the" +- "`**Hello**`" - "text remains verbatim — and regular parsing resumes, with a paragraph," - "emphasised `world` and inline and block HTML following." - "All types of [HTML blocks] except type 7 may interrupt" @@ -1475,8 +1479,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n``` c\nint x = 33;\n```\n." - "
    \n``` c\nint x = 33;\n```" - "````````````````````````````````" -- "To start an [HTML block] with a tag that is *not* in" -- the +- "To start an [HTML block] with a tag that is *not*" +- in the - "list of block-level tags in (6), you must put the tag by" - "itself on the first line (and it must be complete):" - "````````````````````````````````" @@ -1510,17 +1514,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````````````````````" - "In this case, we get a raw HTML block that just includes\nthe" -- "``" -- tag (because it ends with the following blank +- "`` tag (because it ends with the following blank" - "line). So the contents get interpreted as CommonMark:" - "````````````````````````````````" - example - "\n\n*foo*\n\n\n.\n" - "

    foo

    \n
    " - "````````````````````````````````" -- "Finally, in this case, the ``" -- " tags are interpreted\nas [raw HTML] *inside*" -- the CommonMark paragraph. (Because +- "Finally, in this case, the `` tags are interpreted\nas [" +- "raw HTML] *inside* the CommonMark paragraph. (Because" - "the tag is not on a line by itself, we get inline HTML" - "rather than an [HTML block].)" - "````````````````````````````````" @@ -1529,8 +1531,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    " - "````````````````````````````````" - "HTML tags designed to contain literal content\n(`pre`, `script`," -- "`style`, `textarea`" -- "), comments, processing instructions,\nand declarations are treated somewhat differently." +- "`style`, `textarea`), comments, processing instructions," +- and declarations are treated somewhat differently. - "Instead of ending at the first blank line, these blocks" - end at the first line containing a corresponding end tag. - "As a result, these blocks can contain blank lines:" @@ -1644,8 +1646,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n\n
    \n.\n
    " - "
    <div>\n
    " - "````````````````````````````````" -- "An HTML block of types 1--6 can interrupt a paragraph, and need" -- "not be\npreceded by a blank line." +- An HTML block of types 1-- +- "6 can interrupt a paragraph, and need not be" +- preceded by a blank line. - "````````````````````````````````" - example - "Foo\n
    \nbar\n
    \n.\n

    Foo

    " @@ -1702,14 +1705,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - Some Markdown implementations have adopted a convention of - "interpreting content inside tags as text if the open tag has\nthe attribute" -- "`markdown=1`" -- ". The rule given above seems a simpler and" +- "`markdown=1`. The rule given above seems a simpler and" - "more elegant way of achieving the same expressive power, which is also" - much simpler to parse. - The main potential drawback is that one can no longer paste HTML - "blocks into Markdown documents with 100% reliability. However," -- "*in most cases*" -- "this will work fine, because the blank lines in" +- "*in most cases* this will work fine, because the blank lines in" - "HTML are usually followed by HTML block tags. For example:" - "````````````````````````````````" - example @@ -1718,8 +1719,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````````````````````" - "There are problems, however, if the inner tags are indented" -- "*and*" -- "separated by spaces, as then they will be interpreted as" +- "*and* separated by spaces, as then they will be interpreted as" - "an indented code block:" - "````````````````````````````````" - example @@ -1730,18 +1730,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````````````````````" - "Fortunately, blank lines are usually not necessary and can be" -- "deleted. The exception is inside `
    `"
    -- " tags, but as described\n[above][HTML blocks]"
    -- ", raw HTML blocks starting with `
    `\n*can* contain blank lines."
    +- "deleted.  The exception is inside `
    ` tags, but as described"
    +- "[above][HTML blocks], raw HTML blocks starting with `
    `"
    +- "*can* contain blank lines."
     - "## Link reference definitions"
    -- "A [link reference definition](@)"
    -- "consists of a [link label], optionally preceded by up to three spaces of"
    -- "indentation, followed\nby a colon (`:`"
    -- "), optional spaces or tabs (including up to one\n[line ending]), a ["
    -- "link destination],\noptional spaces or tabs (including up to one\n[line ending]"
    -- "), and an optional [link\ntitle]"
    -- ", which if it is present must be separated\nfrom the [link destination]"
    -- " by spaces or tabs.\nNo further character may occur."
    +- "A [link reference definition](@)\nconsists of a [link label]"
    +- ", optionally preceded by up to three spaces of\nindentation, followed"
    +- "by a colon (`:`), optional spaces or tabs (including up to one"
    +- "[line ending]), a [link destination],"
    +- "optional spaces or tabs (including up to one\n[line ending]), and an optional"
    +- "[link\ntitle], which if it is present must be separated\nfrom the"
    +- "[link destination] by spaces or tabs.\nNo further character may occur."
     - "A [link reference definition]"
     - "does not correspond to a structural element of a document.  Instead, it"
     - "defines a label which can be used in [reference links]\nand reference-style ["
    @@ -1935,8 +1934,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````````````````````"
     - "## Paragraphs"
     - A sequence of non-blank lines that cannot be interpreted as other
    -- "kinds of blocks forms a [paragraph](@)"
    -- ".\nThe contents of the paragraph are the result of parsing the\nparagraph'"
    +- "kinds of blocks forms a [paragraph](@)."
    +- "The contents of the paragraph are the result of parsing the\nparagraph'"
     - "s raw content as inlines.  The paragraph's raw content"
     - is formed by concatenating the lines and removing initial and final
     - "spaces or tabs.\n\nA simple example with two paragraphs:"
    @@ -1996,29 +1995,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "

    aaa

    " - "````````````````````````````````" - "# Container blocks" -- "A [container block](#container-blocks)" -- is a block that has other +- "A [container block](#container-blocks) is a block that has other" - "blocks as its contents. There are two basic kinds of container blocks:\n[" - "block quotes] and [list items].\n[Lists] are meta-containers for" - "[list items]." - We define the syntax for container blocks recursively. The general - "form of the definition is:" -- "> If X is a sequence of blocks, then the result of\n>" +- ">" +- "If X is a sequence of blocks, then the result of\n>" - transforming X in such-and-such a way is a container of type Y - "> with these blocks as its content." - "So, we explain what counts as a block quote or list item by explaining" -- how these can be *generated* -- from their contents. This should suffice -- "to define the syntax, although it does not give a recipe for *parsing" -- "*\nthese constructions. (A recipe is provided below in the section entitled" +- how these can be *generated* from their contents. This should suffice +- "to define the syntax, although it does not give a recipe for" +- "*parsing*" +- these constructions. (A recipe is provided below in the section entitled - "[A parsing strategy](#appendix-a-parsing-strategy).)" - "## Block quotes" -- "A [block quote marker](@)" -- ",\noptionally preceded by up to three spaces of indentation," -- "consists of (a) the character `>`" -- together with a following space of -- "indentation, or (b) a single character `>` not followed by a" -- "space of\nindentation.\n\nThe following rules define [block quotes]:" +- "A [block quote marker](@)," +- "optionally preceded by up to three spaces of indentation," +- "consists of (a) the character `>` together with a following space of" +- "indentation, or (b) a single character `>`" +- " not followed by a space of\nindentation." +- "The following rules define [block quotes]:" - "1." 
- "**Basic case.** If a string of lines *Ls*" - " constitute a sequence\n of blocks *Bs*" @@ -2028,20 +2027,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "2." - "**Laziness.** If a string of lines *Ls*" - " constitute a [block\n quote](#block-quotes) with contents" -- "*Bs*" -- ", then the result of deleting\n the initial [block quote marker]" -- from one or +- "*Bs*, then the result of deleting\n the initial [block quote marker" +- "] from one or" - more lines in which the next character other than a space or tab after the - "[block quote marker] is [paragraph continuation\n text]" - is a block quote with *Bs* as its content. -- "[Paragraph continuation text](@)" -- is text +- "[Paragraph continuation text](@) is text" - "that will be parsed as part of the content of a paragraph, but does" - not occur at the beginning of the paragraph. - "3." -- "**Consecutiveness.**" -- " A document cannot contain two [block\n quotes]" -- "in a row unless there is a [blank line] between them." +- "**Consecutiveness.** A document cannot contain two [block" +- "quotes] in a row unless there is a [blank line] between them." - "Nothing else counts as a [block quote](#block-quotes)." - "Here is a simple example:" - "````````````````````````````````" @@ -2071,8 +2067,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    > # Foo\n> bar\n> baz"
     - "
    " - "````````````````````````````````" -- "The Laziness clause allows us to omit the `>`" -- " before\n[paragraph continuation text]:" +- "The Laziness clause allows us to omit the `>` before\n[" +- "paragraph continuation text]:" - "````````````````````````````````" - example - "> # Foo\n> bar\nbaz\n.\n
    " @@ -2104,8 +2100,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
  • foo
  • \n\n
    \n
      " - "
    • bar
    • \n
    " - "````````````````````````````````" -- "For the same reason, we can't omit the `> `" -- " in front of\nsubsequent lines of an indented or fenced code block:" +- "For the same reason, we can't omit the `> ` in front of" +- "subsequent lines of an indented or fenced code block:" - "````````````````````````````````" - example - "> foo\n bar\n.\n
    \n
    foo"
    @@ -2127,9 +2123,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````````````````````"
     - "To see why, note that in"
     - "```markdown\n> foo\n>     - bar\n```"
    -- "the `- bar`"
    -- "is indented too far to start a list, and can't"
    -- be an indented code block because indented code blocks cannot
    +- "the `- bar` is indented too far to start a list, and can"
    +- "'t\nbe an indented code block because indented code blocks cannot"
     - "interrupt paragraphs, so it is [paragraph continuation text]."
     - "A block quote can be empty:"
     - "````````````````````````````````"
    @@ -2151,8 +2146,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "
    \n
    \n

    bar

    \n
    " - "````````````````````````````````" - "(Most current Markdown implementations, including John Gruber's\noriginal" -- "`Markdown.pl`" -- ", will parse this example as a single block quote" +- "`Markdown.pl`, will parse this example as a single block quote" - with two paragraphs. But it seems better to allow the author to decide - whether two block quotes or one are wanted.) - "Consecutiveness means that if we put these block quotes together," @@ -2199,8 +2193,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n

    baz

    " - "````````````````````````````````" - "It is a consequence of the Laziness rule that any number\nof initial" -- "`>`" -- "s may be omitted on a continuation line of a\nnested block quote:" +- "`>`s may be omitted on a continuation line of a" +- "nested block quote:" - "````````````````````````````````" - example - "> > > foo\nbar\n.\n
    \n
    " @@ -2215,8 +2209,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "When including an indented code block in a block quote,\nremember that the [" - "block quote marker] includes\nboth the `>`" -- and a following space of indentation. So *five spaces* -- " are needed\nafter the `>`:" +- and a following space of indentation. So *five spaces* are needed +- "after the `>`:" - "````````````````````````````````" - example - "> code\n\n> not code\n.\n
    " @@ -2224,34 +2218,32 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n

    not code

    \n
    " - "````````````````````````````````" - "## List items" -- "A [list marker](@)" -- " is a\n[bullet list marker] or an [ordered list marker]." +- "A [list marker](@) is a\n[bullet list marker]" +- "or an [ordered list marker]." - "A [bullet list marker](@)\nis a `-`, `+`, or" - "`*` character." -- "An [ordered list marker](@)" -- "is a sequence of 1--9 arabic digits (`0-9`" -- "), followed by either a\n`.` character or a `)`" -- character. (The reason for the length +- "An [ordered list marker](@)\nis a sequence of 1--" +- "9 arabic digits (`0-9`), followed by either a\n`.`" +- "character or a `)` character. (The reason for the length" - limit is that with 10 digits we start seeing integer overflows - "in some browsers.)\n\nThe following rules define [list items]:" - "1." - "**Basic case.** If a sequence of lines *Ls*" - " constitute a sequence of\n blocks *Bs*" -- "starting with a character other than a space or tab, and *M*" -- " is\n a list marker of width *W* followed by 1 ≤" -- "*N* ≤ 4 spaces of indentation,\n then the result of prepending" +- "starting with a character other than a space or tab, and *M* is" +- a list marker of width *W* followed by 1 ≤ *N* +- " ≤ 4 spaces of indentation,\n then the result of prepending" - "*M* and the following spaces to the first line\n of" -- "*Ls*, and indenting subsequent lines of *Ls* by *W" -- "+ N* spaces, is a\n list item with *Bs*" -- as its contents. The type of the list item +- "*Ls*, and indenting subsequent lines of *Ls* by" +- "*W + N* spaces, is a\n list item with" +- "*Bs* as its contents. The type of the list item" - (bullet or ordered) is determined by the type of its list marker. - "If the list item is ordered, then it is also assigned a start" - "number, based on the ordered list marker.\n\n Exceptions:" - "1. 
When the first list item in a [list] interrupts" - "a paragraph---that is, when it starts on a line that would" - "otherwise count as [paragraph continuation text]---then (a)\n the lines" -- "*Ls*" -- "must not begin with a blank line, and (b) if" +- "*Ls* must not begin with a blank line, and (b) if" - "the list item is ordered, the start number must be 1." - "2. If any line is a [thematic break][thematic breaks]" - " then\n that line is not a list item." @@ -2326,8 +2318,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The converse is also possible. In the following example, the word" - "`two`" - "occurs far to the right of the initial text of the list item," -- "`one`" -- ", but" +- "`one`, but" - "it is not considered part of the list item, because it is not indented" - "far enough past the blockquote marker:" - "````````````````````````````````" @@ -2398,12 +2389,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - 2. **Item starting with indented code.** - " If a sequence of lines *Ls*\n constitute a sequence of blocks" - "*Bs* starting with an indented code\n block, and" -- "*M* is a list marker of width *W*" -- " followed by\n one space of indentation, then the result of prepending" -- "*M* and the\n following space to the first line of" -- "*Ls*, and indenting subsequent lines\n of *Ls* by" -- "*W + 1* spaces, is a list item with *Bs*" -- as its contents. +- "*M* is a list marker of width *W* followed by" +- "one space of indentation, then the result of prepending *M* and the" +- "following space to the first line of *Ls*, and indenting subsequent lines" +- of *Ls* by *W + 1* +- "spaces, is a list item with *Bs* as its contents." - "If a line is empty, then it need not be indented." - The type of the - list item (bullet or ordered) is determined by the type of its list @@ -2426,8 +2416,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    bar\n
    \n" - "" - "````````````````````````````````" -- If the *first* block in the list item is an indented code block -- ",\nthen by rule #2, the contents must be preceded by *one*" +- If the *first* +- "block in the list item is an indented code block," +- "then by rule #2, the contents must be preceded by *one*" - " space of indentation\nafter the list marker:" - "````````````````````````````````" - example @@ -2483,14 +2474,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````````````````````" - 3. **Item starting with a blank line.** -- If a sequence of lines *Ls* -- "starting with a single [blank line] constitute a (possibly empty)" -- "sequence of blocks *Bs*, and *M* is a list marker of width" -- "*W*,\n then the result of prepending *M*" -- " to the first line of *Ls*, and\n preceding subsequent lines of" -- "*Ls* by *W + 1*" -- " spaces of indentation, is a\n list item with *Bs*" -- as its contents. +- " If a sequence of lines *Ls*\n starting with a single [" +- "blank line] constitute a (possibly empty)\n sequence of blocks *Bs*" +- ", and *M* is a list marker of width *W*," +- then the result of prepending *M* to the first line of +- "*Ls*, and\n preceding subsequent lines of *Ls* by" +- "*W + 1* spaces of indentation, is a" +- list item with *Bs* as its contents. - "If a line is empty, then it need not be indented." - The type of the - list item (bullet or ordered) is determined by the type of its list @@ -2512,8 +2502,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````````````````````" - A list item can begin with at most one blank line. -- "In the following example, `foo`" -- " is not part of the list\nitem:" +- "In the following example, `foo` is not part of the list" +- "item:" - "````````````````````````````````" - example - "-\n\n foo\n.\n
      \n
    • \n
    " @@ -2550,8 +2540,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo\n1.

    " - "````````````````````````````````" - 4. **Indentation.** If a sequence of lines -- "*Ls*" -- constitutes a list item +- "*Ls* constitutes a list item" - "according to rule #1, #2, or #3, then the result" - "of preceding each line\n of *Ls*" - by up to three spaces of indentation (the same for each line) also @@ -2595,8 +2584,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - 5. **Laziness.** If a string of lines - "*Ls* constitute a [list\n item](#list-items)" -- with contents *Bs* -- ", then the result of deleting" +- "with contents *Bs*, then the result of deleting" - some or all of the indentation from one or more lines in which the - "next character other than a space or tab after the indentation is\n [" - "paragraph continuation text] is a" @@ -2689,22 +2677,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "### Motivation" - "John Gruber's Markdown spec says the following about list items:" -- "1. \"" -- "List markers typically start at the left margin, but may be indented" +- "1." +- "\"List markers typically start at the left margin, but may be indented" - by up to three spaces. List markers must be followed by one or more - "spaces or a tab.\"" -- "2. \"" -- "To make lists look nice, you can wrap items with hanging indents...." +- "2." +- "\"To make lists look nice, you can wrap items with hanging indents...." - "But if you don't want to, you don't have to.\"" -- "3. \"List items may consist of multiple paragraphs. Each subsequent" +- "3." +- "\"List items may consist of multiple paragraphs. Each subsequent" - paragraph in a list item must be indented by either 4 spaces or one - "tab.\"" -- "4. \"It looks nice if you indent every line of the subsequent paragraphs," +- "4." +- "\"It looks nice if you indent every line of the subsequent paragraphs," - "but here again, Markdown will allow you to be lazy.\"" -- "5. 
\"To put a blockquote within a list item, the blockquote" -- "'s `>`\n delimiters need to be indented.\"" -- "6. \"" -- "To put a code block within a list item, the code block needs to be" +- "5." +- "\"To put a blockquote within a list item, the blockquote's `>`" +- "delimiters need to be indented.\"" +- "6." +- "\"To put a code block within a list item, the code block needs to be" - "indented twice — 8 spaces or two tabs.\"" - These rules specify that a paragraph under a list item must be indented - "four spaces (presumably, from the left margin, rather than the start of" @@ -2714,15 +2705,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that a block quote must be indented, but not by how much; however" - ", the\nexample given has four spaces indentation. Although nothing is said" - "about other kinds of block-level content, it is certainly reasonable to\ninfer that" -- "*all*" -- "block elements under a list item, including other" +- "*all* block elements under a list item, including other" - "lists, must be indented four spaces. This principle has been called the" - "*four-space rule*." - "The four-space rule is clear and principled, and if the reference\nimplementation" -- "`Markdown.pl`" -- "had followed it, it probably would have" -- "become the standard. However, `Markdown.pl`" -- allowed paragraphs and +- "`Markdown.pl` had followed it, it probably would have" +- "become the standard. However, `Markdown.pl` allowed paragraphs and" - "sublists to start with only two spaces indentation, at least on the" - "outer level. Worse, its behavior was inconsistent: a sublist of an" - "outer-level list needed two spaces indentation, but a sublist of this" @@ -2737,8 +2725,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - is no way to give a spec for list items that will be guaranteed not - "to break any existing documents. 
However, the spec given here should" - "correctly handle lists formatted with either the four-space rule or\nthe more forgiving" -- "`Markdown.pl`" -- "behavior, provided they are laid out" +- "`Markdown.pl` behavior, provided they are laid out" - in a way that is natural for a human to read. - The strategy here is to let the width and indentation of the list marker - determine the indentation necessary for blocks to fall under the list @@ -2753,27 +2740,31 @@ input_file: tests/inputs/markdown/commonmark_spec.md - unnatural. It is quite unintuitive that - "``` markdown\n- foo\n\n bar\n\n - baz\n```" - "should be parsed as two lists with an intervening paragraph," -- "``` html\n
      \n
    • foo
    • \n
    " +- "``` html" +- "
      \n
    • foo
    • \n
    " - "

    bar

    \n
      \n
    • baz
    • " - "
    \n```" - "as the four-space rule demands, rather than a single list," -- "``` html\n
      \n
    • \n

      foo

      " -- "

      bar

      \n
        \n
      • baz
      • " -- "
      \n
    • \n
    \n```" +- "``` html" +- "
      \n
    • \n

      foo

      \n

      bar

      " +- "
        \n
      • baz
      • \n
      \n
    • " +- "
    \n```" - The choice of four spaces is arbitrary. - "It can be learned, but it is" - "not likely to be guessed, and it trips up beginners regularly." - Would it help to adopt a two-space rule? The problem is that such - "a rule, together with the rule allowing up to three spaces of indentation for" -- "the initial list marker, allows text that is indented *less than*" -- " the\noriginal list marker to be included in the list item. For example," +- "the initial list marker, allows text that is indented *less than* the" +- "original list marker to be included in the list item. For example," - "`Markdown.pl` parses" - "``` markdown\n - one\n\n two\n```" - "as a single list item, with `two` a continuation paragraph:" -- "``` html\n
      \n
    • \n

      one

      " -- "

      two

      \n
    • \n
    \n```\n\nand similarly" +- "``` html" +- "
      \n
    • \n

      one

      \n

      two

      " +- "
    • \n
    \n```\n\nand similarly" - "``` markdown\n> - one\n>\n> two\n```\n\nas" -- "``` html\n
    \n
      \n
    • \n

      one

      " +- "``` html" +- "
      \n
        \n
      • \n

        one

        " - "

        two

        \n
      • \n
      \n
      \n```" - This is extremely unintuitive. - "Rather than requiring a fixed indent from the margin, we could require" @@ -2805,30 +2796,30 @@ input_file: tests/inputs/markdown/commonmark_spec.md - four-space rule in cases where the list marker plus its initial indentation - "takes four spaces (a common case), but diverge in other cases." - "## Lists" -- "A [list](@)" -- " is a sequence of one or more\nlist items [of the same type]" -- ". The list items\nmay be separated by any number of blank lines." +- "A [list](@) is a sequence of one or more\nlist items" +- "[of the same type]. The list items" +- may be separated by any number of blank lines. - "Two list items are [of the same type](@)" - "if they begin with a [list marker] of the same type." - Two list markers are of the - same type if (a) they are bullet list markers using the same character -- "(`-`, `+`, or `*`) or (b) they are" -- "ordered list numbers with the same\ndelimiter (either `.` or `)`)." +- "(`-`, `+`, or `*`" +- ) or (b) they are ordered list numbers with the same +- "delimiter (either `.` or `)`)." - "A list is an [ordered list](@)" - "if its constituent list items begin with\n[ordered list markers], and a" -- "[bullet list](@)" -- " if its constituent list\nitems begin with [bullet list markers]." -- "The [start number](@)" -- "of an [ordered list] is determined by the list number of" +- "[bullet list](@) if its constituent list\nitems begin with [" +- "bullet list markers]." +- "The [start number](@)\nof an [ordered list]" +- is determined by the list number of - its initial list item. The numbers of subsequent list items are - disregarded. -- "A list is [loose](@)" -- if any of its constituent +- "A list is [loose](@) if any of its constituent" - "list items are separated by blank lines, or if any of its constituent" - list items directly contain two block-level elements with a blank line -- "between them. 
Otherwise a list is [tight](@)" -- ".\n(The difference in HTML output is that paragraphs in a loose list are" -- "wrapped in `

      ` tags, while paragraphs in a tight list are not.)" +- "between them. Otherwise a list is [tight](@)." +- "(The difference in HTML output is that paragraphs in a loose list are\nwrapped in" +- "`

      ` tags, while paragraphs in a tight list are not.)" - "Changing the bullet or ordered list delimiter starts a new list:" - "````````````````````````````````" - example @@ -2850,31 +2841,35 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

        \n
      • bar
      • \n
      • baz
      • " - "
      " - "````````````````````````````````" -- "`Markdown.pl`" -- "does not allow this, through fear of triggering a list" +- "`Markdown.pl` does not allow this, through fear of triggering a list" - "via a numeral in a hard-wrapped line:" -- "``` markdown\nThe number of windows in my house is\n14." +- "``` markdown" +- "The number of windows in my house is\n14." - "The number of doors is 6.\n```" -- "Oddly, though, `Markdown.pl` *does* allow a blockquote" -- "to\ninterrupt a paragraph, even though the same considerations might\napply." +- "Oddly, though, `Markdown.pl` *does*" +- " allow a blockquote to\ninterrupt a paragraph, even though the same considerations might" +- apply. - "In CommonMark, we do allow lists to interrupt paragraphs, for" - "two reasons. First, it is natural and not uncommon for people" - "to start lists without blank lines:" -- "``` markdown\nI need to buy\n- new shoes\n- a coat" +- "``` markdown" +- "I need to buy\n- new shoes\n- a coat" - "- a plane ticket\n```\n\nSecond, we are attracted to a" - ">" -- "[principle of uniformity](@)" -- ":\n> if a chunk of text has a certain\n>" +- "[principle of uniformity](@):\n>" +- "if a chunk of text has a certain\n>" - "meaning, it will continue to have the same meaning when put into a\n>" - container block (such as a list item or blockquote). - "(Indeed, the spec for [list items] and [block quotes]" -- "presupposes\nthis principle.) This principle implies that if" -- "``` markdown\n * I need to buy\n - new shoes" -- " - a coat\n - a plane ticket\n```" +- " presupposes\nthis principle.) This principle implies that if" +- "``` markdown" +- " * I need to buy\n - new shoes\n - a coat" +- " - a plane ticket\n```" - "is a list item containing a paragraph followed by a nested sublist," - "as all Markdown implementations agree it is (though the paragraph\nmay be rendered without" - "`

      ` tags, since the list is \"tight\"),\nthen" -- "``` markdown\nI need to buy\n- new shoes\n- a coat" +- "``` markdown" +- "I need to buy\n- new shoes\n- a coat" - "- a plane ticket\n```" - by itself should be a paragraph followed by a nested sublist. - Since it is well established Markdown practice to allow lists to @@ -2884,8 +2879,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "takes a different approach, requiring blank lines before lists" - even inside other list items.) - In order to solve the problem of unwanted lists in paragraphs with -- "hard-wrapped numerals, we allow only lists starting with `1`" -- " to\ninterrupt paragraphs. Thus," +- "hard-wrapped numerals, we allow only lists starting with `1` to" +- "interrupt paragraphs. Thus," - "````````````````````````````````" - example - "The number of windows in my house is\n14." @@ -2955,8 +2950,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    • \n" - "````````````````````````````````" - "Note, however, that list items may not be preceded by more than" -- "three spaces of indentation. Here `- e`" -- is treated as a paragraph continuation +- "three spaces of indentation. Here `- e` is treated as a paragraph continuation" - "line, because it is indented more than three spaces:" - "````````````````````````````````" - example @@ -2965,8 +2959,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    • b
    • \n
    • c
    • \n
    • d" - "- e
    • \n
    " - "````````````````````````````````" -- "And here, `3. c` is treated as in indented code block" -- ",\nbecause it is indented four spaces and preceded by a\nblank line." +- "And here, `3. c`" +- "is treated as in indented code block," +- "because it is indented four spaces and preceded by a\nblank line." - "````````````````````````````````" - example - "1. a\n\n 2. b\n\n 3. c" @@ -3082,20 +3077,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`hi`lo`\n." - "

    hilo`

    " - "````````````````````````````````" -- "`hi` is parsed as code, leaving the backtick at the end as" -- "a literal\nbacktick." +- "`hi`" +- "is parsed as code, leaving the backtick at the end as a literal" +- backtick. - "## Code spans" - "A [backtick string](@)" - "is a string of one or more backtick characters (`` ` ``" - ") that is neither\npreceded nor followed by a backtick." -- "A [code span](@)" -- begins with a backtick string and ends with +- "A [code span](@) begins with a backtick string and ends with" - a backtick string of equal length. The contents of the code span are - "the characters between these two backtick strings, normalized in the\nfollowing ways:" - "- First, [line endings] are converted to [spaces]." -- "- If the resulting string both begins *and*" -- " ends with a [space]\n character, but does not consist entirely of [" -- "space]\n characters, a single [space] character is removed from the" +- "- If the resulting string both begins *and* ends with a [space]" +- "character, but does not consist entirely of [space]\n characters, a single" +- "[space] character is removed from the" - front and back. This allows you to include code that begins - "or ends with backtick characters, which must be separated by" - whitespace from the opening or closing backtick strings. @@ -3164,8 +3159,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo\\bar`

    " - "````````````````````````````````" - "Backslash escapes are never needed, because one can always choose a\nstring of" -- "*n*" -- "backtick characters as delimiters, where the code does" +- "*n* backtick characters as delimiters, where the code does" - not contain any strings of exactly *n* backtick characters. - "````````````````````````````````" - example @@ -3180,8 +3174,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Code span backticks have higher precedence than any other inline - constructs except HTML tags and autolinks. - "Thus, for example, this is" -- "not parsed as emphasized text, since the second `*` is part of a" -- "code\nspan:" +- "not parsed as emphasized text, since the second `*`" +- " is part of a code\nspan:" - "````````````````````````````````" - example - "*foo`*`\n.\n

    *foo*

    " @@ -3242,59 +3236,64 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "](https://daringfireball.net/projects/markdown/syntax#em)" - "says:" - ">" -- "Markdown treats asterisks (`*`) and underscores (`_`" -- ") as indicators of\n> emphasis. Text wrapped with one `*` or" -- "`_` will be wrapped with an HTML\n> ``" -- "tag; double `*`'s or `_`'s will be wrapped" -- "with an HTML ``\n> tag." +- "Markdown treats asterisks (`*`) and underscores (`_`) as indicators of" +- "> emphasis. Text wrapped with one `*` or `_`" +- " will be wrapped with an HTML\n> `` tag; double" +- "`*`'s or `_`'s will be wrapped with an HTML" +- "``\n> tag." - "This is enough for most users, but these rules leave much undecided," - "especially when it comes to nested emphasis. The original\n`Markdown.pl`" - " test suite makes it clear that triple `***` and\n`___`" - "delimiters can be used for strong emphasis, and most" - "implementations have also allowed the following patterns:" -- "``` markdown\n***strong emph***\n***strong** in emph*" +- "``` markdown" +- "***strong emph***\n***strong** in emph*" - "***emph* in strong**\n**in strong *emph***" - "*in emph **strong***\n```" - "The following patterns are less widely supported, but the intent" - "is clear and they are useful (especially in contexts like bibliography\nentries):" -- "``` markdown\n*emph *with emph* in it*" +- "``` markdown" +- "*emph *with emph* in it*" - "**strong **with strong** in it**\n```" - "Many implementations have also restricted intraword emphasis to\nthe `*`" - "forms, to avoid unwanted emphasis in words containing" - internal underscores. 
(It is best practice to put these in code - "spans, but users often do not.)" -- "``` markdown\ninternal emphasis: foo*bar*baz" -- "no emphasis: foo_bar_baz\n```" +- "``` markdown" +- "internal emphasis: foo*bar*baz\nno emphasis: foo_bar_baz" +- "```" - "The rules given below capture all of these patterns, while allowing" - for efficient parsing strategies that do not backtrack. -- "First, some definitions. A [delimiter run](@)" -- " is either\na sequence of one or more `*`" -- " characters that is not preceded or\nfollowed by a non-backslash-escaped" -- "`*` character, or a sequence\nof one or more `_`" +- "First, some definitions. A [delimiter run](@) is either" +- "a sequence of one or more `*` characters that is not preceded or" +- "followed by a non-backslash-escaped `*`" +- " character, or a sequence\nof one or more `_`" - " characters that is not preceded or followed by\na non-backslash-escaped" - "`_` character." -- "A [left-flanking delimiter run](@)" -- " is\na [delimiter run] that is (1) not followed by [" -- "Unicode whitespace],\nand either (2a) not followed by a [" -- "Unicode punctuation character], or\n(2b) followed by a [" -- "Unicode punctuation character] and\npreceded by [Unicode whitespace] or a [" -- "Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of" +- "A [left-flanking delimiter run](@) is\na [delimiter run" +- "] that is (1) not followed by [Unicode whitespace]," +- "and either (2a) not followed by a [Unicode punctuation character], or" +- "(2b) followed by a [Unicode punctuation character] and" +- "preceded by [Unicode whitespace] or a [Unicode punctuation character]." +- "For purposes of this definition, the beginning and the end of" - the line count as Unicode whitespace. 
-- "A [right-flanking delimiter run](@)" -- " is\na [delimiter run] that is (1) not preceded by [" -- "Unicode whitespace],\nand either (2a) not preceded by a [" -- "Unicode punctuation character], or\n(2b) preceded by a [" -- "Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [" -- "Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of" +- "A [right-flanking delimiter run](@) is\na [delimiter run" +- "] that is (1) not preceded by [Unicode whitespace]," +- "and either (2a) not preceded by a [Unicode punctuation character], or" +- "(2b) preceded by a [Unicode punctuation character] and" +- "followed by [Unicode whitespace] or a [Unicode punctuation character]." +- "For purposes of this definition, the beginning and the end of" - "the line count as Unicode whitespace.\n\nHere are some examples of delimiter runs." -- " - left-flanking but not right-flanking:\n\n ```\n ***abc" -- " _abc\n **\"abc\"\n _\"abc\"\n ```" -- " - right-flanking but not left-flanking:\n\n ```\n abc***" -- " abc_\n \"abc\"**\n \"abc\"_\n ```" -- " - Both left and right-flanking:\n\n ```\n abc***def" -- " \"abc\"_\"def\"\n ```" -- " - Neither left nor right-flanking:\n\n ```\n abc *** def" -- " a _ b\n ```" +- "- left-flanking but not right-flanking:" +- " ```\n ***abc\n _abc\n **\"abc\"" +- " _\"abc\"\n ```" +- "- right-flanking but not left-flanking:" +- " ```\n abc***\n abc_\n \"abc\"**" +- "\"abc\"_\n ```" +- "- Both left and right-flanking:" +- " ```\n abc***def\n \"abc\"_\"def\"\n ```" +- "- Neither left nor right-flanking:" +- " ```\n abc *** def\n a _ b\n ```" - (The idea of distinguishing left-flanking and right-flanking - delimiter runs based on the character before and the character - "after comes from Roopesh Chander's" @@ -3310,50 +3309,51 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "iff (if and only if) it is part of a [" - "left-flanking delimiter run]." - "2." 
-- "A single `_`" -- " character [can open emphasis] iff\n it is part of a [" -- "left-flanking delimiter run]\n and either (a) not part of a" -- "[right-flanking delimiter run]\n or (b) part of a [" -- "right-flanking delimiter run]\n preceded by a [Unicode punctuation character]." +- "A single `_` character [can open emphasis] iff" +- "it is part of a [left-flanking delimiter run]" +- "and either (a) not part of a [right-flanking delimiter run]" +- "or (b) part of a [right-flanking delimiter run]" +- "preceded by a [Unicode punctuation character]." - "3." - "A single `*` character [can close emphasis](@)" - "iff it is part of a [right-flanking delimiter run]." - "4." -- "A single `_`" -- " character [can close emphasis] iff\n it is part of a [" -- "right-flanking delimiter run]\n and either (a) not part of a" -- "[left-flanking delimiter run]\n or (b) part of a [" -- "left-flanking delimiter run]\n followed by a [Unicode punctuation character]." +- "A single `_` character [can close emphasis] iff" +- "it is part of a [right-flanking delimiter run]" +- "and either (a) not part of a [left-flanking delimiter run]" +- "or (b) part of a [left-flanking delimiter run]" +- "followed by a [Unicode punctuation character]." - "5." - "A double `**` [can open strong emphasis](@)" - "iff it is part of a [left-flanking delimiter run]." - "6." -- "A double `__`" -- " [can open strong emphasis] iff\n it is part of a [" -- "left-flanking delimiter run]\n and either (a) not part of a" -- "[right-flanking delimiter run]\n or (b) part of a [" -- "right-flanking delimiter run]\n preceded by a [Unicode punctuation character]." +- "A double `__` [can open strong emphasis] iff" +- "it is part of a [left-flanking delimiter run]" +- "and either (a) not part of a [right-flanking delimiter run]" +- "or (b) part of a [right-flanking delimiter run]" +- "preceded by a [Unicode punctuation character]." - "7." 
- "A double `**` [can close strong emphasis](@)" - "iff it is part of a [right-flanking delimiter run]." - "8." -- "A double `__`" -- " [can close strong emphasis] iff\n it is part of a [" -- "right-flanking delimiter run]\n and either (a) not part of a" -- "[left-flanking delimiter run]\n or (b) part of a [" -- "left-flanking delimiter run]\n followed by a [Unicode punctuation character]." -- "9. Emphasis begins with a delimiter that [can open emphasis]" -- " and ends\n with a delimiter that [can close emphasis]" -- ", and that uses the same\n character (`_` or `*`" -- ) as the opening delimiter. The +- "A double `__` [can close strong emphasis] iff" +- "it is part of a [right-flanking delimiter run]" +- "and either (a) not part of a [left-flanking delimiter run]" +- "or (b) part of a [left-flanking delimiter run]" +- "followed by a [Unicode punctuation character]." +- "9." +- "Emphasis begins with a delimiter that [can open emphasis] and ends" +- "with a delimiter that [can close emphasis], and that uses the same" +- "character (`_` or `*`) as the opening delimiter. The" - "opening and closing delimiters must belong to separate\n [delimiter runs]" - ". If one of the delimiters can both" - "open and close emphasis, then the sum of the lengths of the" - delimiter runs containing the opening and closing delimiters - must not be a multiple of 3 unless both lengths are - multiples of 3. -- "10. Strong emphasis begins with a delimiter that\n [can open strong emphasis" -- "] and ends with a delimiter that\n [can close strong emphasis]" +- "10." +- "Strong emphasis begins with a delimiter that\n [can open strong emphasis]" +- " and ends with a delimiter that\n [can close strong emphasis]" - ", and that uses the same character\n (`_` or `*`" - ) as the opening delimiter. The - "opening and closing delimiters must belong to separate\n [delimiter runs]" @@ -3364,34 +3364,37 @@ input_file: tests/inputs/markdown/commonmark_spec.md - are multiples of 3. - "11." 
- "A literal `*` character cannot occur at the beginning or end of" -- "`*`-delimited emphasis or `**`-delimited strong emphasis" -- ", unless it\n is backslash-escaped." +- "`*`-delimited emphasis or `**`" +- "-delimited strong emphasis, unless it\n is backslash-escaped." - "12." - "A literal `_` character cannot occur at the beginning or end of" -- "`_`-delimited emphasis or `__`-delimited strong emphasis" -- ", unless it\n is backslash-escaped." +- "`_`-delimited emphasis or `__`" +- "-delimited strong emphasis, unless it\n is backslash-escaped." - "Where rules 1--12 above are compatible with multiple parsings," - "the following principles resolve ambiguity:" -- "13. The number of nestings should be minimized. Thus, for example," +- "13." +- "The number of nestings should be minimized. Thus, for example," - "an interpretation `...` is always preferred to" - "`...`." - "14." -- "An interpretation `...`" -- " is always\n preferred to `...`" -- "." -- "15. When two potential emphasis or strong emphasis spans overlap," +- "An interpretation `...` is always" +- "preferred to `...`." +- "15." +- "When two potential emphasis or strong emphasis spans overlap," - so that the second begins before the first ends and ends after - "the first ends, the first takes precedence. Thus, for example," -- "`*foo _bar* baz_` is parsed as `foo" -- "_bar baz_` rather\n than" -- "`*foo bar* baz`." -- 16. When there are two potential emphasis or strong emphasis spans +- "`*foo _bar* baz_` is parsed as" +- "`foo _bar baz_` rather" +- "than `*foo bar* baz`." +- "16." +- When there are two potential emphasis or strong emphasis spans - "with the same closing delimiter, the shorter one (the one that" - "opens later) takes precedence. Thus, for example," -- "`**foo **bar baz**` is parsed as `**foo bar baz`\n rather than" +- "`**foo **bar baz**` is parsed as" +- "`**foo bar baz`\n rather than" - "`foo **bar baz`." -- "17. 
Inline code spans, links, images, and HTML tags group more tightly" +- "17." +- "Inline code spans, links, images, and HTML tags group more tightly" - "than emphasis. So, when there is a choice between an interpretation" - "that contains one of these elements and one that does not, the" - "former always wins. Thus, for example," @@ -3403,15 +3406,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - example - "*foo bar*\n.\n

    foo bar

    " - "````````````````````````````````" -- "This is not emphasis, because the opening `*`" -- " is followed by\nwhitespace, and hence not part of a [" -- "left-flanking delimiter run]:" +- "This is not emphasis, because the opening `*` is followed by" +- "whitespace, and hence not part of a [left-flanking delimiter run]:" - "````````````````````````````````" - "example\na * foo bar*\n.\n

    a * foo bar*

    " - "````````````````````````````````" -- "This is not emphasis, because the opening `*`" -- " is preceded\nby an alphanumeric and followed by punctuation, and hence" -- "not part of a [left-flanking delimiter run]:" +- "This is not emphasis, because the opening `*` is preceded" +- "by an alphanumeric and followed by punctuation, and hence\nnot part of a [" +- "left-flanking delimiter run]:" - "````````````````````````````````" - example - "a*\"foo\"*\n." @@ -3445,13 +3447,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - example - "_foo bar_\n.\n

    foo bar

    " - "````````````````````````````````" -- "This is not emphasis, because the opening `_`" -- " is followed by\nwhitespace:" +- "This is not emphasis, because the opening `_` is followed by" +- "whitespace:" - "````````````````````````````````" - "example\n_ foo bar_\n.\n

    _ foo bar_

    " - "````````````````````````````````" -- "This is not emphasis, because the opening `_`" -- " is preceded\nby an alphanumeric and followed by punctuation:" +- "This is not emphasis, because the opening `_` is preceded" +- "by an alphanumeric and followed by punctuation:" - "````````````````````````````````" - example - "a_\"foo\"_\n." @@ -3471,8 +3473,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "пристаням_стремятся_\n." - "

    пристаням_стремятся_

    " - "````````````````````````````````" -- "Here `_`" -- "does not generate emphasis, because the first delimiter run" +- "Here `_` does not generate emphasis, because the first delimiter run" - "is right-flanking and the second left-flanking:" - "````````````````````````````````" - example @@ -3493,8 +3494,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n_foo*\n.\n

    _foo*

    " - "````````````````````````````````" -- "This is not emphasis, because the closing `*`" -- " is preceded by\nwhitespace:" +- "This is not emphasis, because the closing `*` is preceded by" +- "whitespace:" - "````````````````````````````````" - "example\n*foo bar *\n.\n

    *foo bar *

    " - "````````````````````````````````" @@ -3502,8 +3503,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n*foo bar\n*\n.\n

    *foo bar\n*

    " - "````````````````````````````````" -- "This is not emphasis, because the second `*`" -- " is\npreceded by punctuation and followed by an alphanumeric" +- "This is not emphasis, because the second `*` is" +- preceded by punctuation and followed by an alphanumeric - "(hence it is not part of a [right-flanking delimiter run]:" - "````````````````````````````````" - "example\n*(*foo)\n.\n

    *(*foo)

    " @@ -3521,13 +3522,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foobar

    " - "````````````````````````````````" - "Rule 4:" -- "This is not emphasis, because the closing `_`" -- " is preceded by\nwhitespace:" +- "This is not emphasis, because the closing `_` is preceded by" +- "whitespace:" - "````````````````````````````````" - "example\n_foo bar _\n.\n

    _foo bar _

    " - "````````````````````````````````" -- "This is not emphasis, because the second `_`" -- " is\npreceded by punctuation and followed by an alphanumeric:" +- "This is not emphasis, because the second `_` is" +- "preceded by punctuation and followed by an alphanumeric:" - "````````````````````````````````" - "example\n_(_foo)\n.\n

    _(_foo)

    " - "````````````````````````````````" @@ -3568,9 +3569,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n** foo bar**\n.\n

    ** foo bar**

    " - "````````````````````````````````" -- "This is not strong emphasis, because the opening `**`" -- " is preceded\nby an alphanumeric and followed by punctuation, and hence" -- "not part of a [left-flanking delimiter run]:" +- "This is not strong emphasis, because the opening `**` is preceded" +- "by an alphanumeric and followed by punctuation, and hence\nnot part of a [" +- "left-flanking delimiter run]:" - "````````````````````````````````" - example - "a**\"foo\"**\n." @@ -3595,8 +3596,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n__\nfoo bar__\n.\n

    __\nfoo bar__

    " - "````````````````````````````````" -- "This is not strong emphasis, because the opening `__`" -- " is preceded\nby an alphanumeric and followed by punctuation:" +- "This is not strong emphasis, because the opening `__` is preceded" +- "by an alphanumeric and followed by punctuation:" - "````````````````````````````````" - example - "a__\"foo\"__\n." @@ -3635,10 +3636,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n**foo bar **\n.\n

    **foo bar **

    " - "````````````````````````````````" -- "(Nor can it be interpreted as an emphasized `*foo bar *`, because" -- "of\nRule 11.)" -- "This is not strong emphasis, because the second `**`" -- " is\npreceded by punctuation and followed by an alphanumeric:" +- "(Nor can it be interpreted as an emphasized `*foo bar *`" +- ", because of\nRule 11.)" +- "This is not strong emphasis, because the second `**` is" +- "preceded by punctuation and followed by an alphanumeric:" - "````````````````````````````````" - "example\n**(**foo)\n.\n

    **(**foo)

    " - "````````````````````````````````" @@ -3677,8 +3678,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - example - "__foo bar __\n.\n

    __foo bar __

    " - "````````````````````````````````" -- "This is not strong emphasis, because the second `__`" -- " is\npreceded by punctuation and followed by an alphanumeric:" +- "This is not strong emphasis, because the second `__` is" +- "preceded by punctuation and followed by an alphanumeric:" - "````````````````````````````````" - "example\n__(__foo)\n.\n

    __(__foo)

    " - "````````````````````````````````" @@ -3764,8 +3765,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foobar" - "baz

    \n```" - is precluded by the condition that a delimiter that -- "can both open and close (like the `*` after `foo`" -- ")\ncannot form emphasis if the sum of the lengths of" +- "can both open and close (like the `*` after `foo`)" +- cannot form emphasis if the sum of the lengths of - the delimiter runs containing the opening and - closing delimiters is a multiple of 3 unless - both lengths are multiples of 3. @@ -3937,8 +3938,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo **_**\n.\n

    foo _

    " - "````````````````````````````````" - "Note that when delimiters do not match evenly, Rule 11 determines" -- "that the excess literal `*`" -- " characters will appear outside of the\nemphasis, rather than inside it:" +- "that the excess literal `*` characters will appear outside of the" +- "emphasis, rather than inside it:" - "````````````````````````````````" - example - "**foo*\n.\n

    *foo

    " @@ -3990,8 +3991,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "__foo_\n.\n

    _foo

    " - "````````````````````````````````" - "Note that when delimiters do not match evenly, Rule 12 determines" -- "that the excess literal `_`" -- " characters will appear outside of the\nemphasis, rather than inside it:" +- "that the excess literal `_` characters will appear outside of the" +- "emphasis, rather than inside it:" - "````````````````````````````````" - example - "_foo__\n.\n

    foo_

    " @@ -4139,62 +4140,63 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "There are two basic kinds of links in Markdown. In [inline links]" - " the\ndestination and title are given immediately after the link text. In" - "[reference links] the destination and title are defined elsewhere in\nthe document." -- "A [link text](@)" -- " consists of a sequence of zero or more\ninline elements enclosed by square brackets (" -- "`[` and `]`). The\nfollowing rules apply:" -- "- Links may not contain other links, at any level of nesting. If" +- "A [link text](@) consists of a sequence of zero or more" +- "inline elements enclosed by square brackets (`[` and `]`). The" +- "following rules apply:" +- "-" +- "Links may not contain other links, at any level of nesting. If" - multiple otherwise valid link definitions appear nested inside each - "other, the inner-most definition is used." -- "- Brackets are allowed in the [link text]" -- only if (a) they +- "-" +- "Brackets are allowed in the [link text] only if (a) they" - are backslash-escaped or (b) they appear as a matched pair of - "brackets,\n with an open bracket `[`" - ", a sequence of zero or more inlines, and\n a close bracket" - "`]`." -- "- Backtick [code spans], [autolinks], and raw [HTML" -- "tags] bind more tightly" +- "-" +- "Backtick [code spans], [autolinks], and raw [HTML tags" +- "] bind more tightly" - "than the brackets in link text. Thus, for example," - "`` [foo`]` `` could not be a link text, since the second" - "`]`\n is part of a code span." -- "- The brackets in link text bind more tightly than markers for\n [" +- "-" +- "The brackets in link text bind more tightly than markers for\n [" - "emphasis and strong emphasis]. Thus, for example," - "`*[foo*](url)` is a link." 
- "A [link destination](@) consists of either" - "-" -- "a sequence of zero or more characters between an opening `<`" -- " and a\n closing `>` that contains no line endings or unescaped" -- "`<` or `>` characters, or" +- "a sequence of zero or more characters between an opening `<` and a" +- "closing `>` that contains no line endings or unescaped\n `<` or" +- "`>` characters, or" - "-" -- "a nonempty sequence of characters that does not start with `<`" -- ",\n does not include [ASCII control characters][ASCII control character]\n or" -- "[space] character, and includes parentheses only if (a) they are" +- "a nonempty sequence of characters that does not start with `<`," +- "does not include [ASCII control characters][ASCII control character]\n or [space" +- "] character, and includes parentheses only if (a) they are" - backslash-escaped or (b) they are part of a balanced pair of - "unescaped parentheses.\n (Implementations may impose limits on parentheses nesting to" - "avoid performance issues, but at least three levels of nesting" - "should be supported.)\n\nA [link title](@) consists of either" - "-" - "a sequence of zero or more characters between straight double-quote\n characters (`\"`" -- "), including a `\"`" -- " character only if it is\n backslash-escaped, or" +- "), including a `\"` character only if it is" +- "backslash-escaped, or" - "-" - "a sequence of zero or more characters between straight single-quote\n characters (" -- "`'`), including a `'`" -- " character only if it is\n backslash-escaped, or" +- "`'`), including a `'` character only if it is" +- "backslash-escaped, or" - "-" - "a sequence of zero or more characters between matching parentheses\n (`(...)`" -- "), including a `(` or `)`" -- " character only if it is\n backslash-escaped." +- "), including a `(` or `)` character only if it is" +- backslash-escaped. - "Although [link titles] may span multiple lines, they may not contain\na" - "[blank line]." 
-- "An [inline link](@)" -- " consists of a [link text] followed immediately\nby a left parenthesis `(`" -- ", an optional [link destination], an optional\n[link title]" -- ", and a right parenthesis `)`" -- "." +- "An [inline link](@) consists of a [link text] followed immediately" +- "by a left parenthesis `(`, an optional [link destination], an optional" +- "[link title], and a right parenthesis `)`." - "These four components may be separated by spaces, tabs, and up to one line" - "ending.\nIf both [link destination] and [link title]" -- "are present, they *must*" -- " be\nseparated by spaces, tabs, and up to one line ending." +- "are present, they *must* be" +- "separated by spaces, tabs, and up to one line ending." - "The link's text consists of the inlines contained\nin the [link text" - "] (excluding the enclosing square brackets).\nThe link'" - "s URI consists of the link destination, excluding enclosing\n`<...>`" @@ -4389,15 +4391,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    link

    " - "````````````````````````````````" -- "(Note: `Markdown.pl`" -- did allow double quotes inside a double-quoted +- "(Note: `Markdown.pl` did allow double quotes inside a double-quoted" - "title, and its test suite included a test demonstrating this." - But it is hard to see a good rationale for the extra complexity this - "brings, since there are already many ways---backslash escaping," - "entity and numeric character references, or using a different" - "quote type for the enclosing title---to write titles containing\ndouble quotes." -- "`Markdown.pl`" -- "'s handling of titles has a number" +- "`Markdown.pl`'s handling of titles has a number" - "of other strange features. For example, it allows single-quoted" - "titles in inline links, but not reference links. And, in" - "reference links but not inline links, it allows a title to begin\nwith" @@ -4487,8 +4487,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[foo *bar](baz*)\n." - "

    foo *bar

    " - "````````````````````````````````" -- "Note that brackets that *aren't*" -- " part of links do not take\nprecedence:" +- "Note that brackets that *aren't* part of links do not take" +- "precedence:" - "````````````````````````````````" - example - "*foo [bar* baz]\n." @@ -4514,13 +4514,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "search=](uri)

    " - "````````````````````````````````" - "There are three kinds of [reference link](@)s:" -- "[full](#full-reference-link), [collapsed](#collapsed-reference-link)" -- ",\nand [shortcut](#shortcut-reference-link)." -- "A [full reference link](@)" -- "consists of a [link text] immediately followed by a [link label]" -- "that [matches] a [link reference definition] elsewhere in the document." -- "A [link label](@) begins with a left bracket (`[`)" -- "and ends\nwith the first right bracket (`]`" +- "[full](#full-reference-link), [collapsed](#collapsed-reference-link)," +- "and [shortcut](#shortcut-reference-link)." +- "A [full reference link](@)\nconsists of a [link text]" +- " immediately followed by a [link label]\nthat [matches] a [" +- "link reference definition] elsewhere in the document." +- "A [link label](@) begins with a left bracket (`[`" +- ") and ends\nwith the first right bracket (`]`" - ) that is not backslash-escaped. - "Between these brackets there must be at least one character that is not a space," - "tab, or line ending.\nUnescaped square bracket characters are not allowed inside the" @@ -4529,8 +4529,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "One label [matches](@)" - another just in case their normalized forms are equal. To normalize a - "label, strip off the opening and closing brackets,\nperform the" -- "*Unicode case fold*" -- ", strip leading and trailing" +- "*Unicode case fold*, strip leading and trailing" - "spaces, tabs, and line endings, and collapse consecutive internal" - "spaces, tabs, and line endings to a single space." - If there are multiple @@ -4644,7 +4643,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    Baz

    " - "````````````````````````````````" - "No spaces, tabs, or line endings are allowed between the [link text]" -- "and the\n[link label]:" +- " and the\n[link label]:" - "````````````````````````````````" - example - "[foo] [bar]\n\n[bar]: /url \"title\"\n." @@ -4666,11 +4665,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ". If whitespace is allowed between the" - "link text and the link label, then in the following we will have" - "a single reference link, not two shortcut reference links, as\nintended:" -- "``` markdown\n[foo]\n[bar]\n\n[foo]: /url1" -- "[bar]: /url2\n```\n\n(Note that [shortcut reference links]" -- " were introduced by Gruber\nhimself in a beta version of" -- "`Markdown.pl`" -- ", but never included\nin the official syntax description. Without shortcut reference" +- "``` markdown" +- "[foo]\n[bar]\n\n[foo]: /url1" +- "[bar]: /url2\n```" +- "(Note that [shortcut reference links] were introduced by Gruber" +- "himself in a beta version of `Markdown.pl`, but never included" +- in the official syntax description. Without shortcut reference - "links, it is harmless to allow space between the link text and" - "link label; but once shortcut references are introduced, it is" - "too dangerous to allow this, as it frequently leads to" @@ -4721,8 +4721,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[bar\\\\]: /uri\n\n[bar\\\\]\n." - "

    bar\\

    " - "````````````````````````````````" -- "A [link label] must contain at least one character that is not a space" -- ", tab, or\nline ending:" +- "A [link label]" +- "must contain at least one character that is not a space, tab, or" +- "line ending:" - "````````````````````````````````" - example - "[]\n\n[]: /uri\n.\n

    []

    " @@ -4733,11 +4734,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[\n ]\n\n[\n ]: /uri\n.\n

    [\n]

    \n

    [" - "]: /uri

    " - "````````````````````````````````" -- "A [collapsed reference link](@)" -- "consists of a [link label] that [matches] a\n[" -- "link reference definition] elsewhere in the\ndocument, followed by the string" -- "`[]`" -- ".\nThe contents of the link label are parsed as inlines," +- "A [collapsed reference link](@)\nconsists of a [link label]" +- " that [matches] a\n[link reference definition] elsewhere in the" +- "document, followed by the string `[]`." +- "The contents of the link label are parsed as inlines," - "which are used as the link's text. The link'" - s URI and title are - "provided by the matching reference link definition. Thus,\n`[foo][]`" @@ -4770,11 +4770,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo" - "[]

    " - "````````````````````````````````" -- "A [shortcut reference link](@)" -- "consists of a [link label] that [matches] a\n[" -- "link reference definition] elsewhere in the\ndocument and is not followed by" -- "`[]`" -- or a link label. +- "A [shortcut reference link](@)\nconsists of a [link label]" +- " that [matches] a\n[link reference definition] elsewhere in the" +- "document and is not followed by `[]` or a link label." - "The contents of the link label are parsed as inlines," - "which are used as the link's text. The link's URI and title" - "are provided by the matching link reference definition.\nThus, `[foo]`" @@ -4861,8 +4859,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[foo][bar][baz]\n\n[baz]: /url\n." - "

    [foo]bar

    " - "````````````````````````````````" -- "Here, though, `[foo][bar]` is parsed as a reference," -- "since\n`[bar]` is defined:" +- "Here, though, `[foo][bar]`" +- " is parsed as a reference, since\n`[bar]` is defined:" - "````````````````````````````````" - example - "[foo][bar][baz]\n\n[baz]: /url1" @@ -4870,10 +4868,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foobaz

    " - "````````````````````````````````" -- "Here `[foo]`" -- "is not parsed as a shortcut reference, because it" -- "is followed by a link label (even though `[bar]` is not defined" -- "):" +- "Here `[foo]` is not parsed as a shortcut reference, because it" +- "is followed by a link label (even though `[bar]`" +- "is not defined):" - "````````````````````````````````" - example - "[foo][bar][baz]\n\n[baz]: /url1" @@ -4883,12 +4880,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Images" - "Syntax for images is like the syntax for links, with one" - "difference. Instead of [link text], we have an" -- "[image description](@)" -- ". The rules for this are the\nsame as for [link text]" -- ", except that (a) an\nimage description starts with `![`" -- "rather than `[`" -- ", and\n(b) an image description may contain links." -- An image description has inline elements +- "[image description](@). The rules for this are the\nsame as for" +- "[link text], except that (a) an\nimage description starts with" +- "`![` rather than `[`, and" +- "(b) an image description may contain links.\nAn image description has inline elements" - "as its contents. When an image is rendered to HTML," - "this is standardly used as the image's `alt` attribute." - "````````````````````````````````" @@ -4917,8 +4912,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Though this spec is concerned with parsing, not rendering, it is" - "recommended that in rendering to HTML, only the plain string content\nof the [" - "image description] be used. Note that in" -- "the above example, the alt attribute's value is `foo bar`, not `" -- "foo\n[bar](/url)` or" +- "the above example, the alt attribute's value is `foo bar`, not" +- "`foo\n[bar](/url)` or" - "`foo bar`" - ". Only the plain string\ncontent is rendered, without formatting." - "````````````````````````````````" @@ -5045,20 +5040,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`<` and `>`" - ". 
They are parsed as links, with the URL or email address" - as the link label. -- "A [URI autolink](@) consists of `<`" -- ", followed by an\n[absolute URI] followed by `>`" -- ". It is parsed as" +- "A [URI autolink](@) consists of `<`, followed by an" +- "[absolute URI] followed by `>`. It is parsed as" - "a link to the URI, with the URI as the link's label." -- "An [absolute URI](@)" -- ",\nfor these purposes, consists of a [scheme] followed by a colon (" -- "`:`" -- ")\nfollowed by zero or more characters other than [ASCII control\ncharacters][" -- "ASCII control character], [space], `<`, and `>`" -- ".\nIf the URI includes these characters, they must be percent-encoded" -- "(e.g. `%20` for a space)." -- "For purposes of this spec, a [scheme](@)" -- " is any sequence\nof 2--" -- 32 characters beginning with an ASCII letter and followed +- "An [absolute URI](@),\nfor these purposes, consists of a [scheme" +- "] followed by a colon (`:`)" +- "followed by zero or more characters other than [ASCII control\ncharacters][" +- "ASCII control character], [space], `<`, and `>`." +- "If the URI includes these characters, they must be percent-encoded\n(e.g." +- "`%20` for a space)." +- "For purposes of this spec, a [scheme](@) is any sequence" +- of 2--32 characters beginning with an ASCII letter and followed - "by any combination of ASCII letters, digits, or the symbols plus\n(\"+\"" - "), period (\".\"), or hyphen (\"-\")." - "Here are some valid autolinks:" @@ -5134,8 +5126,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ", followed by an [email address],\nfollowed by `>`" - ". The link's label is the email address,\nand the URL is" - "`mailto:` followed by the email address." 
-- "An [email address](@)" -- ",\nfor these purposes, is anything that matches\nthe" +- "An [email address](@),\nfor these purposes, is anything that matches" +- the - "[non-normative regex from the HTML5\nspec" - "](https://html.spec.whatwg.org/multipage/forms.html#e-mail" - "-state-(type=email)):" @@ -5193,18 +5185,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo@bar.example.com

    " - "````````````````````````````````" - "## Raw HTML" -- "Text between `<` and `>` that looks like an HTML tag is parsed as" -- "a\nraw HTML tag and will be rendered in HTML without escaping." +- "Text between `<` and `>`" +- that looks like an HTML tag is parsed as a +- raw HTML tag and will be rendered in HTML without escaping. - "Tag and attribute names are not limited to current HTML tags," - "so custom tags (and even, say, DocBook tags) may be used" - ".\n\nHere is the grammar for tags:" -- "A [tag name](@)" -- consists of an ASCII letter +- "A [tag name](@) consists of an ASCII letter" - "followed by zero or more ASCII letters, digits, or" - "hyphens (`-`)." -- "An [attribute](@) consists of spaces, tabs, and up to one" -- "line ending,\nan [attribute name], and an optional\n[attribute value specification" -- "]." +- "An [attribute](@)" +- " consists of spaces, tabs, and up to one line ending,\nan [" +- "attribute name], and an optional\n[attribute value specification]." - "An [attribute name](@)\nconsists of an ASCII letter, `_`" - ", or `:`, followed by zero or more ASCII\nletters, digits," - "`_`, `.`, `:`, or `-`" @@ -5212,16 +5204,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - specification restricted to ASCII. HTML5 is laxer.) - "An [attribute value specification](@)" - "consists of optional spaces, tabs, and up to one line ending,\na" -- "`=`" -- " character, optional spaces, tabs, and up to one line ending,\nand an" -- "[attribute value]." -- "An [attribute value](@)" -- "consists of an [unquoted attribute value],\na [" -- "single-quoted attribute value], or a [double-quoted attribute value]." +- "`=` character, optional spaces, tabs, and up to one line ending," +- "and an [attribute value]." +- "An [attribute value](@)\nconsists of an [unquoted attribute value" +- "],\na [single-quoted attribute value], or a [" +- "double-quoted attribute value]." 
- "An [unquoted attribute value](@)" - is a nonempty string of characters not -- "including spaces, tabs, line endings, `\"`, `'`, `=`, `<" -- "`, `>`, or `` ` ``." +- "including spaces, tabs, line endings, `\"`, `'`, `=`," +- "`<`, `>`, or `` ` ``." - "A [single-quoted attribute value](@)\nconsists of `'`" - ", zero or more\ncharacters not including `'`, and a final `'`." - "A [double-quoted attribute value](@)\nconsists of `\"`" @@ -5230,13 +5221,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "tag name],\nzero or more [attributes]" - ", optional spaces, tabs, and up to one line ending,\nan optional" - "`/` character, and a `>` character." -- "A [closing tag](@) consists of the string ``." -- "An [HTML comment](@) consists of ``, `" -- "`, or ``, and `-->` (see the" +- "An [HTML comment](@) consists of ``," +- "``, or ``, and `-->` (see the" - "[HTML spec](https://html.spec.whatwg.org/multipage/" - "parsing.html#markup-declaration-open-state))." - "A [processing instruction](@)\nconsists of the string `` tag):" +- "is parsed as a [hard line break](@) (rendered" +- "in HTML as a `
    ` tag):" - "````````````````````````````````" - "example\nfoo \nbaz\n.\n

    foo
    \nbaz

    " - "````````````````````````````````" @@ -5490,27 +5481,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - In this appendix we describe some features of the parsing strategy - used in the CommonMark reference implementations. - "## Overview\n\nParsing has two phases:" -- "1. In the first phase, lines of input are consumed and the block" +- "1." +- "In the first phase, lines of input are consumed and the block" - "structure of the document---its division into paragraphs, block quotes," - "list items, and so on---is constructed. Text is assigned to these" - blocks but not parsed. Link reference definitions are parsed and a - map of links is constructed. -- "2. In the second phase, the raw text contents of paragraphs and headings" +- "2." +- "In the second phase, the raw text contents of paragraphs and headings" - "are parsed into sequences of Markdown inline elements (strings," - "code spans, links, emphasis, and so on), using the map of link" - references constructed in phase 1. - "At each point in processing, the document is represented as a tree of" - "**blocks**. The root of the tree is a `document`" - " block. The `document`\nmay have any number of other blocks as" -- "**children**" -- ". These children" +- "**children**. These children" - "may, in turn, have other blocks as children." - "The last child of a block\nis normally considered **open**" - ", meaning that subsequent lines of input" - can alter its contents. 
(Blocks that are not open are **closed** - ".)\nHere, for example, is a possible document tree, with the open blocks" - "marked by arrows:" -- "``` tree\n-> document\n -> block_quote\n paragraph" +- "``` tree" +- "-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - " -> list (type=bullet tight=true bullet_char=-)\n list_item" - " paragraph\n \"Qui *quodsi iracundia*\"" @@ -5528,7 +5521,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Once a line has been incorporated into the tree in this way," - "it can be discarded, so input can be read in a stream." - "For each line, we follow this procedure:" -- "1. First we iterate through the open blocks, starting with the" +- "1." +- "First we iterate through the open blocks, starting with the" - "root document, and descending through last children down to the last" - open block. Each block imposes a condition that the line must satisfy - if the block is to remain open. @@ -5537,14 +5531,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - In this phase we may match all or just some of the open - "blocks. But we cannot close unmatched blocks yet, because we may have a" - "[lazy continuation line]." -- "2. Next, after consuming the continuation markers for existing" -- "blocks, we look for new block starts (e.g. `>` for a" -- "block quote).\nIf we encounter a new block start, we close any blocks unmatched" +- "2." +- "Next, after consuming the continuation markers for existing" +- "blocks, we look for new block starts (e.g. `>`" +- for a block quote). +- "If we encounter a new block start, we close any blocks unmatched" - in step 1 before creating the new block as a child of the last - matched container block. -- "3. Finally, we look at the remainder of the line (after block" -- "markers like `>`" -- ", list markers, and indentation have been consumed)." +- "3." 
+- "Finally, we look at the remainder of the line (after block\nmarkers like" +- "`>`, list markers, and indentation have been consumed)." - This is text that can be incorporated into the last open - "block (a paragraph, code block, heading, or raw HTML)." - Setext headings are formed when we see a line of a paragraph @@ -5554,24 +5550,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "one or more reference link definitions. Any remainder becomes a\nnormal paragraph." - We can see how this works by considering how the tree above is - "generated by four lines of Markdown:" -- "``` markdown\n> Lorem ipsum dolor\nsit amet." +- "``` markdown" +- "> Lorem ipsum dolor\nsit amet." - "> - Qui *quodsi iracundia*" - "> - aliquando id\n```" - "At the outset, our document model is just" - "``` tree\n-> document\n```\n\nThe first line of our text," - "``` markdown\n> Lorem ipsum dolor\n```" -- "causes a `block_quote`" -- " block to be created as a child of our\nopen `document`" -- " block, and a `paragraph` block as a child of\nthe" -- "`block_quote`" -- ". Then the text is added to the last open\nblock, the" -- "`paragraph`:" -- "``` tree\n-> document\n -> block_quote\n -> paragraph" +- "causes a `block_quote` block to be created as a child of our" +- "open `document` block, and a `paragraph` block as a child of" +- "the `block_quote`. 
Then the text is added to the last open" +- "block, the `paragraph`:" +- "``` tree" +- "-> document\n -> block_quote\n -> paragraph" - " \"Lorem ipsum dolor\"\n```\n\nThe next line," - "``` markdown\nsit amet.\n```" -- "is a \"lazy continuation\" of the open `paragraph`" -- ", so it gets added\nto the paragraph's text:" -- "``` tree\n-> document\n -> block_quote\n -> paragraph" +- "is a \"lazy continuation\" of the open `paragraph`, so it gets added" +- "to the paragraph's text:" +- "``` tree" +- "-> document\n -> block_quote\n -> paragraph" - " \"Lorem ipsum dolor\\nsit amet.\"\n```\n\nThe third line," - "``` markdown\n> - Qui *quodsi iracundia*\n```" - "causes the `paragraph` block to be closed, and a new" @@ -5579,7 +5576,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ". A `list_item` is also\nadded as a child of the" - "`list`, and a `paragraph` as a child of\nthe" - "`list_item`. The text is then added to the new `paragraph`:" -- "``` tree\n-> document\n -> block_quote\n paragraph" +- "``` tree" +- "-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - "-> list (type=bullet tight=true bullet_char=-)" - " -> list_item\n -> paragraph" @@ -5590,7 +5588,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "opened up as child of the `list`. A `paragraph`" - "is added as a child of the new `list_item`" - ", to contain the text.\nWe thus obtain the final tree:" -- "``` tree\n-> document\n -> block_quote\n paragraph" +- "``` tree" +- "-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - " -> list (type=bullet tight=true bullet_char=-)\n list_item" - " paragraph\n \"Qui *quodsi iracundia*\"" @@ -5602,15 +5601,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - string contents of paragraphs and headings as inlines. At this - "point we have seen all the link reference definitions, so we can" - resolve reference links as we go. 
-- "``` tree\ndocument\n block_quote\n paragraph" -- " str \"Lorem ipsum dolor\"\n softbreak" -- "str \"sit amet.\"" +- "``` tree" +- "document\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"" +- " softbreak\n str \"sit amet.\"" - " list (type=bullet tight=true bullet_char=-)\n list_item" - " paragraph\n str \"Qui \"\n emph" - " str \"quodsi iracundia\"\n list_item" -- " paragraph\n str \"aliquando id\"\n```\n\nNotice how the" -- "[line ending] in the first paragraph has\nbeen parsed as a" -- "`softbreak`" +- " paragraph\n str \"aliquando id\"\n```" +- "Notice how the [line ending] in the first paragraph has" +- "been parsed as a `softbreak`" - ", and the asterisks in the first list item\nhave become an" - "`emph`." - "### An algorithm for parsing nested emphasis and links" @@ -5629,8 +5628,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- whether the delimiter is a potential opener, a potential closer," - or both (which depends on what sort of characters precede - and follow the delimiters). -- "When we hit a `]` character, we call the *look for link" -- "or image*\nprocedure (see below)." +- "When we hit a `]` character, we call the" +- "*look for link or image*\nprocedure (see below)." - "When we hit the end of the input, we call the *process emphasis*" - "procedure (see below), with `stack_bottom` = NULL." - "#### *look for link or image*" @@ -5638,57 +5637,60 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "through the stack for an opening `[` or `![` delimiter." - "- If we don't find one, we return a literal text node `]" - "`." -- "- If we do find one, but it's not *active*, we remove" -- "the inactive\n delimiter from the stack, and return a literal text node" -- "`]`." 
-- "- If we find one and it's active, then we parse ahead to see" -- "if\n we have an inline link/image, reference link/image, collapsed reference" +- "-" +- "If we do find one, but it's not *active*" +- ", we remove the inactive" +- "delimiter from the stack, and return a literal text node `]`." +- "-" +- "If we find one and it's active, then we parse ahead to see if" +- "we have an inline link/image, reference link/image, collapsed reference" - "link/image, or shortcut reference link/image." -- "+ If we don't, then we remove the opening delimiter from the" +- + +- "If we don't, then we remove the opening delimiter from the" - "delimiter stack and return a literal text node `]`." - "+ If we do, then" -- "* We return a link or image node whose children are the inlines" +- "*" +- We return a link or image node whose children are the inlines - after the text node pointed to by the opening delimiter. -- "* We run *process emphasis* on these inlines, with the `[`" -- "opener\n as `stack_bottom`." -- "* We remove the opening delimiter." +- "*" +- "We run *process emphasis* on these inlines, with the `[` opener" +- "as `stack_bottom`.\n\n * We remove the opening delimiter." - "*" - "If we have a link (and not an image), we also set all" -- "`[` delimiters before the opening delimiter to *inactive*" -- ". (This\n will prevent us from getting links within links.)" +- "`[` delimiters before the opening delimiter to *inactive*. (This" +- will prevent us from getting links within links.) - "#### *process emphasis*" -- "Parameter `stack_bottom`" -- " sets a lower bound to how far we\ndescend in the [delimiter stack" -- "]. If it is NULL, we can" +- "Parameter `stack_bottom` sets a lower bound to how far we" +- "descend in the [delimiter stack]. If it is NULL, we can" - "go all the way to the bottom. Otherwise, we stop before" - "visiting `stack_bottom`." 
-- "Let `current_position`" -- " point to the element on the [delimiter stack]\njust above `stack_bottom`" -- " (or the first element if `stack_bottom`\nis NULL)." +- "Let `current_position` point to the element on the [delimiter stack]" +- "just above `stack_bottom` (or the first element if `stack_bottom`" +- is NULL). - "We keep track of the `openers_bottom` for each delimiter\ntype (" -- "`*`, `_`" -- "), indexed to the length of the closing delimiter run" +- "`*`, `_`), indexed to the length of the closing delimiter run" - (modulo 3) and to whether the closing delimiter can also be an - "opener. Initialize this to `stack_bottom`." - "Then we repeat the following until we run out of potential\nclosers:" - "-" -- "Move `current_position`" -- forward in the delimiter stack (if needed) -- "until we find the first potential closer with delimiter `*` or `_`" -- ".\n (This will be the potential closer closest" -- to the beginning of the input -- the first one in parse order.) +- "Move `current_position` forward in the delimiter stack (if needed)" +- "until we find the first potential closer with delimiter `*` or `_`." +- "(This will be the potential closer closest\n to the beginning of the input --" +- the first one in parse order.) - "-" -- "Now, look back in the stack (staying above `stack_bottom`" -- " and\n the `openers_bottom`" -- " for this delimiter type) for the\n first matching potential opener (\"matching\"" -- " means same delimiter).\n\n- If one is found:" -- "+ Figure out whether we have emphasis or strong emphasis:" +- "Now, look back in the stack (staying above `stack_bottom` and" +- "the `openers_bottom` for this delimiter type) for the" +- "first matching potential opener (\"matching\" means same delimiter)." +- "- If one is found:" +- + +- "Figure out whether we have emphasis or strong emphasis:" - "if both closer and opener spans have length >= 2, we have" - "strong, otherwise regular." 
-- "+ Insert an emph or strong emph node accordingly, after" +- + +- "Insert an emph or strong emph node accordingly, after" - the text node corresponding to the opener. -- + Remove any delimiters between the opener and closer from -- the delimiter stack. +- + +- "Remove any delimiters between the opener and closer from\n the delimiter stack." - + - Remove 1 (for regular emph) or 2 (for strong emph) - delimiters @@ -5698,16 +5700,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`current_position` to the next element in the stack." - "- If none is found:" - + -- "Set `openers_bottom` to the element before `current_position`" -- "." +- "Set `openers_bottom` to the element before `current_position`." - (We know that there are no openers for this kind of closer up to - and - "including this point, so this puts a lower bound on future searches.)" - + -- "If the closer at `current_position`" -- "is not a potential opener," +- "If the closer at `current_position` is not a potential opener," - "remove it from the delimiter stack (since we know it can't" - be a closer either). - "+ Advance `current_position` to the next element in the stack." -- "After we're done, we remove all delimiters above `stack_bottom` from" -- "the\ndelimiter stack." +- "After we're done, we remove all delimiters above `stack_bottom`" +- " from the\ndelimiter stack." 
diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md-2.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md-2.snap index 5fb9be0..358de17 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md-2.snap @@ -6,7 +6,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------" - "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------" - "# Lists" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. 
Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3." +- "```" +- "1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3." - "Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item." 
- "⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback" - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------" @@ -18,10 +19,12 @@ input_file: tests/inputs/markdown/github_flavored.md - "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------" - "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : 
IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```" - "```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```" -- "```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}" -- "echo URI::ME . 
URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------" +- "```php" +- "require_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME ." +- "URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------" - "# Tables" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. 
You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |" +- "```" +- "Colons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |" - "| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 
|\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3" - "| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------" - "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. 
Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap index bbc7bae..6961f9b 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap @@ -4,7 +4,8 @@ expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- - "# Headers" -- "```\n# h1 Heading 8-)\n## h2 Heading" +- "```" +- "# h1 Heading 8-)\n## h2 Heading" - "### h3 Heading\n#### h4 Heading\n##### h5 Heading" - "###### h6 Heading" - "Alternatively, for H1 and H2, an underline-ish style:" @@ -36,7 +37,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "*This is italic text*\n\n_This is italic text_" - "~~Strikethrough~~\n\n------" - "# Lists" -- "```\n1. First ordered list item\n2. Another item" +- "```" +- "1. First ordered list item\n2. Another item" - "⋅⋅* Unordered sub-list.\n1." - "Actual numbers don't matter, just that it's a number" - "⋅⋅1. Ordered sub-list\n4." @@ -83,11 +85,12 @@ input_file: tests/inputs/markdown/github_flavored.md - "GFM line break behaviour, where trailing spaces are not required.)" - "* Unordered list can use asterisks\n- Or minuses" - + Or pluses -- "1. Make my changes\n 1. Fix bug" +- 1. Make my changes +- 1. Fix bug - " 2. Improve formatting\n - Make the headings bigger" - "2. Push my commits to GitHub\n3. 
Open a pull request" -- " * Describe my changes\n * Mention all the members of my team" -- "* Ask for feedback" +- "* Describe my changes" +- " * Mention all the members of my team\n * Ask for feedback" - "+ Create a list by starting a line with `+`, `-`, or `" - "*`\n+ Sub-lists are made by indenting 2 spaces:" - "- Marker character change forces new list start:" @@ -95,8 +98,9 @@ input_file: tests/inputs/markdown/github_flavored.md - + Facilisis in pretium nisl aliquet - "- Nulla volutpat aliquam velit\n+ Very easy!\n\n------" - "# Task lists" -- "```\n- [x] Finish my changes" -- "- [ ] Push my commits to GitHub\n- [ ] Open a pull request" +- "```" +- "- [x] Finish my changes\n- [ ] Push my commits to GitHub" +- "- [ ] Open a pull request" - "- [x] @mentions, #refs, [links](), **" - "formatting**, and tags supported" - "- [x] list syntax required (any unordered or ordered list supported)" @@ -104,8 +108,9 @@ input_file: tests/inputs/markdown/github_flavored.md - "- [ ] this is an incomplete item\n```" - "- [x] Finish my changes\n- [ ] Push my commits to GitHub" - "- [ ] Open a pull request" -- "- [x] @mentions, #refs, [links](), **" -- "formatting**, and tags supported" +- "-" +- "[x] @mentions, #refs, [links](), **formatting**" +- ", and tags supported" - "- [x] list syntax required (any unordered or ordered list supported)" - "- [ ] this is a complete item" - "- [ ] this is an incomplete item\n\n------" @@ -118,7 +123,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "Let's rename \\*our-new-project\\* to \\*our-old-project" - "\\*.\n\n------" - "# Links" -- "```\n[I'm an inline-style link](https://www.google.com)" +- "```" +- "[I'm an inline-style link](https://www.google.com)" - "[I'm an inline-style link with title](https://www.google.com \"" - "Google's Homepage\")" - "[I'm a reference-style link][Arbitrary case-insensitive reference text]" @@ -140,15 +146,15 @@ input_file: tests/inputs/markdown/github_flavored.md - "[You can use numbers for 
reference-style link definitions][1]" - "Or leave it empty and use the [link text itself]." - URLs and URLs in angle brackets will automatically get turned into links. -- "http://www.example.com or " -- " and sometimes\nexample.com (but not on Github, for example)." +- "http://www.example.com or and sometimes" +- "example.com (but not on Github, for example)." - Some text to show that the reference links can follow later. - "[arbitrary case-insensitive reference text]: https://www.mozilla.org" - "[1]: http://slashdot.org" - "[link text itself]: http://www.reddit.com\n\n------" - "# Images" -- "```\nHere's our logo (hover to see the title text):" -- "Inline-style:\n![" +- "```" +- "Here's our logo (hover to see the title text):\n\nInline-style:\n![" - "alt text](https://github.com/adam-p/markdown-here/raw/master" - "/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:" - "![alt text][logo]" @@ -165,29 +171,32 @@ input_file: tests/inputs/markdown/github_flavored.md - "Here's our logo (hover to see the title text):" - "Inline-style:" - "![" -- "alt text](https://github.com/adam-p/markdown-here/raw/master" -- "/src/common/images/icon48.png \"Logo Title Text 1\")" +- alt text +- "](https://github.com/adam-p/markdown-here/raw/master/src/common" +- "/images/icon48.png \"Logo Title Text 1\")" - "Reference-style:\n![alt text][logo]" - "[logo]: https://github.com/adam-p/markdown-here/raw/master" - "/src/common/images/icon48.png \"Logo Title Text 2\"" - "![Minion](https://octodex.github.com/images/minion.png)" - "![" -- "Stormtroopocat](https://octodex.github.com/images/" -- "stormtroopocat.jpg \"The Stormtroopocat\")" +- Stormtroopocat +- "](https://octodex.github.com/images/stormtroopocat.jpg" +- "\"The Stormtroopocat\")" - "Like links, Images also have a footnote style syntax" - "![Alt text][id]" - "With a reference later in the document defining the URL location:" - "[id]: https://octodex.github.com/images/dojocat.jpg" - "\"The Dojocat\"\n\n------" -- 
"# [Footnotes](https://github.com/markdown-it/markdown-it-" -- footnote) -- "```\nFootnote 1 link[^first]." -- "Footnote 2 link[^second]." +- "#" +- "[Footnotes](https://github.com/markdown-it/markdown-it-footnote" +- ) +- "```" +- "Footnote 1 link[^first].\n\nFootnote 2 link[^second]." - "Inline footnote^[Text of inline footnote] definition." - "Duplicated footnote reference[^second]." - "[^first]: Footnote **can have markup**\n\n and multiple paragraphs." -- "[^second]: Footnote text.\n```\n\nFootnote 1 link" -- "[^first].\n\nFootnote 2 link[^second]." +- "[^second]: Footnote text.\n```" +- "Footnote 1 link[^first].\n\nFootnote 2 link[^second]." - "Inline footnote^[Text of inline footnote] definition." - "Duplicated footnote reference[^second]." - "[^first]: Footnote **can have markup**\n\n and multiple paragraphs." @@ -195,21 +204,24 @@ input_file: tests/inputs/markdown/github_flavored.md - "# Code and Syntax Highlighting" - "```\nInline `code` has `back-ticks around` it.\n```" - "Inline `code` has `back-ticks around` it." 
-- "```c#\nusing System.IO.Compression;" -- "#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{" -- " [Obsolete(\"...\")]\n class Program : IInterface\n {" +- "```c#" +- "using System.IO.Compression;\n\n#pragma warning disable 414, 3021" +- "namespace MyApplication\n{\n [Obsolete(\"...\")]" +- " class Program : IInterface\n {" - " public static List JustDoIt(int count)\n {" - "Console.WriteLine($\"Hello {Name}!\");" - "return new List(new int[] { 1, 2," - "3 })\n }\n }\n}\n```" -- "```css\n@font-face {" +- "```css" +- "@font-face {" - "font-family: Chunkfive; src: url('Chunkfive.otf');" - "}\n\nbody, .usertext {" - "color: #F0F0F0; background: #600;" - " font-family: Chunkfive, sans;\n}\n\n@import url(print.css);" - "@media print {\n a[href^=http]::after {" - " content: attr(href)\n }\n}\n```" -- "```javascript\nfunction $initHighlight(block, cls) {\n try {" +- "```javascript" +- "function $initHighlight(block, cls) {\n try {" - "if (cls.search(/\\bno\\-highlight\\b/) != -1)" - "return process(block, true, 0x0F) +" - " ` class=\"${cls}\"`;\n } catch (e) {" @@ -217,10 +229,10 @@ input_file: tests/inputs/markdown/github_flavored.md - for (var i = 0 / 2; i < classes.length; - "i++) {\n if (checkCondition(classes[i]) === undefined)" - " console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```" -- "```php\nrequire_once 'Zend/Uri/Http.php';" -- "namespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}" -- "abstract class URI extends BaseURI implements Factory\n{\n abstract function test();" -- public static $st1 = 1; +- "```php" +- "require_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory" +- "{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory" +- "{\n abstract function test();\n\n public static $st1 = 1;" - " const ME = \"Yo\";\n var $list = NULL;" - " private $var;\n\n /**\n * Returns a URI\n *" - " * @return URI\n */" @@ -241,7 +253,8 @@ input_file: 
tests/inputs/markdown/github_flavored.md - "__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere" - "```\n\n------" - "# Tables" -- "```\nColons can be used to align columns." +- "```" +- Colons can be used to align columns. - "| Tables | Are | Cool |" - "| ------------- |:-------------:| -----:|" - "| col 3 is | right-aligned | $1600 |" @@ -268,14 +281,14 @@ input_file: tests/inputs/markdown/github_flavored.md - "| Name | Character |\n| --- | --- |" - "| Backtick | ` |\n| Pipe | \\| |\n```" - Colons can be used to align columns. -- "| Tables | Are | Cool |" -- "| ------------- |:-------------:| -----:|" +- "| Tables | Are | Cool" +- "|\n| ------------- |:-------------:| -----:|" - "| col 3 is | right-aligned | $1600 |" - "| col 2 is | centered | $12 |" - "| zebra stripes | are neat | $1 |" - There must be at least 3 dashes separating each header cell. -- "The outer pipes (|) are optional, and you don't need to make" -- the +- "The outer pipes (|) are optional, and you don'" +- t need to make the - raw Markdown line up prettily. You can also use inline Markdown. - "Markdown | Less | Pretty\n--- | --- | ---" - "*Still* | `renders` | **nicely**" @@ -290,14 +303,15 @@ input_file: tests/inputs/markdown/github_flavored.md - "| `git status` | List all *new or modified* files |" - "| `git diff` | Show file differences that **haven't been** staged" - "|" -- "| Left-aligned | Center-aligned | Right-aligned |" -- "| :--- | :---: | ---: |" +- "| Left-aligned | Center-aligned | Right-aligned" +- "|\n| :--- | :---: | ---: |" - "| git status | git status | git status |" - "| git diff | git diff | git diff |" - "| Name | Character |\n| --- | --- |" - "| Backtick | ` |\n| Pipe | \\| |\n\n------" - "# Blockquotes" -- "```\n> Blockquotes are very handy in email to emulate reply text." +- "```" +- "> Blockquotes are very handy in email to emulate reply text." - "> This line is part of the same quote.\n\nQuote break." 
- "> This is a very long line that will still be quoted properly when it wraps" - "." @@ -307,10 +321,12 @@ input_file: tests/inputs/markdown/github_flavored.md - "> Blockquotes can also be nested..." - ">> ...by using additional greater-than signs right next to each other..." - "> > > ...or with spaces between arrows.\n```" -- "> Blockquotes are very handy in email to emulate reply text.\n>" +- ">" +- "Blockquotes are very handy in email to emulate reply text.\n>" - "This line is part of the same quote.\n\nQuote break." -- "> This is a very long line that will still be quoted properly when it wraps" -- ". Oh boy let'" +- ">" +- This is a very long line that will still be quoted properly when it wraps. +- "Oh boy let'" - s keep writing to make sure this is long enough to actually wrap for everyone. - "Oh, you can *put* **Markdown** into a blockquote." - "> Blockquotes can also be nested...\n>" @@ -352,7 +368,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "IMAGE ALT TEXT HERE](http://img.youtube.com/vi/" - "YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com" - "/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```" -- "[![IMAGE ALT TEXT HERE" +- "[![" +- IMAGE ALT TEXT HERE - "](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/" - YouTube_logo_2015.svg/1200px-YouTube_logo_2015 - ".svg.png)](https://www.youtube.com/watch?"