diff --git a/Cargo.toml b/Cargo.toml index 596b7db..e2315af 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,6 +29,7 @@ rustdoc-args = ["--cfg", "docsrs"] # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +ahash = "0.8.11" auto_enums = "0.8.5" either = "1.10.0" itertools = "0.12.1" diff --git a/benches/output.txt b/benches/output.txt index 6bf8a61..178813a 100644 --- a/benches/output.txt +++ b/benches/output.txt @@ -1,474 +1,474 @@ -running 67 tests -iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii -test result: ok. 0 passed; 0 failed; 67 ignored; 0 measured; 0 filtered out; finished in 0.00s +running 70 tests +iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiiii +test result: ok. 0 passed; 0 failed; 70 ignored; 0 measured; 0 filtered out; finished in 0.00s chunk_size fastest │ slowest │ median │ mean │ samples │ iters ├─ markdown │ │ │ │ │ │ ├─ characters │ │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 331.1 ms │ 363 ms │ 332.2 ms │ 332.6 ms │ 100 │ 100 -│ │ │ 619.1 KB/s │ 564.7 KB/s │ 617.1 KB/s │ 616.3 KB/s │ │ +│ │ │ ╰─ commonmark_spec 297.3 ms │ 322.7 ms │ 300.7 ms │ 301.3 ms │ 100 │ 100 +│ │ │ 689.6 KB/s │ 635.1 KB/s │ 681.7 KB/s │ 680.4 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 22093 │ 0 │ 22093 │ 21872 │ │ -│ │ │ 77.31 MB │ 0 B │ 77.31 MB │ 76.54 MB │ │ +│ │ │ 14088 │ 0 │ 14088 │ 13947 │ │ +│ │ │ 80.3 MB │ 0 B │ 80.3 MB │ 79.5 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 22094 │ 0 │ 22094 │ 21873 │ │ -│ │ │ 283.2 MB │ 0 B │ 283.2 MB │ 280.3 MB │ │ +│ │ │ 14089 │ 0 │ 14089 │ 13948 │ │ +│ │ │ 297.7 MB │ 0 B │ 297.7 MB │ 294.7 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 47400 │ 0 │ 47400 │ 46926 │ │ -│ │ │ 205.7 MB │ 0 B │ 205.7 MB │ 203.6 MB │ │ +│ │ │ 47478 │ 0 │ 47478 │ 47003 │ │ +│ │ │ 217.2 MB │ 0 B │ 217.2 MB │ 215 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 0 │ 13 │ 12.87 │ │ │ │ │ 94 B │ 0 B │ 94 B │ 93.06 B │ │ │ │ ├─ 512 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 46.24 ms │ 47.33 ms │ 46.48 ms │ 46.49 ms │ 100 │ 100 -│ │ │ 4.433 MB/s │ 4.331 MB/s │ 4.41 MB/s │ 4.41 MB/s │ │ +│ │ │ ╰─ commonmark_spec 37.85 ms │ 39.52 ms │ 38.56 ms │ 38.65 ms │ 100 │ 100 +│ │ │ 5.416 MB/s │ 5.187 MB/s │ 5.316 MB/s │ 5.304 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 2599 │ 2599 │ 2599 │ 2599 │ │ -│ │ │ 9.381 MB │ 9.381 MB │ 9.381 MB │ 9.381 MB │ │ +│ │ │ 1614 │ 1614 │ 1614 │ 1614 │ │ +│ │ │ 9.472 MB │ 9.472 MB │ 9.472 MB │ 9.472 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 2600 │ 2600 │ 2600 │ 2600 │ │ -│ │ │ 34.7 MB │ 34.7 MB │ 34.7 MB │ 34.7 MB │ │ +│ │ │ 1615 │ 1615 │ 1615 │ 1615 │ │ +│ │ │ 35.51 MB │ 35.51 MB │ 35.51 MB │ 35.51 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 5607 │ 5607 │ 5607 │ 5607 │ │ -│ │ │ 25.12 MB │ 25.12 MB │ 25.12 MB │ 25.12 MB │ │ +│ │ │ 5382 │ 5382 │ 5382 │ 5382 │ │ +│ │ │ 25.84 MB │ 25.84 MB │ 25.84 MB │ 25.84 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 4096 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 8.622 ms │ 8.967 ms │ 8.651 ms │ 8.666 ms │ 100 │ 100 -│ │ │ 23.77 MB/s │ 22.86 MB/s │ 23.69 MB/s │ 23.65 MB/s │ │ +│ │ │ ╰─ commonmark_spec 6.729 ms │ 7.102 ms │ 6.82 ms │ 6.837 ms │ 100 │ 100 +│ │ │ 30.46 MB/s │ 28.86 MB/s │ 30.06 MB/s │ 29.98 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 406 │ 406 │ 406 │ 406 │ │ -│ │ │ 1.701 MB │ 1.701 MB │ 1.701 MB │ 1.701 MB │ │ +│ │ │ 260 │ 260 │ 260 │ 260 │ │ +│ │ │ 1.663 MB │ 1.663 MB │ 1.663 MB │ 1.663 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 407 │ 407 │ 407 │ 407 │ │ -│ │ │ 6.409 MB │ 6.409 MB │ 6.409 MB │ 6.409 MB │ │ +│ │ │ 261 │ 261 │ 261 │ 261 │ │ +│ │ │ 6.346 MB │ 6.346 MB │ 6.346 MB │ 6.346 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 901 │ 901 │ 901 │ 901 │ │ -│ │ │ 4.503 MB │ 4.503 MB │ 4.503 MB │ 4.503 MB │ │ +│ │ │ 812 │ 812 │ 812 │ 812 │ │ +│ │ │ 4.478 MB │ 4.478 MB │ 4.478 MB │ 4.478 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ╰─ 32768 │ │ │ │ │ -│ │ ╰─ commonmark_spec 2.086 ms │ 2.303 ms │ 2.099 ms │ 2.111 ms │ 100 │ 100 -│ │ 98.26 MB/s │ 89.02 MB/s │ 97.67 MB/s │ 97.09 MB/s │ │ +│ │ ╰─ commonmark_spec 1.716 ms │ 1.931 ms │ 1.772 ms │ 1.774 ms │ 100 │ 100 +│ │ 119.4 MB/s │ 106.1 MB/s │ 115.6 MB/s │ 115.5 MB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 81 │ 81 │ 81 │ 81 │ │ -│ │ 529.3 KB │ 529.3 KB │ 529.3 KB │ 529.3 KB │ │ +│ │ 65 │ 65 │ 65 │ 65 │ │ +│ │ 528.4 KB │ 528.4 KB │ 528.4 KB │ 528.4 KB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 82 │ 82 │ 82 │ 82 │ │ -│ │ 2.123 MB │ 2.123 MB │ 2.123 MB │ 2.123 MB │ │ +│ │ 66 │ 66 │ 66 │ 66 │ │ +│ │ 2.122 MB │ 2.122 MB │ 2.122 MB │ 2.122 MB │ │ │ │ grow: │ │ │ │ │ -│ │ 150 │ 150 │ 150 │ 150 │ │ +│ │ 148 │ 148 │ 148 │ 148 │ │ │ │ 1.388 MB │ 1.388 MB │ 1.388 MB │ 1.388 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ tiktoken │ │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 867.2 ms │ 955.8 ms │ 923.7 ms │ 909.9 ms │ 100 │ 100 -│ │ │ 236.4 KB/s │ 214.4 KB/s │ 221.9 KB/s │ 225.3 KB/s │ │ +│ │ │ ╰─ commonmark_spec 838.2 ms │ 863.3 ms │ 844.7 ms │ 847.4 ms │ 100 │ 100 +│ │ │ 244.5 KB/s │ 237.4 KB/s │ 242.6 KB/s │ 241.9 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 8672718 │ 8672718 │ 8672718 │ 8672718 │ │ -│ │ │ 420.8 MB │ 420.8 MB │ 420.8 MB │ 420.8 MB │ │ +│ │ │ 8080680 │ 8080680 │ 8080680 │ 8080680 │ │ +│ │ │ 394.7 MB │ 394.7 MB │ 394.7 MB │ 394.7 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 8984226 │ 8984226 │ 8984226 │ 8984226 │ │ -│ │ │ 789.1 MB │ 789.1 MB │ 789.1 MB │ 789.1 MB │ │ +│ │ │ 8392188 │ 8392188 │ 8392188 │ 8392188 │ │ +│ │ │ 748.8 MB │ 748.8 MB │ 748.8 MB │ 748.8 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 1576174 │ 1576174 │ 1576174 │ 1576174 │ │ -│ │ │ 349.7 MB │ 349.7 MB │ 349.7 MB │ 349.7 MB │ │ +│ │ │ 1466095 │ 1466095 │ 1466095 │ 1466095 │ │ +│ │ │ 335.6 MB │ 335.6 MB │ 335.6 MB │ 335.6 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 512 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 304.4 ms │ 314.1 ms │ 309.9 ms │ 309.4 ms │ 100 │ 100 -│ │ │ 673.4 KB/s │ 652.7 KB/s │ 661.4 KB/s │ 662.6 KB/s │ │ +│ │ │ ╰─ commonmark_spec 287.9 ms │ 291 ms │ 288.6 ms │ 288.8 ms │ 100 │ 100 +│ │ │ 711.9 KB/s │ 704.3 KB/s │ 710.1 KB/s │ 709.8 KB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 3289368 │ 3289368 │ 3289368 │ 3289368 │ │ -│ │ │ 154 MB │ 154 MB │ 154 MB │ 154 MB │ │ +│ │ │ 2956753 │ 2956753 │ 2956753 │ 2956753 │ │ +│ │ │ 138.6 MB │ 138.6 MB │ 138.6 MB │ 138.6 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 3600876 │ 3600876 │ 3600876 │ 3600876 │ │ -│ │ │ 286.6 MB │ 286.6 MB │ 286.6 MB │ 286.6 MB │ │ +│ │ │ 3268261 │ 3268261 │ 3268261 │ 3268261 │ │ +│ │ │ 261.3 MB │ 261.3 MB │ 261.3 MB │ 261.3 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 615911 │ 615911 │ 615911 │ 615911 │ │ -│ │ │ 114.1 MB │ 114.1 MB │ 114.1 MB │ 114.1 MB │ │ +│ │ │ 551196 │ 551196 │ 551196 │ 551196 │ │ +│ │ │ 104.1 MB │ 104.1 MB │ 104.1 MB │ 104.1 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ├─ 4096 │ │ │ │ │ -│ │ │ ╰─ commonmark_spec 189 ms │ 196.6 ms │ 189.9 ms │ 190.3 ms │ 100 │ 100 -│ │ │ 1.084 MB/s │ 1.042 MB/s │ 1.079 MB/s │ 1.076 MB/s │ │ +│ │ │ ╰─ commonmark_spec 157 ms │ 159.7 ms │ 157.5 ms │ 157.5 ms │ 100 │ 100 +│ │ │ 1.305 MB/s │ 1.283 MB/s │ 1.301 MB/s │ 1.301 MB/s │ │ │ │ │ alloc: │ │ │ │ │ -│ │ │ 2065547 │ 2065547 │ 2065547 │ 2065547 │ │ -│ │ │ 95.46 MB │ 95.46 MB │ 95.46 MB │ 95.46 MB │ │ +│ │ │ 1651804 │ 1651804 │ 1651804 │ 1651804 │ │ +│ │ │ 76.5 MB │ 76.5 MB │ 76.5 MB │ 76.5 MB │ │ │ │ │ dealloc: │ │ │ │ │ -│ │ │ 2377055 │ 2377055 │ 2377055 │ 2377055 │ │ -│ │ │ 182.4 MB │ 182.4 MB │ 182.4 MB │ 182.4 MB │ │ +│ │ │ 1963312 │ 1963312 │ 1963312 │ 1963312 │ │ +│ │ │ 150.6 MB │ 150.6 MB │ 150.6 MB │ 150.6 MB │ │ │ │ │ grow: │ │ │ │ │ -│ │ │ 384264 │ 384264 │ 384264 │ 384264 │ │ -│ │ │ 68.42 MB │ 68.42 MB │ 68.42 MB │ 68.42 MB │ │ +│ │ │ 308264 │ 308264 │ 308264 │ 308264 │ │ +│ │ │ 55.62 MB │ 55.62 MB │ 55.62 MB │ 55.62 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ │ ╰─ 32768 │ │ │ │ │ -│ │ ╰─ commonmark_spec 82.27 ms │ 84.55 ms │ 82.57 ms │ 82.7 ms │ 100 │ 100 -│ │ 2.491 MB/s │ 2.424 MB/s │ 2.482 MB/s │ 2.479 MB/s │ │ +│ │ ╰─ commonmark_spec 73.1 ms │ 74.41 ms │ 73.83 ms │ 73.77 ms │ 100 │ 100 +│ │ 2.804 MB/s │ 2.755 MB/s │ 2.776 MB/s │ 2.779 MB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 878315 │ 878315 │ 878315 │ 878315 │ │ -│ │ 40.83 MB │ 40.83 MB │ 40.83 MB │ 40.83 MB │ │ +│ │ 750031 │ 750031 │ 750031 │ 750031 │ │ +│ │ 34.96 MB │ 34.96 MB │ 34.96 MB │ 34.96 MB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 1189823 │ 1189823 │ 1189823 │ 1189823 │ │ -│ │ 88.71 MB │ 88.71 MB │ 88.71 MB │ 88.71 MB │ │ +│ │ 1061539 │ 1061539 │ 1061539 │ 1061539 │ │ +│ │ 78.87 MB │ 78.87 MB │ 78.87 MB │ 78.87 MB │ │ │ │ grow: │ │ │ │ │ -│ │ 165295 │ 165295 │ 165295 │ 165295 │ │ -│ │ 29.35 MB │ 29.35 MB │ 29.35 MB │ 29.35 MB │ │ +│ │ 141689 │ 141689 │ 141689 │ 141689 │ │ +│ │ 25.39 MB │ 25.39 MB │ 25.39 MB │ 25.39 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ╰─ tokenizers │ │ │ │ │ │ ├─ 64 │ │ │ │ │ -│ │ ╰─ commonmark_spec 1.609 s │ 1.769 s │ 1.631 s │ 1.637 s │ 100 │ 100 -│ │ 127.4 KB/s │ 115.8 KB/s │ 125.6 KB/s │ 125.2 KB/s │ │ +│ │ ╰─ commonmark_spec 1.467 s │ 1.537 s │ 1.48 s │ 1.482 s │ 100 │ 100 +│ │ 139.6 KB/s │ 133.3 KB/s │ 138.5 KB/s │ 138.2 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 41344053 │ 41344053 │ 41344053 │ 41344053 │ │ -│ │ 3.939 GB │ 3.939 GB │ 3.939 GB │ 3.939 GB │ │ +│ │ 36199855 │ 36199855 │ 36199855 │ 36199855 │ │ +│ │ 3.467 GB │ 3.467 GB │ 3.467 GB │ 3.467 GB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 41402106 │ 41402106 │ 41402106 │ 41402106 │ │ -│ │ 6.152 GB │ 6.152 GB │ 6.152 GB │ 6.152 GB │ │ +│ │ 36257908 │ 36257908 │ 36257908 │ 36257908 │ │ +│ │ 5.442 GB │ 5.442 GB │ 5.442 GB │ 5.442 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 1566195 │ 1566195 │ 1566195 │ 1566195 │ │ -│ │ 2.208 GB │ 2.208 GB │ 2.208 GB │ 2.208 GB │ │ +│ │ 1259076 │ 1259076 │ 1259076 │ 1259076 │ │ +│ │ 1.969 GB │ 1.969 GB │ 1.969 GB │ 1.969 GB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ 512 │ │ │ │ │ -│ │ ╰─ commonmark_spec 674 ms │ 705.3 ms │ 685.2 ms │ 685.4 ms │ 100 │ 100 -│ │ 304.1 KB/s │ 290.6 KB/s │ 299.1 KB/s │ 299 KB/s │ │ +│ │ ╰─ commonmark_spec 615.1 ms │ 702 ms │ 623.1 ms │ 630.2 ms │ 100 │ 100 +│ │ 333.2 KB/s │ 292 KB/s │ 329 KB/s │ 325.3 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 17769736 │ 17769736 │ 17769736 │ 17769736 │ │ -│ │ 1.747 GB │ 1.747 GB │ 1.747 GB │ 1.747 GB │ │ +│ │ 16094165 │ 16094165 │ 16094165 │ 16094165 │ │ +│ │ 1.579 GB │ 1.579 GB │ 1.579 GB │ 1.579 GB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 17827789 │ 17827789 │ 17827789 │ 17827789 │ │ -│ │ 2.691 GB │ 2.691 GB │ 2.691 GB │ 2.691 GB │ │ +│ │ 16152218 │ 16152218 │ 16152218 │ 16152218 │ │ +│ │ 2.44 GB │ 2.44 GB │ 2.44 GB │ 2.44 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 479024 │ 479024 │ 479024 │ 479024 │ │ -│ │ 939.5 MB │ 939.5 MB │ 939.5 MB │ 939.5 MB │ │ +│ │ 398917 │ 398917 │ 398917 │ 398917 │ │ +│ │ 855.5 MB │ 855.5 MB │ 855.5 MB │ 855.5 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ├─ 4096 │ │ │ │ │ -│ │ ╰─ commonmark_spec 400.8 ms │ 458 ms │ 402.7 ms │ 409.6 ms │ 100 │ 100 -│ │ 511.4 KB/s │ 447.5 KB/s │ 509 KB/s │ 500.4 KB/s │ │ +│ │ ╰─ commonmark_spec 320.8 ms │ 329.6 ms │ 322.5 ms │ 323.9 ms │ 100 │ 100 +│ │ 639 KB/s │ 621.9 KB/s │ 635.6 KB/s │ 632.7 KB/s │ │ │ │ alloc: │ │ │ │ │ -│ │ 10615708 │ 10615708 │ 10615708 │ 10615708 │ │ -│ │ 1.053 GB │ 1.053 GB │ 1.053 GB │ 1.053 GB │ │ +│ │ 8490533 │ 8490533 │ 8490533 │ 8490533 │ │ +│ │ 843.2 MB │ 843.2 MB │ 843.2 MB │ 843.2 MB │ │ │ │ dealloc: │ │ │ │ │ -│ │ 10673761 │ 10673761 │ 10673761 │ 10673761 │ │ -│ │ 1.61 GB │ 1.61 GB │ 1.61 GB │ 1.61 GB │ │ +│ │ 8548586 │ 8548586 │ 8548586 │ 8548586 │ │ +│ │ 1.292 GB │ 1.292 GB │ 1.292 GB │ 1.292 GB │ │ │ │ grow: │ │ │ │ │ -│ │ 217813 │ 217813 │ 217813 │ 217813 │ │ -│ │ 551.7 MB │ 551.7 MB │ 551.7 MB │ 551.7 MB │ │ +│ │ 167879 │ 167879 │ 167879 │ 167879 │ │ +│ │ 444.2 MB │ 444.2 MB │ 444.2 MB │ 444.2 MB │ │ │ │ shrink: │ │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ │ ╰─ 32768 │ │ │ │ │ -│ ╰─ commonmark_spec 196.7 ms │ 221.7 ms │ 203 ms │ 203.9 ms │ 100 │ 100 -│ 1.041 MB/s │ 924.3 KB/s │ 1.009 MB/s │ 1.005 MB/s │ │ +│ ╰─ commonmark_spec 176.3 ms │ 183.5 ms │ 177.6 ms │ 179 ms │ 100 │ 100 +│ 1.162 MB/s │ 1.116 MB/s │ 1.154 MB/s │ 1.145 MB/s │ │ │ alloc: │ │ │ │ │ -│ 5204030 │ 5204030 │ 5204030 │ 5204030 │ │ -│ 522.4 MB │ 522.4 MB │ 522.4 MB │ 522.4 MB │ │ +│ 4579500 │ 4579500 │ 4579500 │ 4579500 │ │ +│ 460.2 MB │ 460.2 MB │ 460.2 MB │ 460.2 MB │ │ │ dealloc: │ │ │ │ │ -│ 5262083 │ 5262083 │ 5262083 │ 5262083 │ │ -│ 793.2 MB │ 793.2 MB │ 793.2 MB │ 793.2 MB │ │ +│ 4637553 │ 4637553 │ 4637553 │ 4637553 │ │ +│ 698.5 MB │ 698.5 MB │ 698.5 MB │ 698.5 MB │ │ │ grow: │ │ │ │ │ -│ 91803 │ 91803 │ 91803 │ 91803 │ │ -│ 265.8 MB │ 265.8 MB │ 265.8 MB │ 265.8 MB │ │ +│ 79534 │ 79534 │ 79534 │ 79534 │ │ +│ 233.4 MB │ 233.4 MB │ 233.4 MB │ 233.4 MB │ │ │ shrink: │ │ │ │ │ │ 13 │ 13 │ 13 │ 13 │ │ │ 94 B │ 94 B │ 94 B │ 94 B │ │ ╰─ text │ │ │ │ │ ├─ characters │ │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 213 ms │ 218.8 ms │ 214.4 ms │ 214.6 ms │ 100 │ 100 - │ │ │ 768 KB/s │ 747.6 KB/s │ 763 KB/s │ 762.2 KB/s │ │ + │ │ ├─ romeo_and_juliet 195.7 ms │ 197.7 ms │ 196.2 ms │ 196.3 ms │ 100 │ 100 + │ │ │ 835.9 KB/s │ 827.4 KB/s │ 833.7 KB/s │ 833.4 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 18663 │ 18939 │ 18663 │ 18665 │ │ - │ │ │ 33.41 MB │ 33.44 MB │ 33.41 MB │ 33.41 MB │ │ + │ │ │ 11187 │ 11463 │ 11187 │ 11189 │ │ + │ │ │ 32.32 MB │ 32.34 MB │ 32.32 MB │ 32.32 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 18664 │ 18860 │ 18664 │ 18665 │ │ - │ │ │ 123.2 MB │ 123.2 MB │ 123.2 MB │ 123.2 MB │ │ + │ │ │ 11188 │ 11384 │ 11188 │ 11189 │ │ + │ │ │ 121.8 MB │ 121.8 MB │ 121.8 MB │ 121.8 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 34910 │ 34949 │ 34910 │ 34910 │ │ - │ │ │ 89.66 MB │ 89.67 MB │ 89.66 MB │ 89.66 MB │ │ + │ │ │ 33447 │ 33486 │ 33447 │ 33447 │ │ + │ │ │ 89.36 MB │ 89.37 MB │ 89.36 MB │ 89.36 MB │ │ │ │ │ shrink: │ │ │ │ │ │ │ │ 0 │ 5 │ 0 │ 0.05 │ │ │ │ │ 0 B │ 2.34 KB │ 0 B │ 23.4 B │ │ - │ │ ╰─ room_with_a_view 161.4 ms │ 166.1 ms │ 161.8 ms │ 162.7 ms │ 100 │ 100 - │ │ 1.869 MB/s │ 1.817 MB/s │ 1.865 MB/s │ 1.855 MB/s │ │ + │ │ ╰─ room_with_a_view 159.8 ms │ 176.9 ms │ 170.7 ms │ 167.2 ms │ 100 │ 100 + │ │ 1.888 MB/s │ 1.705 MB/s │ 1.767 MB/s │ 1.805 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 30805 │ 30805 │ 30805 │ 30805 │ │ - │ │ 29.33 MB │ 29.33 MB │ 29.33 MB │ 29.33 MB │ │ + │ │ 18429 │ 18429 │ 18429 │ 18429 │ │ + │ │ 26.32 MB │ 26.32 MB │ 26.32 MB │ 26.32 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 30806 │ 30806 │ 30806 │ 30806 │ │ - │ │ 97.49 MB │ 97.49 MB │ 97.49 MB │ 97.49 MB │ │ + │ │ 18430 │ 18430 │ 18430 │ 18430 │ │ + │ │ 92.81 MB │ 92.81 MB │ 92.81 MB │ 92.81 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 56128 │ 56128 │ 56128 │ 56128 │ │ - │ │ 67.85 MB │ 67.85 MB │ 67.85 MB │ 67.85 MB │ │ + │ │ 48813 │ 48813 │ 48813 │ 48813 │ │ + │ │ 66.19 MB │ 66.19 MB │ 66.19 MB │ 66.19 MB │ │ │ ├─ 512 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 24.93 ms │ 25.83 ms │ 25.02 ms │ 25.04 ms │ 100 │ 100 - │ │ │ 6.561 MB/s │ 6.332 MB/s │ 6.538 MB/s │ 6.531 MB/s │ │ + │ │ ├─ romeo_and_juliet 24.92 ms │ 25.57 ms │ 25.03 ms │ 25.04 ms │ 100 │ 100 + │ │ │ 6.564 MB/s │ 6.397 MB/s │ 6.536 MB/s │ 6.532 MB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 1995 │ 1995 │ 1995 │ 1995 │ │ - │ │ │ 3.712 MB │ 3.712 MB │ 3.712 MB │ 3.712 MB │ │ + │ │ │ 1199 │ 1199 │ 1199 │ 1199 │ │ + │ │ │ 3.479 MB │ 3.479 MB │ 3.479 MB │ 3.479 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 1996 │ 1996 │ 1996 │ 1996 │ │ - │ │ │ 13.97 MB │ 13.97 MB │ 13.97 MB │ 13.97 MB │ │ + │ │ │ 1200 │ 1200 │ 1200 │ 1200 │ │ + │ │ │ 13.58 MB │ 13.58 MB │ 13.58 MB │ 13.58 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 4318 │ 4318 │ 4318 │ 4318 │ │ - │ │ │ 10.09 MB │ 10.09 MB │ 10.09 MB │ 10.09 MB │ │ - │ │ ╰─ room_with_a_view 25.41 ms │ 26.28 ms │ 25.51 ms │ 25.53 ms │ 100 │ 100 - │ │ 11.87 MB/s │ 11.48 MB/s │ 11.83 MB/s │ 11.82 MB/s │ │ + │ │ │ 3593 │ 3593 │ 3593 │ 3593 │ │ + │ │ │ 9.941 MB │ 9.941 MB │ 9.941 MB │ 9.941 MB │ │ + │ │ ╰─ room_with_a_view 26.07 ms │ 28.08 ms │ 26.21 ms │ 26.24 ms │ 100 │ 100 + │ │ 11.57 MB/s │ 10.75 MB/s │ 11.51 MB/s │ 11.5 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 3918 │ 3918 │ 3918 │ 3918 │ │ - │ │ 3.647 MB │ 3.647 MB │ 3.647 MB │ 3.647 MB │ │ + │ │ 2348 │ 2348 │ 2348 │ 2348 │ │ + │ │ 3.353 MB │ 3.353 MB │ 3.353 MB │ 3.353 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 3919 │ 3919 │ 3919 │ 3919 │ │ - │ │ 12.6 MB │ 12.6 MB │ 12.6 MB │ 12.6 MB │ │ + │ │ 2349 │ 2349 │ 2349 │ 2349 │ │ + │ │ 12.17 MB │ 12.17 MB │ 12.17 MB │ 12.17 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 7000 │ 7000 │ 7000 │ 7000 │ │ - │ │ 8.658 MB │ 8.658 MB │ 8.658 MB │ 8.658 MB │ │ + │ │ 6217 │ 6217 │ 6217 │ 6217 │ │ + │ │ 8.522 MB │ 8.522 MB │ 8.522 MB │ 8.522 MB │ │ │ ├─ 4096 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 3.721 ms │ 4.042 ms │ 3.763 ms │ 3.773 ms │ 100 │ 100 - │ │ │ 43.97 MB/s │ 40.47 MB/s │ 43.47 MB/s │ 43.35 MB/s │ │ + │ │ ├─ romeo_and_juliet 3.763 ms │ 3.954 ms │ 3.803 ms │ 3.809 ms │ 100 │ 100 + │ │ │ 43.48 MB/s │ 41.37 MB/s │ 43.02 MB/s │ 42.94 MB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 226 │ 226 │ 226 │ 226 │ │ - │ │ │ 483.5 KB │ 483.5 KB │ 483.5 KB │ 483.5 KB │ │ + │ │ │ 140 │ 140 │ 140 │ 140 │ │ + │ │ │ 406.1 KB │ 406.1 KB │ 406.1 KB │ 406.1 KB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 227 │ 227 │ 227 │ 227 │ │ - │ │ │ 2.136 MB │ 2.136 MB │ 2.136 MB │ 2.136 MB │ │ + │ │ │ 141 │ 141 │ 141 │ 141 │ │ + │ │ │ 1.989 MB │ 1.989 MB │ 1.989 MB │ 1.989 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 579 │ 579 │ 579 │ 579 │ │ - │ │ │ 1.489 MB │ 1.489 MB │ 1.489 MB │ 1.489 MB │ │ - │ │ ╰─ room_with_a_view 5.075 ms │ 5.259 ms │ 5.105 ms │ 5.113 ms │ 100 │ 100 - │ │ 59.48 MB/s │ 57.4 MB/s │ 59.13 MB/s │ 59.03 MB/s │ │ + │ │ │ 424 │ 424 │ 424 │ 424 │ │ + │ │ │ 1.42 MB │ 1.42 MB │ 1.42 MB │ 1.42 MB │ │ + │ │ ╰─ room_with_a_view 5.099 ms │ 5.402 ms │ 5.162 ms │ 5.16 ms │ 100 │ 100 + │ │ 59.2 MB/s │ 55.88 MB/s │ 58.48 MB/s │ 58.5 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 499 │ 499 │ 499 │ 499 │ │ - │ │ 543.1 KB │ 543.1 KB │ 543.1 KB │ 543.1 KB │ │ + │ │ 303 │ 303 │ 303 │ 303 │ │ + │ │ 430 KB │ 430 KB │ 430 KB │ 430 KB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 500 │ 500 │ 500 │ 500 │ │ - │ │ 2.095 MB │ 2.095 MB │ 2.095 MB │ 2.095 MB │ │ + │ │ 304 │ 304 │ 304 │ 304 │ │ + │ │ 1.886 MB │ 1.886 MB │ 1.886 MB │ 1.886 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 1068 │ 1068 │ 1068 │ 1068 │ │ - │ │ 1.25 MB │ 1.25 MB │ 1.25 MB │ 1.25 MB │ │ + │ │ 810 │ 810 │ 810 │ 810 │ │ + │ │ 1.154 MB │ 1.154 MB │ 1.154 MB │ 1.154 MB │ │ │ ╰─ 32768 │ │ │ │ │ - │ ├─ romeo_and_juliet 1.539 ms │ 1.677 ms │ 1.564 ms │ 1.572 ms │ 100 │ 100 - │ │ 106.3 MB/s │ 97.51 MB/s │ 104.5 MB/s │ 104 MB/s │ │ + │ ├─ romeo_and_juliet 1.6 ms │ 1.764 ms │ 1.624 ms │ 1.626 ms │ 100 │ 100 + │ │ 102.2 MB/s │ 92.73 MB/s │ 100.7 MB/s │ 100.5 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 46 │ 46 │ 46 │ 46 │ │ - │ │ 124.6 KB │ 124.6 KB │ 124.6 KB │ 124.6 KB │ │ + │ │ 32 │ 32 │ 32 │ 32 │ │ + │ │ 106.9 KB │ 106.9 KB │ 106.9 KB │ 106.9 KB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 47 │ 47 │ 47 │ 47 │ │ - │ │ 888.9 KB │ 888.9 KB │ 888.9 KB │ 888.9 KB │ │ + │ │ 33 │ 33 │ 33 │ 33 │ │ + │ │ 868 KB │ 868 KB │ 868 KB │ 868 KB │ │ │ │ grow: │ │ │ │ │ - │ │ 112 │ 112 │ 112 │ 112 │ │ - │ │ 600.6 KB │ 600.6 KB │ 600.6 KB │ 600.6 KB │ │ - │ ╰─ room_with_a_view 1.753 ms │ 1.852 ms │ 1.772 ms │ 1.783 ms │ 100 │ 100 - │ 172.2 MB/s │ 162.9 MB/s │ 170.3 MB/s │ 169.2 MB/s │ │ + │ │ 105 │ 105 │ 105 │ 105 │ │ + │ │ 597.5 KB │ 597.5 KB │ 597.5 KB │ 597.5 KB │ │ + │ ╰─ room_with_a_view 1.839 ms │ 2.015 ms │ 1.885 ms │ 1.878 ms │ 100 │ 100 + │ 164.1 MB/s │ 149.8 MB/s │ 160.1 MB/s │ 160.6 MB/s │ │ │ alloc: │ │ │ │ │ - │ 61 │ 61 │ 61 │ 61 │ │ - │ 60.8 KB │ 60.8 KB │ 60.8 KB │ 60.8 KB │ │ + │ 41 │ 41 │ 41 │ 41 │ │ + │ 55.95 KB │ 55.95 KB │ 55.95 KB │ 55.95 KB │ │ │ dealloc: │ │ │ │ │ - │ 62 │ 62 │ 62 │ 62 │ │ - │ 569.6 KB │ 569.6 KB │ 569.6 KB │ 569.6 KB │ │ + │ 42 │ 42 │ 42 │ 42 │ │ + │ 564.1 KB │ 564.1 KB │ 564.1 KB │ 564.1 KB │ │ │ grow: │ │ │ │ │ - │ 117 │ 117 │ 117 │ 117 │ │ - │ 206.9 KB │ 206.9 KB │ 206.9 KB │ 206.9 KB │ │ + │ 111 │ 111 │ 111 │ 111 │ │ + │ 206.2 KB │ 206.2 KB │ 206.2 KB │ 206.2 KB │ │ ├─ tiktoken │ │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 797.7 ms │ 842.3 ms │ 803.4 ms │ 806.3 ms │ 100 │ 100 - │ │ │ 205.1 KB/s │ 194.2 KB/s │ 203.6 KB/s │ 202.9 KB/s │ │ + │ │ ├─ romeo_and_juliet 798.7 ms │ 813 ms │ 806.1 ms │ 805.7 ms │ 100 │ 100 + │ │ │ 204.8 KB/s │ 201.2 KB/s │ 202.9 KB/s │ 203 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 8759688 │ 8759688 │ 8759688 │ 8759688 │ │ - │ │ │ 416.9 MB │ 416.9 MB │ 416.9 MB │ 416.9 MB │ │ + │ │ │ 8665256 │ 8665256 │ 8665256 │ 8665256 │ │ + │ │ │ 412.1 MB │ 412.1 MB │ 412.1 MB │ 412.1 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 9071196 │ 9071196 │ 9071196 │ 9071196 │ │ - │ │ │ 682.6 MB │ 682.6 MB │ 682.6 MB │ 682.6 MB │ │ + │ │ │ 8976764 │ 8976764 │ 8976764 │ 8976764 │ │ + │ │ │ 675.4 MB │ 675.4 MB │ 675.4 MB │ 675.4 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 1817472 │ 1817472 │ 1817472 │ 1817472 │ │ - │ │ │ 247.2 MB │ 247.2 MB │ 247.2 MB │ 247.2 MB │ │ - │ │ ╰─ room_with_a_view 1.086 s │ 1.146 s │ 1.103 s │ 1.103 s │ 100 │ 100 - │ │ 277.8 KB/s │ 263.3 KB/s │ 273.5 KB/s │ 273.6 KB/s │ │ + │ │ │ 1797014 │ 1797014 │ 1797014 │ 1797014 │ │ + │ │ │ 244.7 MB │ 244.7 MB │ 244.7 MB │ 244.7 MB │ │ + │ │ ╰─ room_with_a_view 1.056 s │ 1.109 s │ 1.066 s │ 1.068 s │ 100 │ 100 + │ │ 285.8 KB/s │ 272.1 KB/s │ 282.9 KB/s │ 282.4 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 11927663 │ 11927663 │ 11927663 │ 11927663 │ │ - │ │ 572.5 MB │ 572.5 MB │ 572.5 MB │ 572.5 MB │ │ + │ │ 11472402 │ 11472402 │ 11472402 │ 11472402 │ │ + │ │ 550.5 MB │ 550.5 MB │ 550.5 MB │ 550.5 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 12239171 │ 12239171 │ 12239171 │ 12239171 │ │ - │ │ 974.9 MB │ 974.9 MB │ 974.9 MB │ 974.9 MB │ │ + │ │ 11783910 │ 11783910 │ 11783910 │ 11783910 │ │ + │ │ 939 MB │ 939 MB │ 939 MB │ 939 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 2940302 │ 2940302 │ 2940302 │ 2940302 │ │ - │ │ 383.7 MB │ 383.7 MB │ 383.7 MB │ 383.7 MB │ │ + │ │ 2826842 │ 2826842 │ 2826842 │ 2826842 │ │ + │ │ 369.8 MB │ 369.8 MB │ 369.8 MB │ 369.8 MB │ │ │ ├─ 512 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 265.4 ms │ 269.1 ms │ 266.4 ms │ 266.6 ms │ 100 │ 100 - │ │ │ 616.4 KB/s │ 607.8 KB/s │ 614 KB/s │ 613.6 KB/s │ │ + │ │ ├─ romeo_and_juliet 262.9 ms │ 267.2 ms │ 264.8 ms │ 264.8 ms │ 100 │ 100 + │ │ │ 622.1 KB/s │ 612.1 KB/s │ 617.8 KB/s │ 617.7 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 2967096 │ 2967096 │ 2967096 │ 2967096 │ │ - │ │ │ 139.6 MB │ 139.6 MB │ 139.6 MB │ 139.6 MB │ │ + │ │ │ 2919900 │ 2919900 │ 2919900 │ 2919900 │ │ + │ │ │ 137.3 MB │ 137.3 MB │ 137.3 MB │ 137.3 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 3278604 │ 3278604 │ 3278604 │ 3278604 │ │ - │ │ │ 237.9 MB │ 237.9 MB │ 237.9 MB │ 237.9 MB │ │ + │ │ │ 3231408 │ 3231408 │ 3231408 │ 3231408 │ │ + │ │ │ 234.3 MB │ 234.3 MB │ 234.3 MB │ 234.3 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 615666 │ 615666 │ 615666 │ 615666 │ │ - │ │ │ 79.86 MB │ 79.86 MB │ 79.86 MB │ 79.86 MB │ │ - │ │ ╰─ room_with_a_view 464.5 ms │ 470.8 ms │ 467.7 ms │ 467.6 ms │ 100 │ 100 - │ │ 649.9 KB/s │ 641.1 KB/s │ 645.3 KB/s │ 645.6 KB/s │ │ + │ │ │ 605774 │ 605774 │ 605774 │ 605774 │ │ + │ │ │ 78.58 MB │ 78.58 MB │ 78.58 MB │ 78.58 MB │ │ + │ │ ╰─ room_with_a_view 441.8 ms │ 474.8 ms │ 444.3 ms │ 445 ms │ 100 │ 100 + │ │ 683.2 KB/s │ 635.7 KB/s │ 679.3 KB/s │ 678.3 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 5146321 │ 5146321 │ 5146321 │ 5146321 │ │ - │ │ 245 MB │ 245 MB │ 245 MB │ 245 MB │ │ + │ │ 4878812 │ 4878812 │ 4878812 │ 4878812 │ │ + │ │ 232.2 MB │ 232.2 MB │ 232.2 MB │ 232.2 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 5457829 │ 5457829 │ 5457829 │ 5457829 │ │ - │ │ 424.8 MB │ 424.8 MB │ 424.8 MB │ 424.8 MB │ │ + │ │ 5190320 │ 5190320 │ 5190320 │ 5190320 │ │ + │ │ 403.7 MB │ 403.7 MB │ 403.7 MB │ 403.7 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 1262554 │ 1262554 │ 1262554 │ 1262554 │ │ - │ │ 161.2 MB │ 161.2 MB │ 161.2 MB │ 161.2 MB │ │ + │ │ 1196748 │ 1196748 │ 1196748 │ 1196748 │ │ + │ │ 152.8 MB │ 152.8 MB │ 152.8 MB │ 152.8 MB │ │ │ ├─ 4096 │ │ │ │ │ - │ │ ├─ romeo_and_juliet 184.2 ms │ 187.1 ms │ 185.2 ms │ 185.2 ms │ 100 │ 100 - │ │ │ 888.1 KB/s │ 874.2 KB/s │ 883.2 KB/s │ 883 KB/s │ │ + │ │ ├─ romeo_and_juliet 179.6 ms │ 183.5 ms │ 180.5 ms │ 180.7 ms │ 100 │ 100 + │ │ │ 910.6 KB/s │ 891.5 KB/s │ 906.1 KB/s │ 905.2 KB/s │ │ │ │ │ alloc: │ │ │ │ │ - │ │ │ 2079255 │ 2079255 │ 2079255 │ 2079255 │ │ - │ │ │ 97.41 MB │ 97.41 MB │ 97.41 MB │ 97.41 MB │ │ + │ │ │ 2017987 │ 2017987 │ 2017987 │ 2017987 │ │ + │ │ │ 94.49 MB │ 94.49 MB │ 94.49 MB │ 94.49 MB │ │ │ │ │ dealloc: │ │ │ │ │ - │ │ │ 2390763 │ 2390763 │ 2390763 │ 2390763 │ │ - │ │ │ 170.8 MB │ 170.8 MB │ 170.8 MB │ 170.8 MB │ │ + │ │ │ 2329495 │ 2329495 │ 2329495 │ 2329495 │ │ + │ │ │ 166.3 MB │ 166.3 MB │ 166.3 MB │ 166.3 MB │ │ │ │ │ grow: │ │ │ │ │ - │ │ │ 431092 │ 431092 │ 431092 │ 431092 │ │ - │ │ │ 55 MB │ 55 MB │ 55 MB │ 55 MB │ │ - │ │ ╰─ room_with_a_view 339.3 ms │ 345.3 ms │ 341.7 ms │ 341.9 ms │ 100 │ 100 - │ │ 889.7 KB/s │ 874.3 KB/s │ 883.4 KB/s │ 882.8 KB/s │ │ + │ │ │ 418382 │ 418382 │ 418382 │ 418382 │ │ + │ │ │ 53.35 MB │ 53.35 MB │ 53.35 MB │ 53.35 MB │ │ + │ │ ╰─ room_with_a_view 322 ms │ 326.7 ms │ 323.5 ms │ 323.8 ms │ 100 │ 100 + │ │ 937.5 KB/s │ 924 KB/s │ 932.9 KB/s │ 932.2 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 3773135 │ 3773135 │ 3773135 │ 3773135 │ │ - │ │ 179.2 MB │ 179.2 MB │ 179.2 MB │ 179.2 MB │ │ + │ │ 3572528 │ 3572528 │ 3572528 │ 3572528 │ │ + │ │ 169.7 MB │ 169.7 MB │ 169.7 MB │ 169.7 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 4084643 │ 4084643 │ 4084643 │ 4084643 │ │ - │ │ 315 MB │ 315 MB │ 315 MB │ 315 MB │ │ + │ │ 3884036 │ 3884036 │ 3884036 │ 3884036 │ │ + │ │ 299.1 MB │ 299.1 MB │ 299.1 MB │ 299.1 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 923647 │ 923647 │ 923647 │ 923647 │ │ - │ │ 117.1 MB │ 117.1 MB │ 117.1 MB │ 117.1 MB │ │ + │ │ 874353 │ 874353 │ 874353 │ 874353 │ │ + │ │ 110.8 MB │ 110.8 MB │ 110.8 MB │ 110.8 MB │ │ │ ╰─ 32768 │ │ │ │ │ - │ ├─ romeo_and_juliet 82.98 ms │ 89.91 ms │ 83.42 ms │ 83.82 ms │ 100 │ 100 - │ │ 1.971 MB/s │ 1.819 MB/s │ 1.961 MB/s │ 1.951 MB/s │ │ + │ ├─ romeo_and_juliet 83.13 ms │ 85.75 ms │ 84.12 ms │ 84.1 ms │ 100 │ 100 + │ │ 1.968 MB/s │ 1.907 MB/s │ 1.944 MB/s │ 1.945 MB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 915519 │ 915519 │ 915519 │ 915519 │ │ - │ │ 42.89 MB │ 42.89 MB │ 42.89 MB │ 42.89 MB │ │ + │ │ 914586 │ 914586 │ 914586 │ 914586 │ │ + │ │ 42.85 MB │ 42.85 MB │ 42.85 MB │ 42.85 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 1227027 │ 1227027 │ 1227027 │ 1227027 │ │ - │ │ 85.77 MB │ 85.77 MB │ 85.77 MB │ 85.77 MB │ │ + │ │ 1226094 │ 1226094 │ 1226094 │ 1226094 │ │ + │ │ 85.7 MB │ 85.7 MB │ 85.7 MB │ 85.7 MB │ │ │ │ grow: │ │ │ │ │ - │ │ 187921 │ 187921 │ 187921 │ 187921 │ │ - │ │ 24.4 MB │ 24.4 MB │ 24.4 MB │ 24.4 MB │ │ - │ ╰─ room_with_a_view 125.2 ms │ 129.9 ms │ 126.6 ms │ 126.8 ms │ 100 │ 100 - │ 2.409 MB/s │ 2.323 MB/s │ 2.384 MB/s │ 2.38 MB/s │ │ + │ │ 187680 │ 187680 │ 187680 │ 187680 │ │ + │ │ 24.37 MB │ 24.37 MB │ 24.37 MB │ 24.37 MB │ │ + │ ╰─ room_with_a_view 113.8 ms │ 117 ms │ 114.7 ms │ 114.8 ms │ 100 │ 100 + │ 2.652 MB/s │ 2.578 MB/s │ 2.631 MB/s │ 2.628 MB/s │ │ │ alloc: │ │ │ │ │ - │ 1370555 │ 1370555 │ 1370555 │ 1370555 │ │ - │ 65.15 MB │ 65.15 MB │ 65.15 MB │ 65.15 MB │ │ + │ 1232390 │ 1232390 │ 1232390 │ 1232390 │ │ + │ 58.6 MB │ 58.6 MB │ 58.6 MB │ 58.6 MB │ │ │ dealloc: │ │ │ │ │ - │ 1682063 │ 1682063 │ 1682063 │ 1682063 │ │ - │ 126.3 MB │ 126.3 MB │ 126.3 MB │ 126.3 MB │ │ + │ 1543898 │ 1543898 │ 1543898 │ 1543898 │ │ + │ 115.4 MB │ 115.4 MB │ 115.4 MB │ 115.4 MB │ │ │ grow: │ │ │ │ │ - │ 334112 │ 334112 │ 334112 │ 334112 │ │ - │ 42.55 MB │ 42.55 MB │ 42.55 MB │ 42.55 MB │ │ + │ 300721 │ 300721 │ 300721 │ 300721 │ │ + │ 38.19 MB │ 38.19 MB │ 38.19 MB │ 38.19 MB │ │ ╰─ tokenizers │ │ │ │ │ ├─ 64 │ │ │ │ │ - │ ├─ romeo_and_juliet 1.409 s │ 1.519 s │ 1.426 s │ 1.43 s │ 100 │ 100 - │ │ 116.1 KB/s │ 107.6 KB/s │ 114.6 KB/s │ 114.3 KB/s │ │ + │ ├─ romeo_and_juliet 1.393 s │ 1.525 s │ 1.404 s │ 1.409 s │ 100 │ 100 + │ │ 117.4 KB/s │ 107.2 KB/s │ 116.5 KB/s │ 116 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 29414066 │ 29414066 │ 29414066 │ 29414066 │ │ - │ │ 3.628 GB │ 3.628 GB │ 3.628 GB │ 3.628 GB │ │ + │ │ 29070934 │ 29070934 │ 29070934 │ 29070934 │ │ + │ │ 3.591 GB │ 3.591 GB │ 3.591 GB │ 3.591 GB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 29472119 │ 29472119 │ 29472119 │ 29472119 │ │ - │ │ 5.252 GB │ 5.252 GB │ 5.252 GB │ 5.252 GB │ │ + │ │ 29128987 │ 29128987 │ 29128987 │ 29128987 │ │ + │ │ 5.201 GB │ 5.201 GB │ 5.201 GB │ 5.201 GB │ │ │ │ grow: │ │ │ │ │ - │ │ 474777 │ 474777 │ 474777 │ 474777 │ │ - │ │ 1.619 GB │ 1.619 GB │ 1.619 GB │ 1.619 GB │ │ - │ ╰─ room_with_a_view 1.888 s │ 2.313 s │ 1.954 s │ 1.977 s │ 100 │ 100 - │ 159.8 KB/s │ 130.4 KB/s │ 154.4 KB/s │ 152.6 KB/s │ │ + │ │ 442951 │ 442951 │ 442951 │ 442951 │ │ + │ │ 1.605 GB │ 1.605 GB │ 1.605 GB │ 1.605 GB │ │ + │ ╰─ room_with_a_view 1.94 s │ 2.063 s │ 1.952 s │ 1.953 s │ 100 │ 100 + │ 155.5 KB/s │ 146.3 KB/s │ 154.6 KB/s │ 154.5 KB/s │ │ │ alloc: │ │ │ │ │ - │ 40892031 │ 40892031 │ 40892031 │ 40892031 │ │ - │ 5.348 GB │ 5.348 GB │ 5.348 GB │ 5.348 GB │ │ + │ 39268765 │ 39268765 │ 39268765 │ 39268765 │ │ + │ 5.144 GB │ 5.144 GB │ 5.144 GB │ 5.144 GB │ │ │ dealloc: │ │ │ │ │ - │ 40950084 │ 40950084 │ 40950084 │ 40950084 │ │ - │ 7.688 GB │ 7.688 GB │ 7.688 GB │ 7.688 GB │ │ + │ 39326818 │ 39326818 │ 39326818 │ 39326818 │ │ + │ 7.402 GB │ 7.402 GB │ 7.402 GB │ 7.402 GB │ │ │ grow: │ │ │ │ │ - │ 758413 │ 758413 │ 758413 │ 758413 │ │ - │ 2.335 GB │ 2.335 GB │ 2.335 GB │ 2.335 GB │ │ + │ 663372 │ 663372 │ 663372 │ 663372 │ │ + │ 2.252 GB │ 2.252 GB │ 2.252 GB │ 2.252 GB │ │ ├─ 512 │ │ │ │ │ - │ ├─ romeo_and_juliet 433.3 ms │ 460.4 ms │ 445.9 ms │ 442 ms │ 100 │ 100 - │ │ 377.6 KB/s │ 355.3 KB/s │ 366.9 KB/s │ 370.1 KB/s │ │ + │ ├─ romeo_and_juliet 437.1 ms │ 456.4 ms │ 440.9 ms │ 440.9 ms │ 100 │ 100 + │ │ 374.2 KB/s │ 358.4 KB/s │ 371 KB/s │ 371 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 9473831 │ 9473831 │ 9473831 │ 9473831 │ │ - │ │ 1.177 GB │ 1.177 GB │ 1.177 GB │ 1.177 GB │ │ + │ │ 9319799 │ 9319799 │ 9319799 │ 9319799 │ │ + │ │ 1.158 GB │ 1.158 GB │ 1.158 GB │ 1.158 GB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 9531884 │ 9531884 │ 9531884 │ 9531884 │ │ - │ │ 1.703 GB │ 1.703 GB │ 1.703 GB │ 1.703 GB │ │ + │ │ 9377852 │ 9377852 │ 9377852 │ 9377852 │ │ + │ │ 1.676 GB │ 1.676 GB │ 1.676 GB │ 1.676 GB │ │ │ │ grow: │ │ │ │ │ - │ │ 101662 │ 101662 │ 101662 │ 101662 │ │ - │ │ 521.2 MB │ 521.2 MB │ 521.2 MB │ 521.2 MB │ │ - │ ╰─ room_with_a_view 818.3 ms │ 841.4 ms │ 824.1 ms │ 824.6 ms │ 100 │ 100 - │ 368.9 KB/s │ 358.8 KB/s │ 366.3 KB/s │ 366.1 KB/s │ │ + │ │ 98200 │ 98200 │ 98200 │ 98200 │ │ + │ │ 513.5 MB │ 513.5 MB │ 513.5 MB │ 513.5 MB │ │ + │ ╰─ room_with_a_view 798.8 ms │ 808.1 ms │ 802.7 ms │ 802.8 ms │ 100 │ 100 + │ 377.9 KB/s │ 373.5 KB/s │ 376.1 KB/s │ 376 KB/s │ │ │ alloc: │ │ │ │ │ - │ 17221349 │ 17221349 │ 17221349 │ 17221349 │ │ - │ 2.27 GB │ 2.27 GB │ 2.27 GB │ 2.27 GB │ │ + │ 16325652 │ 16325652 │ 16325652 │ 16325652 │ │ + │ 2.153 GB │ 2.153 GB │ 2.153 GB │ 2.153 GB │ │ │ dealloc: │ │ │ │ │ - │ 17279402 │ 17279402 │ 17279402 │ 17279402 │ │ - │ 3.271 GB │ 3.271 GB │ 3.271 GB │ 3.271 GB │ │ + │ 16383705 │ 16383705 │ 16383705 │ 16383705 │ │ + │ 3.104 GB │ 3.104 GB │ 3.104 GB │ 3.104 GB │ │ │ grow: │ │ │ │ │ - │ 178717 │ 178717 │ 178717 │ 178717 │ │ - │ 995.5 MB │ 995.5 MB │ 995.5 MB │ 995.5 MB │ │ + │ 163919 │ 163919 │ 163919 │ 163919 │ │ + │ 945.4 MB │ 945.4 MB │ 945.4 MB │ 945.4 MB │ │ ├─ 4096 │ │ │ │ │ - │ ├─ romeo_and_juliet 301.8 ms │ 307.4 ms │ 303.1 ms │ 303.2 ms │ 100 │ 100 - │ │ 542 KB/s │ 532.2 KB/s │ 539.8 KB/s │ 539.5 KB/s │ │ + │ ├─ romeo_and_juliet 298.5 ms │ 301.9 ms │ 299.6 ms │ 299.6 ms │ 100 │ 100 + │ │ 548.1 KB/s │ 541.9 KB/s │ 546 KB/s │ 546 KB/s │ │ │ │ alloc: │ │ │ │ │ - │ │ 6629037 │ 6629037 │ 6629037 │ 6629037 │ │ - │ │ 826.9 MB │ 826.9 MB │ 826.9 MB │ 826.9 MB │ │ + │ │ 6430819 │ 6430819 │ 6430819 │ 6430819 │ │ + │ │ 802.3 MB │ 802.3 MB │ 802.3 MB │ 802.3 MB │ │ │ │ dealloc: │ │ │ │ │ - │ │ 6687090 │ 6687090 │ 6687090 │ 6687090 │ │ - │ │ 1.199 GB │ 1.199 GB │ 1.199 GB │ 1.199 GB │ │ + │ │ 6488872 │ 6488872 │ 6488872 │ 6488872 │ │ + │ │ 1.164 GB │ 1.164 GB │ 1.164 GB │ 1.164 GB │ │ │ │ grow: │ │ │ │ │ - │ │ 35655 │ 35655 │ 35655 │ 35655 │ │ - │ │ 367.7 MB │ 367.7 MB │ 367.7 MB │ 367.7 MB │ │ - │ ╰─ room_with_a_view 583.7 ms │ 595.6 ms │ 588.2 ms │ 588.8 ms │ 100 │ 100 - │ 517.1 KB/s │ 506.8 KB/s │ 513.2 KB/s │ 512.6 KB/s │ │ + │ │ 33980 │ 33980 │ 33980 │ 33980 │ │ + │ │ 356.8 MB │ 356.8 MB │ 356.8 MB │ 356.8 MB │ │ + │ ╰─ room_with_a_view 565.2 ms │ 593.8 ms │ 568.9 ms │ 570 ms │ 100 │ 100 + │ 534.1 KB/s │ 508.3 KB/s │ 530.6 KB/s │ 529.6 KB/s │ │ │ alloc: │ │ │ │ │ - │ 12332767 │ 12332767 │ 12332767 │ 12332767 │ │ - │ 1.632 GB │ 1.632 GB │ 1.632 GB │ 1.632 GB │ │ + │ 11601946 │ 11601946 │ 11601946 │ 11601946 │ │ + │ 1.535 GB │ 1.535 GB │ 1.535 GB │ 1.535 GB │ │ │ dealloc: │ │ │ │ │ - │ 12390820 │ 12390820 │ 12390820 │ 12390820 │ │ - │ 2.353 GB │ 2.353 GB │ 2.353 GB │ 2.353 GB │ │ + │ 11659999 │ 11659999 │ 11659999 │ 11659999 │ │ + │ 2.215 GB │ 2.215 GB │ 2.215 GB │ 2.215 GB │ │ │ grow: │ │ │ │ │ - │ 61498 │ 61498 │ 61498 │ 61498 │ │ - │ 716.5 MB │ 716.5 MB │ 716.5 MB │ 716.5 MB │ │ + │ 55342 │ 55342 │ 55342 │ 55342 │ │ + │ 674.3 MB │ 674.3 MB │ 674.3 MB │ 674.3 MB │ │ ╰─ 32768 │ │ │ │ │ - ├─ romeo_and_juliet 132.7 ms │ 136.2 ms │ 133.1 ms │ 133.4 ms │ 100 │ 100 - │ 1.232 MB/s │ 1.2 MB/s │ 1.228 MB/s │ 1.226 MB/s │ │ + ├─ romeo_and_juliet 132.5 ms │ 135.2 ms │ 133 ms │ 133.1 ms │ 100 │ 100 + │ 1.234 MB/s │ 1.209 MB/s │ 1.23 MB/s │ 1.228 MB/s │ │ │ alloc: │ │ │ │ │ - │ 2848170 │ 2848170 │ 2848170 │ 2848170 │ │ - │ 354.1 MB │ 354.1 MB │ 354.1 MB │ 354.1 MB │ │ + │ 2844814 │ 2844814 │ 2844814 │ 2844814 │ │ + │ 353.7 MB │ 353.7 MB │ 353.7 MB │ 353.7 MB │ │ │ dealloc: │ │ │ │ │ - │ 2906223 │ 2906223 │ 2906223 │ 2906223 │ │ - │ 518 MB │ 518 MB │ 518 MB │ 518 MB │ │ + │ 2902867 │ 2902867 │ 2902867 │ 2902867 │ │ + │ 517.4 MB │ 517.4 MB │ 517.4 MB │ 517.4 MB │ │ │ grow: │ │ │ │ │ - │ 9817 │ 9817 │ 9817 │ 9817 │ │ - │ 159 MB │ 159 MB │ 159 MB │ 159 MB │ │ - ╰─ room_with_a_view 220.3 ms │ 232.3 ms │ 221 ms │ 221.5 ms │ 100 │ 100 - 1.369 MB/s │ 1.299 MB/s │ 1.365 MB/s │ 1.362 MB/s │ │ + │ 9579 │ 9579 │ 9579 │ 9579 │ │ + │ 158.8 MB │ 158.8 MB │ 158.8 MB │ 158.8 MB │ │ + ╰─ room_with_a_view 220.5 ms │ 223.2 ms │ 221.3 ms │ 221.4 ms │ 100 │ 100 + 1.369 MB/s │ 1.352 MB/s │ 1.363 MB/s │ 1.363 MB/s │ │ alloc: │ │ │ │ │ - 4491022 │ 4491022 │ 4491022 │ 4491022 │ │ - 594.4 MB │ 594.4 MB │ 594.4 MB │ 594.4 MB │ │ + 4489764 │ 4489764 │ 4489764 │ 4489764 │ │ + 594.3 MB │ 594.3 MB │ 594.3 MB │ 594.3 MB │ │ dealloc: │ │ │ │ │ - 4549075 │ 4549075 │ 4549075 │ 4549075 │ │ - 861 MB │ 861 MB │ 861 MB │ 861 MB │ │ + 4547817 │ 4547817 │ 4547817 │ 4547817 │ │ + 860.8 MB │ 860.8 MB │ 860.8 MB │ 860.8 MB │ │ grow: │ │ │ │ │ - 14385 │ 14385 │ 14385 │ 14385 │ │ + 14170 │ 14170 │ 14170 │ 14170 │ │ 261.5 MB │ 261.5 MB │ 261.5 MB │ 261.5 MB │ │ diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 4014fc3..6bd15bc 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -2,6 +2,19 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "ahash" +version = "0.8.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" +dependencies = [ + "cfg-if", + "getrandom", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.2" @@ -757,6 +770,7 @@ checksum = "e1fc403891a21bcfb7c37834ba66a547a8f402146eba7265b5a6d88059c9ff2f" name = "text-splitter" version = "0.7.0" dependencies = [ + "ahash", "auto_enums", "either", "itertools 0.12.1", @@ -944,3 +958,23 @@ name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "zerocopy" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.52", +] diff --git a/src/lib.rs b/src/lib.rs index 5e1db3d..da96d16 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6,6 +6,7 @@ use std::{ ops::{Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive}, }; +use ahash::AHashMap; use itertools::Itertools; mod characters; @@ -83,6 +84,62 @@ pub trait ChunkSizer { fn chunk_size(&self, chunk: &str, capacity: &impl ChunkCapacity) -> ChunkSize; } +/// A memoized chunk sizer that caches the size of chunks. +/// Very helpful when the same chunk is being validated multiple times, which +/// happens often, and can be expensive to compute, such as with tokenizers. +#[derive(Debug)] +struct MemoizedChunkSizer<'sizer, C, S> +where + C: ChunkCapacity, + S: ChunkSizer, +{ + /// Cache of chunk sizes per byte offset range + cache: AHashMap, ChunkSize>, + /// How big can each chunk be + chunk_capacity: C, + /// The sizer we are wrapping + sizer: &'sizer S, +} + +impl<'sizer, C, S> MemoizedChunkSizer<'sizer, C, S> +where + C: ChunkCapacity, + S: ChunkSizer, +{ + /// Wrap any chunk sizer for memoization + fn new(chunk_capacity: C, sizer: &'sizer S) -> Self { + Self { + cache: AHashMap::new(), + chunk_capacity, + sizer, + } + } + + /// Determine the size of a given chunk to use for validation, + /// returning a cached value if it exists, and storing the result if not. + fn chunk_size(&mut self, offset: usize, chunk: &str) -> ChunkSize { + *self + .cache + .entry(offset..(offset + chunk.len())) + .or_insert_with(|| self.sizer.chunk_size(chunk, &self.chunk_capacity)) + } + + /// Check if the chunk is within the capacity. Chunk should be trimmed if necessary beforehand. + fn check_capacity(&mut self, (offset, chunk): (usize, &str)) -> ChunkSize { + let mut chunk_size = self.chunk_size(offset, chunk); + if let Some(max_chunk_size_offset) = chunk_size.max_chunk_size_offset.as_mut() { + *max_chunk_size_offset += offset; + } + chunk_size + } + + /// Clear the cached values. Once we've moved the cursor, + /// we don't need to keep the old values around. + fn clear_cache(&mut self) { + self.cache.clear(); + } +} + /// Describes the largest valid chunk size(s) that can be generated. /// /// An `end` size is required, which is the maximum possible chunk size that @@ -210,18 +267,21 @@ trait SemanticSplit { /// Generate a new instance from a given text. fn new(text: &str) -> Self; - /// Retrieve ranges for each semantic level in the entire text - fn ranges(&self) -> impl Iterator)> + '_; + /// Retrieve ranges for each semantic level in the entire text that appear after a given offset + fn ranges_after_offset( + &self, + offset: usize, + ) -> impl Iterator)> + '_; /// Retrieve ranges for all sections of a given level after an offset - fn ranges_after_offset( + fn level_ranges_after_offset( &self, offset: usize, level: Self::Level, ) -> impl Iterator)> + '_ { - let first_item = self.ranges().find(|(l, _)| l == &level); - self.ranges() - .filter(move |(l, sep)| l >= &level && sep.start >= offset) + let first_item = self.ranges_after_offset(offset).find(|(l, _)| l == &level); + self.ranges_after_offset(offset) + .filter(move |(l, _)| l >= &level) .skip_while(move |(l, r)| { first_item.is_some_and(|(_, fir)| l > &level && r.contains(&fir.start)) }) @@ -230,11 +290,7 @@ trait SemanticSplit { /// Return a unique, sorted list of all line break levels present before the next max level, added /// to all of the base semantic levels, in order from smallest to largest fn levels_in_remaining_text(&self, offset: usize) -> impl Iterator + '_ { - let existing_levels = self - .ranges() - // Only start taking them from the offset - .filter(|(_, sep)| sep.start >= offset) - .map(|(l, _)| l); + let existing_levels = self.ranges_after_offset(offset).map(|(l, _)| l); Self::PERSISTENT_LEVELS .iter() @@ -263,6 +319,9 @@ trait SemanticSplit { let diff = chunk.len() - chunk.trim_start().len(); (offset + diff, chunk.trim()) } + + /// Allows the impl to clear out unnecessary data after the cursor has moved. + fn update_ranges(&mut self, _cursor: usize); } /// Returns chunks of text with their byte offsets as an iterator. @@ -273,12 +332,12 @@ where S: ChunkSizer, Sp: SemanticSplit, { - /// Size of the chunks to generate - chunk_capacity: C, /// How to validate chunk sizes - chunk_sizer: &'sizer S, + chunk_sizer: MemoizedChunkSizer<'sizer, C, S>, /// Current byte offset in the `text` cursor: usize, + /// Reusable container for next sections to avoid extra allocations + next_sections: Vec<(usize, &'text str)>, /// Splitter used for determining semantic levels. semantic_split: Sp, /// Original text to iterate over and generate chunks from @@ -298,8 +357,8 @@ where fn new(chunk_capacity: C, chunk_sizer: &'sizer S, text: &'text str, trim_chunks: bool) -> Self { Self { cursor: 0, - chunk_capacity, - chunk_sizer, + chunk_sizer: MemoizedChunkSizer::new(chunk_capacity, chunk_sizer), + next_sections: Vec::new(), semantic_split: Sp::new(text), text, trim_chunks, @@ -315,40 +374,31 @@ where } } - /// Is the given text within the chunk size? - fn check_capacity(&self, offset: usize, chunk: &str) -> ChunkSize { - let (offset, chunk) = self.trim_chunk(offset, chunk); - let mut chunk_size = self.chunk_sizer.chunk_size(chunk, &self.chunk_capacity); - if let Some(max_chunk_size_offset) = chunk_size.max_chunk_size_offset.as_mut() { - *max_chunk_size_offset += offset; - } - chunk_size - } - /// Generate the next chunk, applying trimming settings. /// Returns final byte offset and str. /// Will return `None` if given an invalid range. fn next_chunk(&mut self) -> Option<(usize, &'text str)> { + // Reset caches so we can reuse the memory allocation + self.chunk_sizer.clear_cache(); + self.semantic_split.update_ranges(self.cursor); + self.update_next_sections(); + let start = self.cursor; let mut end = self.cursor; let mut equals_found = false; - - let sections = self.next_sections()?.collect::>(); - let mut sizes = sections - .iter() - .map(|_| None) - .collect::>>(); let mut low = 0; - let mut high = sections.len().saturating_sub(1); + let mut high = self.next_sections.len().saturating_sub(1); let mut successful_index = None; + let mut successful_chunk_size = None; while low <= high { let mid = low + (high - low) / 2; - let (offset, str) = sections[mid]; + let (offset, str) = self.next_sections[mid]; let text_end = offset + str.len(); let chunk = self.text.get(start..text_end)?; - let chunk_size = self.check_capacity(start, chunk); - sizes[mid] = Some(chunk_size); + let chunk_size = self + .chunk_sizer + .check_capacity(self.trim_chunk(start, chunk)); match chunk_size.fits { Ordering::Less => { @@ -356,6 +406,7 @@ where if text_end > end { end = text_end; successful_index = Some(mid); + successful_chunk_size = Some(chunk_size); } } Ordering::Equal => { @@ -363,6 +414,7 @@ where if text_end < end || !equals_found { end = text_end; successful_index = Some(mid); + successful_chunk_size = Some(chunk_size); } equals_found = true; } @@ -371,6 +423,7 @@ where if mid == 0 && start == end { end = text_end; successful_index = Some(mid); + successful_chunk_size = Some(chunk_size); } } }; @@ -387,32 +440,26 @@ where } // Sometimes with tokenization, we can get a bigger chunk for the same amount of tokens. - if let Some((successful_index, chunk_size)) = - successful_index.and_then(|successful_index| { - Some((successful_index, sizes.get(successful_index)?.as_ref()?)) - }) + if let (Some(successful_index), Some(chunk_size)) = + (successful_index, successful_chunk_size) { - for (size, (offset, str)) in sizes.iter().zip(sections).skip(successful_index) { + let mut range = successful_index..self.next_sections.len(); + // We've already checked the successful index + range.next(); + + for index in range { + let (offset, str) = self.next_sections[index]; let text_end = offset + str.len(); - match size { - Some(size) if size.size <= chunk_size.size => { - if text_end > end { - end = text_end; - } - } - // We didn't tokenize this section yet - None => { - let chunk = self.text.get(start..text_end)?; - let size = self.check_capacity(start, chunk); - if size.size <= chunk_size.size { - if text_end > end { - end = text_end; - } - } else { - break; - } + let chunk = self.text.get(start..text_end)?; + let size = self + .chunk_sizer + .check_capacity(self.trim_chunk(start, chunk)); + if size.size <= chunk_size.size { + if text_end > end { + end = text_end; } - _ => break, + } else { + break; } } } @@ -428,21 +475,41 @@ where /// Find the ideal next sections, breaking it up until we find the largest chunk. /// Increasing length of chunk until we find biggest size to minimize validation time /// on huge chunks - fn next_sections(&'sizer self) -> Option + 'sizer> { - // Next levels to try. Will stop at max level. We check only levels in the next max level - // chunk so we don't bypass it if not all levels are present in every chunk. - let mut levels = self.semantic_split.levels_in_remaining_text(self.cursor); + fn update_next_sections(&mut self) { + // First thing, clear out the list, but reuse the allocated memory + self.next_sections.clear(); // Get starting level - let mut semantic_level = levels.next()?; + let mut levels_in_remaining_text = + self.semantic_split.levels_in_remaining_text(self.cursor); + let mut semantic_level = levels_in_remaining_text + .next() + .expect("Need at least one level to progress"); // If we aren't at the highest semantic level, stop iterating sections that go beyond the range of the next level. let mut max_encoded_offset = None; - for level in levels { - let (_, str) = self.semantic_chunks(level).next()?; - let chunk_size = self.check_capacity(self.cursor, str); - // If this no longer fits, we use the level we are at. Or if we already - // have the rest of the string - if chunk_size.fits.is_gt() || self.text.get(self.cursor..)? == str { + let remaining_text = self.text.get(self.cursor..).unwrap(); + + let levels_with_chunks = levels_in_remaining_text + .filter_map(|level| { + self.semantic_split + .semantic_chunks(self.cursor, remaining_text, level) + .next() + .map(|(_, str)| (level, str)) + }) + // We assume that larger levels are also longer. We can skip lower levels if going to a higher level would result in a shorter text + .coalesce(|(a_level, a_str), (b_level, b_str)| { + if a_str.len() >= b_str.len() { + Ok((b_level, b_str)) + } else { + Err(((a_level, a_str), (b_level, b_str))) + } + }); + for (level, str) in levels_with_chunks { + let chunk_size = self + .chunk_sizer + .check_capacity(self.trim_chunk(self.cursor, str)); + // If this no longer fits, we use the level we are at. + if chunk_size.fits.is_gt() { max_encoded_offset = chunk_size.max_chunk_size_offset; break; } @@ -450,27 +517,18 @@ where semantic_level = level; } - Some( - self.semantic_chunks(semantic_level) - // We don't want to return items at this level that go beyond the next highest semantic level, as that is most - // likely a meaningful breakpoint we want to preserve. We already know that the next highest doesn't fit anyway, - // so we should be safe to break once we reach it. - .take_while_inclusive(move |(offset, _)| { - max_encoded_offset.map_or(true, |max| offset <= &max) - }) - .filter(|(_, str)| !str.is_empty()), - ) - } - - fn semantic_chunks( - &'sizer self, - level: ::Level, - ) -> impl Iterator + 'sizer { - self.semantic_split.semantic_chunks( - self.cursor, - self.text.get(self.cursor..).unwrap(), - level, - ) + let sections = self + .semantic_split + .semantic_chunks(self.cursor, remaining_text, semantic_level) + // We don't want to return items at this level that go beyond the next highest semantic level, as that is most + // likely a meaningful breakpoint we want to preserve. We already know that the next highest doesn't fit anyway, + // so we should be safe to break once we reach it. + .take_while_inclusive(move |(offset, _)| { + max_encoded_offset.map_or(true, |max| offset <= &max) + }) + .filter(|(_, str)| !str.is_empty()); + + self.next_sections.extend(sections); } } @@ -501,6 +559,8 @@ where #[cfg(test)] mod tests { + use std::sync::atomic::{self, AtomicUsize}; + use super::*; #[test] @@ -622,4 +682,60 @@ mod tests { chunk_size ); } + + #[derive(Default)] + struct CountingSizer { + calls: AtomicUsize, + } + + impl ChunkSizer for CountingSizer { + // Return character version, but count calls + fn chunk_size(&self, chunk: &str, capacity: &impl ChunkCapacity) -> ChunkSize { + self.calls.fetch_add(1, atomic::Ordering::SeqCst); + Characters.chunk_size(chunk, capacity) + } + } + + #[test] + fn memoized_sizer_only_calculates_once_per_text() { + let sizer = CountingSizer::default(); + let mut memoized_sizer = MemoizedChunkSizer::new(10, &sizer); + let text = "1234567890"; + for _ in 0..10 { + memoized_sizer.chunk_size(0, text); + } + + assert_eq!(memoized_sizer.sizer.calls.load(atomic::Ordering::SeqCst), 1); + } + + #[test] + fn memoized_sizer_calculates_once_per_different_text() { + let sizer = CountingSizer::default(); + let mut memoized_sizer = MemoizedChunkSizer::new(10, &sizer); + let text = "1234567890"; + for i in 0..10 { + memoized_sizer.chunk_size(0, text.get(0..i).unwrap()); + } + + assert_eq!( + memoized_sizer.sizer.calls.load(atomic::Ordering::SeqCst), + 10 + ); + } + + #[test] + fn can_clear_cache_on_memoized_sizer() { + let sizer = CountingSizer::default(); + let mut memoized_sizer = MemoizedChunkSizer::new(10, &sizer); + let text = "1234567890"; + for _ in 0..10 { + memoized_sizer.chunk_size(0, text); + memoized_sizer.clear_cache(); + } + + assert_eq!( + memoized_sizer.sizer.calls.load(atomic::Ordering::SeqCst), + 10 + ); + } } diff --git a/src/markdown.rs b/src/markdown.rs index b3bec9b..27061e8 100644 --- a/src/markdown.rs +++ b/src/markdown.rs @@ -195,8 +195,6 @@ enum SemanticLevel { Sentence, /// Single line break, which isn't necessarily a new element in Markdown SoftBreak, - /// A text node within an element - Text, /// An inline element that is within a larger element such as a paragraph, but /// more specific than a sentence. InlineElement(SemanticSplitPosition), @@ -222,7 +220,6 @@ impl SemanticLevel { | SemanticLevel::Word | SemanticLevel::Sentence | SemanticLevel::SoftBreak - | SemanticLevel::Text | SemanticLevel::Block | SemanticLevel::MetaContainer | SemanticLevel::Rule @@ -240,7 +237,6 @@ impl SemanticLevel { | SemanticLevel::Word | SemanticLevel::Sentence | SemanticLevel::SoftBreak - | SemanticLevel::Text | SemanticLevel::InlineElement(_) | SemanticLevel::Rule | SemanticLevel::Heading(_) @@ -363,13 +359,13 @@ impl SemanticSplit for Markdown { | Tag::Image { .. } | Tag::TableCell, ) + | Event::Text(_) | Event::HardBreak | Event::Code(_) | Event::InlineHtml(_) => Some(( SemanticLevel::InlineElement(SemanticSplitPosition::Own), range, )), - Event::Text(_) => Some((SemanticLevel::Text, range)), Event::FootnoteReference(_) => Some(( SemanticLevel::InlineElement(SemanticSplitPosition::Prev), range, @@ -409,8 +405,13 @@ impl SemanticSplit for Markdown { Self { ranges } } - fn ranges(&self) -> impl Iterator)> + '_ { - self.ranges.iter() + fn ranges_after_offset( + &self, + offset: usize, + ) -> impl Iterator)> + '_ { + self.ranges + .iter() + .filter(move |(_, sep)| sep.start >= offset) } /// Split a given text into iterator over each semantic chunk @@ -437,8 +438,7 @@ impl SemanticSplit for Markdown { SemanticLevel::Sentence => text .split_sentence_bound_indices() .map(move |(i, str)| (offset + i, str)), - SemanticLevel::Text - | SemanticLevel::SoftBreak + SemanticLevel::SoftBreak | SemanticLevel::InlineElement(_) | SemanticLevel::ContainerBlock(_) | SemanticLevel::Block @@ -447,7 +447,7 @@ impl SemanticSplit for Markdown { | SemanticLevel::Rule | SemanticLevel::Metadata => Self::split_str_by_separator( text, - self.ranges_after_offset(offset, semantic_level) + self.level_ranges_after_offset(offset, semantic_level) .map(move |(l, sep)| (*l, sep.start - offset..sep.end - offset)), ) .map(move |(i, str)| (offset + i, str)), @@ -468,6 +468,11 @@ impl SemanticSplit for Markdown { (offset + diff, chunk.trim()) } } + + /// Clear out ranges we have moved past so future iterations are faster + fn update_ranges(&mut self, cursor: usize) { + self.ranges.retain(|(_, range)| range.start >= cursor); + } } #[cfg(test)] @@ -577,7 +582,7 @@ mod tests { #[test] fn chunk_by_words() { - let text = "The quick (\"brown\") fox can't jump 32.3 feet, right?"; + let text = "The quick brown fox can jump 32.3 feet, right?"; let chunks = TextChunks::<_, _, Markdown>::new(10, &Characters, text, false) .map(|(_, w)| w) @@ -585,11 +590,10 @@ mod tests { assert_eq!( vec![ "The quick ", - "(\"brown\") ", - "fox can't ", - "jump 32.3 ", - "feet, ", - "right?" + "brown fox ", + "can jump ", + "32.3 feet,", + " right?" ], chunks ); @@ -654,9 +658,12 @@ mod tests { assert_eq!( vec![ &(SemanticLevel::Block, 0..41), - &(SemanticLevel::Text, 0..41) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 0..41 + ) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -675,7 +682,10 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Next), 2..5 ), - &(SemanticLevel::Text, 6..21), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 6..21 + ), &( SemanticLevel::ContainerBlock(SemanticSplitPosition::Own), 22..42 @@ -684,9 +694,12 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Next), 24..27 ), - &(SemanticLevel::Text, 28..42), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 28..42 + ), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -697,13 +710,16 @@ mod tests { assert_eq!( vec![ &(SemanticLevel::Block, 0..12), - &(SemanticLevel::Text, 0..8), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 0..8 + ), &( SemanticLevel::InlineElement(SemanticSplitPosition::Prev), 8..12 ), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -719,7 +735,7 @@ mod tests { 0..6 ) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -734,9 +750,12 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 0..10 ), - &(SemanticLevel::Text, 1..9), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 1..9 + ), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -751,9 +770,12 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 0..12 ), - &(SemanticLevel::Text, 2..10), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 2..10 + ), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -768,9 +790,12 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 0..12 ), - &(SemanticLevel::Text, 2..10), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 2..10 + ), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -785,9 +810,12 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 0..11 ), - &(SemanticLevel::Text, 1..5), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 1..5 + ), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -802,9 +830,12 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 0..12 ), - &(SemanticLevel::Text, 2..6), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 2..6 + ), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -819,13 +850,16 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 0..6 ), - &(SemanticLevel::Text, 6..15), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 6..15 + ), &( SemanticLevel::InlineElement(SemanticSplitPosition::Own), 15..22 ), ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -841,7 +875,7 @@ mod tests { ), &(SemanticLevel::Block, 0..20) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -859,12 +893,18 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 1..11 ), - &(SemanticLevel::Text, 2..10), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 2..10 + ), &( SemanticLevel::InlineElement(SemanticSplitPosition::Own), 12..22 ), - &(SemanticLevel::Text, 13..21), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 13..21 + ), &( SemanticLevel::ContainerBlock(SemanticSplitPosition::Own), 38..57 @@ -873,14 +913,20 @@ mod tests { SemanticLevel::InlineElement(SemanticSplitPosition::Own), 39..47 ), - &(SemanticLevel::Text, 40..46), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 40..46 + ), &( SemanticLevel::InlineElement(SemanticSplitPosition::Own), 48..56 ), - &(SemanticLevel::Text, 49..55) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 49..55 + ) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -891,11 +937,17 @@ mod tests { assert_eq!( vec![ &(SemanticLevel::Block, 0..26), - &(SemanticLevel::Text, 0..9), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 0..9 + ), &(SemanticLevel::SoftBreak, 9..10), - &(SemanticLevel::Text, 10..26) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 10..26 + ) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -906,14 +958,20 @@ mod tests { assert_eq!( vec![ &(SemanticLevel::Block, 0..27), - &(SemanticLevel::Text, 0..9), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 0..9 + ), &( SemanticLevel::InlineElement(SemanticSplitPosition::Own), 9..11 ), - &(SemanticLevel::Text, 11..27) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 11..27 + ) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -925,9 +983,12 @@ mod tests { vec![ &(SemanticLevel::Block, 0..18), &(SemanticLevel::Block, 10..18), - &(SemanticLevel::Text, 10..18) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 10..18 + ) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -936,8 +997,14 @@ mod tests { let markdown = Markdown::new("```\ncode\n```"); assert_eq!( - vec![&(SemanticLevel::Block, 0..12), &(SemanticLevel::Text, 4..9)], - markdown.ranges().collect::>() + vec![ + &(SemanticLevel::Block, 0..12), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 4..9 + ) + ], + markdown.ranges_after_offset(0).collect::>() ); } @@ -952,9 +1019,12 @@ mod tests { 0..7 ), &(SemanticLevel::Block, 2..7), - &(SemanticLevel::Text, 2..7) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 2..7 + ) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -965,12 +1035,18 @@ mod tests { assert_eq!( vec![ &(SemanticLevel::Block, 0..10), - &(SemanticLevel::Text, 0..9), + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 0..9 + ), &(SemanticLevel::Rule, 11..15), &(SemanticLevel::Block, 16..27), - &(SemanticLevel::Text, 16..27) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 16..27 + ) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } @@ -992,9 +1068,12 @@ mod tests { assert_eq!( vec![ &(SemanticLevel::Heading(level), 0..9 + index), - &(SemanticLevel::Text, 2 + index..9 + index) + &( + SemanticLevel::InlineElement(SemanticSplitPosition::Own), + 2 + index..9 + index + ) ], - markdown.ranges().collect::>() + markdown.ranges_after_offset(0).collect::>() ); } } @@ -1006,7 +1085,7 @@ mod tests { assert_eq!( vec![&(SemanticLevel::MetaContainer, 0..42),], markdown - .ranges_after_offset(0, SemanticLevel::MetaContainer) + .level_ranges_after_offset(0, SemanticLevel::MetaContainer) .collect::>() ); } @@ -1027,7 +1106,10 @@ mod tests { ), ], markdown - .ranges_after_offset(0, SemanticLevel::ContainerBlock(SemanticSplitPosition::Own)) + .level_ranges_after_offset( + 0, + SemanticLevel::ContainerBlock(SemanticSplitPosition::Own) + ) .collect::>() ); } diff --git a/src/text.rs b/src/text.rs index a953afb..19bca47 100644 --- a/src/text.rs +++ b/src/text.rs @@ -254,8 +254,13 @@ impl SemanticSplit for LineBreaks { } /// Retrieve ranges for all sections of a given level after an offset - fn ranges(&self) -> impl Iterator)> + '_ { - self.line_breaks.iter() + fn ranges_after_offset( + &self, + offset: usize, + ) -> impl Iterator)> + '_ { + self.line_breaks + .iter() + .filter(move |(_, sep)| sep.start >= offset) } /// Split a given text into iterator over each semantic chunk @@ -284,12 +289,17 @@ impl SemanticSplit for LineBreaks { .map(move |(i, str)| (offset + i, str)), SemanticLevel::LineBreak(_) => Self::split_str_by_separator( text, - self.ranges_after_offset(offset, semantic_level) + self.level_ranges_after_offset(offset, semantic_level) .map(move |(_, sep)| sep.start - offset..sep.end - offset), ) .map(move |(i, str)| (offset + i, str)), } } + + /// Clear out ranges we have moved past so future iterations are faster + fn update_ranges(&mut self, cursor: usize) { + self.line_breaks.retain(|(_, range)| range.start >= cursor); + } } #[cfg(test)] diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md-2.snap index ba351de..9a85d53 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md-2.snap @@ -11,11 +11,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nThe point can be illustrated by comparing a sample of\n[AsciiDoc](https://asciidoc.org/) with\nan equivalent sample of Markdown. Here is a sample of\nAsciiDoc from the AsciiDoc manual:\n\n```\n1. List item one.\n+\nList item one continued with a second paragraph followed by an\nIndented block.\n+\n.................\n$ ls *.sh\n$ mv *.sh ~/tmp\n.................\n+\nList item continued with a third paragraph.\n\n2. List item two continued with an open block.\n+\n--\nThis paragraph is part of the preceding list item.\n\na. This list is nested and does not require explicit item\ncontinuation.\n+\nThis paragraph is part of the preceding list item.\n\nb. List item b.\n\nThis paragraph belongs to item two of the outer list.\n--\n```\n\nAnd here is the equivalent in Markdown:\n" - "```\n1. List item one.\n\n List item one continued with a second paragraph followed by an\n Indented block.\n\n $ ls *.sh\n $ mv *.sh ~/tmp\n\n List item continued with a third paragraph.\n\n2. List item two continued with an open block.\n\n This paragraph is part of the preceding list item.\n\n 1. This list is nested and does not require explicit item continuation.\n\n This paragraph is part of the preceding list item.\n\n 2. List item b.\n\n This paragraph belongs to item two of the outer list.\n```\n\nThe AsciiDoc version is, arguably, easier to write. You don't need\nto worry about indentation. But the Markdown version is much easier\nto read. The nesting of list items is apparent to the eye in the\nsource, not just in the processed document.\n\n" - "## Why is a spec needed?\n\nJohn Gruber's [canonical description of Markdown's\nsyntax](https://daringfireball.net/projects/markdown/syntax)\ndoes not specify the syntax unambiguously. Here are some examples of\nquestions it does not answer:\n\n" -- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See [this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)\n\n2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank\n" -- " lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```\n\n4. What is the exact rule for determining when list items get\n wrapped in `

` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)\n\n" -- "5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```\n\n7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```\n\n" -- "9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```\n\n11. Can list items include section headings? (`Markdown.pl` does not\n allow this, but does allow blockquotes to include headings.)\n\n ``` markdown\n - # Heading\n ```\n\n12. Can list items be empty?\n\n ``` markdown\n * a\n *\n * b\n ```\n\n13. Can link references be defined inside block quotes or list items?\n\n ``` markdown\n > Blockquote [foo].\n >\n > [foo]: /url\n ```\n\n" -- "14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n ``` markdown\n [foo]: /url1\n [foo]: /url2\n\n [foo][]\n ```\n\nIn the absence of a spec, early implementers consulted `Markdown.pl`\nto resolve these ambiguities. But `Markdown.pl` was quite buggy, and\ngave manifestly bad results in many cases, so it was not a\nsatisfactory replacement for a spec.\n\nBecause there is no unambiguous spec, implementations have diverged\nconsiderably. As a result, users are often surprised to find that\na document that renders one way on one system (say, a GitHub wiki)\nrenders differently on another (say, converting to docbook using\npandoc). To make matters worse, because nothing in Markdown counts\nas a \"syntax error,\" the divergence often isn't discovered right away.\n\n" +- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See [this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)\n\n" +- "2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank\n lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```\n\n" +- "4. What is the exact rule for determining when list items get\n wrapped in `

` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)\n\n5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```\n\n" +- "7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```\n\n9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```\n\n" +- "11. Can list items include section headings? (`Markdown.pl` does not\n allow this, but does allow blockquotes to include headings.)\n\n ``` markdown\n - # Heading\n ```\n\n12. Can list items be empty?\n\n ``` markdown\n * a\n *\n * b\n ```\n\n13. Can link references be defined inside block quotes or list items?\n\n ``` markdown\n > Blockquote [foo].\n >\n > [foo]: /url\n ```\n\n14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n ``` markdown\n [foo]: /url1\n [foo]: /url2\n\n [foo][]\n ```\n\n" +- "In the absence of a spec, early implementers consulted `Markdown.pl`\nto resolve these ambiguities. But `Markdown.pl` was quite buggy, and\ngave manifestly bad results in many cases, so it was not a\nsatisfactory replacement for a spec.\n\nBecause there is no unambiguous spec, implementations have diverged\nconsiderably. As a result, users are often surprised to find that\na document that renders one way on one system (say, a GitHub wiki)\nrenders differently on another (say, converting to docbook using\npandoc). To make matters worse, because nothing in Markdown counts\nas a \"syntax error,\" the divergence often isn't discovered right away.\n\n" - "## About this document\n\nThis document attempts to specify Markdown syntax unambiguously.\nIt contains many examples with side-by-side Markdown and\nHTML. These are intended to double as conformance tests. An\naccompanying script `spec_tests.py` can be used to run the tests\nagainst any Markdown program:\n\n python test/spec_tests.py --spec spec.txt --program PROGRAM\n\nSince this document describes how Markdown is to be parsed into\nan abstract syntax tree, it would have made sense to use an abstract\nrepresentation of the syntax tree instead of HTML. But HTML is capable\nof representing the structural distinctions we need to make, and the\nchoice of HTML for the tests makes it possible to run the tests against\nan implementation without writing an abstract syntax tree renderer.\n" - "\nNote that not every feature of the HTML samples is mandated by\nthe spec. For example, the spec says what counts as a link\ndestination, but it doesn't mandate that non-ASCII characters in\nthe URL be percent-encoded. To use the automatic tests,\nimplementers will need to provide a renderer that conforms to\nthe expectations of the spec examples (percent-encoding\nnon-ASCII characters in URLs). But a conforming implementation\ncan use a different renderer and may choose not to\npercent-encode non-ASCII characters in URLs.\n\nThis document is generated from a text file, `spec.txt`, written\nin Markdown with a small extension for the side-by-side tests.\nThe script `tools/makespec.py` can be used to convert `spec.txt` into\nHTML or CommonMark (which can then be converted into other formats).\n\nIn the examples, the `→` character is used to represent tabs.\n\n" - "# Preliminaries\n\n" @@ -118,10 +119,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n``` aa ```\nfoo\n.\n

aa\nfoo

\n````````````````````````````````\n\n\n[Info strings] for tilde code blocks can contain backticks and tildes:\n\n```````````````````````````````` example\n~~~ aa ``` ~~~\nfoo\n~~~\n.\n
foo\n
\n````````````````````````````````\n\n\nClosing code fences cannot have [info strings]:\n" - "\n```````````````````````````````` example\n```\n``` aaa\n```\n.\n
``` aaa\n
\n````````````````````````````````\n\n\n\n" - "## HTML blocks\n\nAn [HTML block](@) is a group of lines that is treated\nas raw HTML (and will not be escaped in HTML output).\n\nThere are seven kinds of [HTML block], which can be defined by their\nstart and end conditions. The block begins with a line that meets a\n[start condition](@) (after up to three optional spaces of indentation).\nIt ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks) containing the current HTML\nblock, if no line is encountered that meets the [end condition]. If\nthe first line meets both the [start condition] and the [end\ncondition], the block will contain just that line.\n\n" -- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n``, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n5. **Start condition:** line begins with the string\n" -- "``.\n\n" -- "6. **Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:** line is followed by a [blank line].\n\n7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line].\n\n" +- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n``, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n" +- "5. **Start condition:** line begins with the string\n``.\n\n" +- "6. " +- "**Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:** line is followed by a [blank line].\n\n7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line].\n\n" - "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\nblock** that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state.\n\nFor instance, `
` within an HTML block started by `` will not affect\nthe parser state; as the HTML block was started in by start condition 6, it\nwill end at any blank line. This can be surprising:\n"
 - "\n```````````````````````````````` example\n
\n
\n**Hello**,\n\n_world_.\n
\n
\n.\n
\n
\n**Hello**,\n

world.\n

\n
\n````````````````````````````````\n\nIn this case, the HTML block is terminated by the blank line — the `**Hello**`\ntext remains verbatim — and regular parsing resumes, with a paragraph,\nemphasised `world` and inline and block HTML following.\n" - "\nAll types of [HTML blocks] except type 7 may interrupt\na paragraph. Blocks of type 7 may not interrupt a paragraph.\n(This restriction is intended to prevent unwanted interpretation\nof long tags inside a wrapped paragraph as starting HTML blocks.)\n\nSome simple examples follow. Here are some basic HTML blocks\nof type 6:\n\n```````````````````````````````` example\n\n \n \n \n
\n hi\n
\n\nokay.\n.\n\n \n \n \n
\n hi\n
\n

okay.

\n````````````````````````````````" @@ -194,8 +196,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n>>> foo\n> bar\n>>baz\n.\n
\n
\n
\n

foo\nbar\nbaz

\n
\n
\n
\n````````````````````````````````\n\n\nWhen including an indented code block in a block quote,\nremember that the [block quote marker] includes\nboth the `>` and a following space of indentation. So *five spaces* are needed\nafter the `>`:\n" - "\n```````````````````````````````` example\n> code\n\n> not code\n.\n
\n
code\n
\n
\n
\n

not code

\n
\n````````````````````````````````\n\n\n\n" - "## List items\n\nA [list marker](@) is a\n[bullet list marker] or an [ordered list marker].\n\nA [bullet list marker](@)\nis a `-`, `+`, or `*` character.\n\nAn [ordered list marker](@)\nis a sequence of 1--9 arabic digits (`0-9`), followed by either a\n`.` character or a `)` character. (The reason for the length\nlimit is that with 10 digits we start seeing integer overflows\nin some browsers.)\n\nThe following rules define [list items]:\n\n" -- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n\n 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2. " -- "If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\nFor example, let *Ls* be the lines\n\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

A paragraph\nwith two lines.

\n
indented code\n
\n
\n

A block quote.

\n
\n````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:\n" +- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n" +- "\n 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\n" +- "For example, let *Ls* be the lines\n\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

A paragraph\nwith two lines.

\n
indented code\n
\n
\n

A block quote.

\n
\n````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:\n" - "\n```````````````````````````````` example\n1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
    \n
  1. \n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    \n
  2. \n
\n````````````````````````````````\n\n\nThe most important thing to notice is that the position of\nthe text after the list marker determines how much indentation\nis needed in subsequent blocks in the list item. If the list\nmarker takes up two spaces of indentation, and there are three spaces between\nthe list marker and the next character other than a space or tab, then blocks\nmust be indented five spaces in order to fall under the list\nitem.\n" - "\nHere are some examples showing how far content must be indented to be\nput under the list item:\n\n```````````````````````````````` example\n- one\n\n two\n.\n
    \n
  • one
  • \n
\n

two

\n````````````````````````````````\n\n\n```````````````````````````````` example\n- one\n\n two\n.\n
    \n
  • \n

    one

    \n

    two

    \n
  • \n
\n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n - one\n\n two\n.\n
    \n
  • one
  • \n
\n
 two\n
\n````````````````````````````````\n\n\n```````````````````````````````` example\n - one\n\n two\n.\n
    \n
  • \n

    one

    \n

    two

    \n
  • \n
\n````````````````````````````````" @@ -284,12 +287,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nA [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n" - " - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n - Both left and right-flanking:\n\n ```\n abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```\n\n" - "(The idea of distinguishing left-flanking and right-flanking\ndelimiter runs based on the character before and the character\nafter comes from Roopesh Chander's\n[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\" and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:\n\n" -- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n" -- " iff it is part of a [left-flanking delimiter run].\n\n6. A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n" +- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n" +- "5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n6. A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n" - "9. Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are\n multiples of 3.\n\n10. Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing\n delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3.\n\n" - "11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n12. A literal `_` character cannot occur at the beginning or end of\n `_`-delimited emphasis or `__`-delimited strong emphasis, unless it\n is backslash-escaped.\n\nWhere rules 1--12 above are compatible with multiple parsings,\nthe following principles resolve ambiguity:\n\n" -- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n" -- " `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`.\n\n" +- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n" +- "16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`.\n\n" - "These rules can be illustrated through a series of examples.\n\nRule 1:\n\n```````````````````````````````` example\n*foo bar*\n.\n

foo bar

\n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na * foo bar*\n.\n

a * foo bar*

\n````````````````````````````````" - "\n\n\nThis is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na*\"foo\"*\n.\n

a*"foo"*

\n````````````````````````````````\n\n\nUnicode nonbreaking spaces count as whitespace, too:\n\n```````````````````````````````` example\n* a *\n.\n

* a *

\n````````````````````````````````\n\n\nUnicode symbols count as punctuation, too:\n" - "\n```````````````````````````````` example\n*$*alpha.\n\n*£*bravo.\n\n*€*charlie.\n.\n

*$*alpha.

\n

*£*bravo.

\n

*€*charlie.

\n````````````````````````````````\n\n\nIntraword emphasis with `*` is permitted:\n\n```````````````````````````````` example\nfoo*bar*\n.\n

foobar

\n````````````````````````````````" @@ -491,5 +494,6 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "#### *look for link or image*\n\nStarting at the top of the delimiter stack, we look backwards\nthrough the stack for an opening `[` or `![` delimiter.\n\n" - "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`.\n\n- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image.\n\n + If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`.\n\n + If we do, then\n\n * We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)\n\n" - "#### *process emphasis*\n\nParameter `stack_bottom` sets a lower bound to how far we\ndescend in the [delimiter stack]. If it is NULL, we can\ngo all the way to the bottom. Otherwise, we stop before\nvisiting `stack_bottom`.\n\nLet `current_position` point to the element on the [delimiter stack]\njust above `stack_bottom` (or the first element if `stack_bottom`\nis NULL).\n\nWe keep track of the `openers_bottom` for each delimiter\ntype (`*`, `_`), indexed to the length of the closing delimiter run\n(modulo 3) and to whether the closing delimiter can also be an\nopener. Initialize this to `stack_bottom`.\n\nThen we repeat the following until we run out of potential\nclosers:\n\n" -- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. If the closing node is removed, reset\n" -- " `current_position` to the next element in the stack.\n\n- If none is found:\n\n + Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack.\n" +- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n" +- "- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:\n\n " +- "+ Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack.\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md.snap index cd84420..e595aa0 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@commonmark_spec.md.snap @@ -3,8 +3,9 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/commonmark_spec.md --- -- "---\ntitle: CommonMark Spec\n" -- "author: John MacFarlane\nversion: '0.31.2'\n" +- "---\n" +- "title: CommonMark Spec\nauthor: John MacFarlane\n" +- "version: '0.31.2'\n" - "date: '2024-01-28'\n" - "license: '[CC-BY-SA 4.0](https" - "://creativecommons.org/licenses/by-sa/" @@ -17,8 +18,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "help from Aaron Swartz) and released in 2004 in the form of a\n" - "[syntax description](https://daringfireball.net/projects" - "/markdown/syntax)\nand a Perl script (" -- "`Markdown.pl`" -- ") for converting Markdown to\n" +- "`Markdown.pl`) for converting Markdown to\n" - "HTML. In the next decade, dozens of implementations were\n" - "developed in many languages. Some extended the original\n" - "Markdown syntax with conventions for footnotes, tables, and\n" @@ -31,8 +31,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "What distinguishes Markdown from many other lightweight markup\n" - "syntaxes, which are often easier to write, is its readability.\n" - "As Gruber writes:\n\n" -- "> The overriding design goal for Markdown's formatting syntax is\n" -- "> to make it as readable as possible. The idea is that a\n> " +- "> " +- "The overriding design goal for Markdown's formatting syntax is\n> " +- "to make it as readable as possible. The idea is that a\n> " - "Markdown-formatted document should be publishable as-is, as\n> " - "plain text, without looking like it's been marked up with tags\n> " - "or formatting instructions.\n> (" @@ -43,7 +44,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "asciidoc.org/) with\n" - "an equivalent sample of Markdown. Here is a sample of\n" - "AsciiDoc from the AsciiDoc manual:\n\n" -- "```\n1. List item one.\n+\n" +- "```\n" +- "1. List item one.\n+\n" - "List item one continued with a second paragraph followed by an\nIndented block.\n" - "+\n" - "................" @@ -57,7 +59,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This paragraph is part of the preceding list item.\n\nb. List item b.\n\n" - "This paragraph belongs to item two of the outer list.\n--\n```\n\n" - "And here is the equivalent in Markdown:\n" -- "```\n1. List item one.\n\n" +- "```\n" +- "1. List item one.\n\n" - " List item one continued with a second paragraph followed by an\n Indented block.\n\n" - " $ ls *.sh\n" - " $ mv *.sh ~/tmp\n\n" @@ -73,99 +76,109 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "to read. The nesting of list items is apparent to the eye in the\n" - "source, not just in the processed document.\n\n" - "## Why is a spec needed?\n\n" -- "John Gruber's [canonical description of Markdown's\n" -- "syntax](https://daringfireball.net/projects/" -- "markdown/syntax)\n" +- "John Gruber's " +- "[canonical description of Markdown's\nsyntax" +- "](https://daringfireball.net/projects/markdown" +- "/syntax)\n" - "does not specify the syntax unambiguously. Here are some examples of\n" - "questions it does not answer:\n\n" -- "1. How much indentation is needed for a sublist? " +- "1. " +- "How much indentation is needed for a sublist? " - "The spec says that\n " - "continuation paragraphs need to be indented four spaces, but is\n " - "not fully explicit about sublists. It is natural to think that\n " -- "they, too, must be indented four spaces, but `" -- "Markdown.pl` does\n not require that. This is hardly a \"" -- "corner case,\" and divergences\n " +- "they, too, must be indented four spaces, but " +- "`Markdown.pl` does\n not require that. This is hardly a " +- "\"corner case,\" and divergences\n " - "between implementations on this issue often lead to surprises for\n " - "users in real documents. (See " -- "[this comment by John\n" -- " Gruber](https://web.archive.org/web" -- "/20170611172104/http://" +- "[this comment by John\n Gruber" +- "](https://web.archive.org/web/" +- "20170611172104/http://" - article.gmane.org/ - "gmane.text.markdown.general/1997).)\n\n" -- "2. Is a blank line needed before a block quote or heading?\n" -- " Most implementations do not require the blank line. However,\n " +- "2. " +- "Is a blank line needed before a block quote or heading?\n " +- "Most implementations do not require the blank line. However,\n " - "this can lead to unexpected results in hard-wrapped text, and\n " - "also to ambiguities in parsing (note that some implementations\n " - "put the heading inside the blockquote, while others do not).\n " - "(John Gruber has also spoken " -- "[in favor of requiring the blank\n" -- " lines](https://web.archive.org/web/" +- "[in favor of requiring the blank\n lines" +- "](https://web.archive.org/web/" - "20170611172104/http://" - article.gmane.org/ - "gmane.text.markdown.general/2146).)\n\n" -- "3. Is a blank line needed before an indented code block?\n" -- " (`Markdown.pl` requires it, but this is not mentioned " -- "in the\n documentation, and some implementations do not require it.)\n\n " +- "3. " +- "Is a blank line needed before an indented code block?\n (" +- "`Markdown.pl`" +- " requires it, but this is not mentioned in the\n " +- "documentation, and some implementations do not require it.)\n\n " - "``` markdown\n paragraph\n code?\n ```\n\n" -- "4. What is the exact rule for determining when list items get\n" -- " wrapped in `

`" -- " tags? Can a list be partially \"loose\" and partially\n \"tight\"" -- "? What should we do with a list like this?\n\n " +- "4. " +- "What is the exact rule for determining when list items get\n wrapped in " +- "`

` tags? Can a list be partially \"loose\"" +- " and partially\n \"tight\"? What should we do with a list like this?\n\n " - "``` markdown\n 1. one\n\n 2. two\n 3. three\n" - " ```\n\n Or this?\n" - "\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n" - " ```\n\n " -- "(There are some relevant comments by John Gruber\n" -- " [here](https://web.archive.org/web/" +- "(There are some relevant comments by John Gruber\n " +- "[here](https://web.archive.org/web/" - "20170611172104/http://" - article.gmane.org/ - "gmane.text.markdown.general/2554).)\n\n" -- "5. Can list markers be indented? " +- "5. " +- "Can list markers be indented? " - "Can ordered list markers be right-aligned?\n\n " -- "``` markdown\n 8. item 1\n 9. item 2\n" -- " 10. item 2a\n ```\n\n" -- "6. Is this one list with a thematic break in its second item,\n" -- " or two lists separated by a thematic break?\n\n " +- "``` markdown\n 8. item 1\n 9. item 2\n " +- "10. item 2a\n ```\n\n" +- "6. " +- "Is this one list with a thematic break in its second item,\n " +- "or two lists separated by a thematic break?\n\n " - "``` markdown\n * a\n * * * * *\n * b\n" - " ```\n\n" -- "7. When list markers change from numbers to bullets, do we have\n" -- " two lists or one? (The Markdown syntax description suggests two,\n " +- "7. " +- "When list markers change from numbers to bullets, do we have\n " +- "two lists or one? (The Markdown syntax description suggests two,\n " - "but the perl scripts and many other implementations produce one.)\n\n " -- "``` markdown\n 1. fee\n 2. fie\n - foe\n" -- " - fum\n ```\n\n" -- "8. What are the precedence rules for the markers of inline structure?\n" -- " For example, is the following a valid link, or does the code span\n " +- "``` markdown\n 1. fee\n 2. fie\n - foe\n " +- "- fum\n ```\n\n" +- "8. " +- "What are the precedence rules for the markers of inline structure?\n " +- "For example, is the following a valid link, or does the code span\n " - "take precedence ?\n\n " -- "``` markdown\n" -- " [a backtick (`)](/url) and [another " +- "``` markdown\n " +- "[a backtick (`)](/url) and [another " - "backtick (`)](/url).\n ```\n\n" -- "9. What are the precedence rules for markers of emphasis and strong\n emphasis? " -- "For example, how should the following be parsed?\n\n " +- "9. " +- "What are the precedence rules for markers of emphasis and strong\n " +- "emphasis? For example, how should the following be parsed?\n\n " - "``` markdown\n *foo *bar* baz*\n" - " ```\n\n" -- "10. What are the precedence rules between block-level and inline-level\n" -- " structure? For example, how should the following be parsed?\n\n " -- "``` markdown\n" -- " - `a long code span can contain a hyphen like this\n " +- "10. " +- "What are the precedence rules between block-level and inline-level\n " +- "structure? For example, how should the following be parsed?\n\n " +- "``` markdown\n " +- "- `a long code span can contain a hyphen like this\n " - " - and it can screw things up`\n ```\n\n" -- "11. Can list items include section headings? " -- "(`Markdown.pl`" +- "11. " +- "Can list items include section headings? (`Markdown.pl`" - " does not\n " - "allow this, but does allow blockquotes to include headings.)\n\n " - "``` markdown\n - # Heading\n ```\n\n" -- "12. Can list items be empty?\n\n ``` markdown\n * a\n *\n" -- " * b\n ```\n\n" -- "13. Can link references be defined inside block quotes or list items?\n\n" -- " ``` markdown\n > Blockquote [foo].\n >\n" -- " > [foo]: /url\n ```\n\n" -- "14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n" -- " ``` markdown\n [foo]: /url1\n" -- " [foo]: /url2\n\n [foo][]\n" +- "12. Can list items be empty?\n" +- "\n ``` markdown\n * a\n *\n * b\n ```\n\n" +- "13. Can link references be defined inside block quotes or list items?\n" +- "\n ``` markdown\n > Blockquote [foo].\n >\n " +- "> [foo]: /url\n ```\n\n" +- "14. If there are multiple definitions for the same reference, which takes\n precedence?\n" +- "\n ``` markdown\n [foo]: /url1\n " +- "[foo]: /url2\n\n [foo][]\n" - " ```\n\n" -- "In the absence of a spec, early implementers consulted `" -- "Markdown.pl`\nto resolve these ambiguities. But " -- "`Markdown.pl`" -- " was quite buggy, and\n" +- "In the absence of a spec, early implementers consulted " +- "`Markdown.pl`\nto resolve these ambiguities. But " +- "`Markdown.pl` was quite buggy, and\n" - "gave manifestly bad results in many cases, so it was not a\n" - "satisfactory replacement for a spec.\n\n" - "Because there is no unambiguous spec, implementations have diverged\n" @@ -174,14 +187,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "GitHub wiki)\n" - "renders differently on another (say, converting to docbook using\n" - "pandoc). To make matters worse, because nothing in Markdown counts\n" -- "as a \"syntax error,\" the divergence often isn't discovered right " -- "away.\n\n" +- "as a \"syntax error,\" the divergence often isn'" +- "t discovered right away.\n\n" - "## About this document\n\n" - "This document attempts to specify Markdown syntax unambiguously.\n" - "It contains many examples with side-by-side Markdown and\n" - "HTML. These are intended to double as conformance tests. An\n" -- "accompanying script `spec_tests.py` can be used to run " -- "the tests\nagainst any Markdown program:\n\n " +- "accompanying script `spec_tests.py`" +- " can be used to run the tests\nagainst any Markdown program:\n\n " - "python test/spec_tests.py --spec " - "spec.txt --program PROGRAM\n\n" - "Since this document describes how Markdown is to be parsed into\n" @@ -200,68 +213,68 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "non-ASCII characters in URLs). " - "But a conforming implementation\ncan use a different renderer and may choose not to\n" - "percent-encode non-ASCII characters in URLs.\n\n" -- "This document is generated from a text file, `spec.txt" -- "`, written\n" +- "This document is generated from a text file, " +- "`spec.txt`, written\n" - "in Markdown with a small extension for the side-by-side tests.\n" - "The script `tools/makespec.py` can be used to convert " -- "`spec.txt`" -- " into\n" +- "`spec.txt` into\n" - HTML or CommonMark (which can then be converted into other formats - ").\n\n" - "In the examples, the `→` character is used to represent tabs.\n\n" - "# Preliminaries\n\n" - "## Characters and lines\n\n" - "Any sequence of [characters] is a valid CommonMark\ndocument.\n" -- "\nA [character](@) is a Unicode code point. " -- "Although some\ncode points (for example, combining accents) do not correspond to\n" +- "\nA [character](@)" +- " is a Unicode code point. Although some\n" +- "code points (for example, combining accents) do not correspond to\n" - "characters in an intuitive sense, all code points count as characters\n" - "for purposes of this spec.\n\n" -- "This spec does not specify an encoding; it thinks of lines as composed\n" -- "of [characters]" -- " rather than bytes. A conforming parser may be limited\n" +- "This spec does not specify an encoding; it thinks of lines as composed\nof " +- "[characters] rather than bytes. A conforming parser may be limited\n" - "to a certain encoding.\n\n" - "A [line](@) is a sequence of zero or more [characters" - "]\nother than line feed (`U+000A`" -- ") or carriage return (`U+000D`" -- "),\nfollowed by a [line ending] or by the end of file.\n\n" -- "A [line ending](@) is a line feed (`U+" -- "000A`), a carriage return\n(`U+000D`" +- ") or carriage return (`U+000D`),\nfollowed by a " +- "[line ending] or by the end of file.\n\n" +- "A [line ending](@) is a line feed (" +- "`U+000A`), a carriage return\n(" +- "`U+000D`" - ") not followed by a line feed, or a carriage return and a\n" - "following line feed.\n\n" -- "A line containing no characters, or a line containing only spaces\n" -- "(`U+0020`) or tabs (`U+" -- "0009`), is called a [blank line](@).\n\n" +- "A line containing no characters, or a line containing only spaces\n(" +- "`U+0020`) or tabs (" +- "`U+0009`), is called a " +- "[blank line](@).\n\n" - "The following definitions of character classes will be used in this spec:\n" -- "\n" -- "A [Unicode whitespace character](@) is a character in " -- "the Unicode `Zs` general\ncategory, or a tab (" -- "`U+0009`), line feed (`U+000A" -- "`), form feed (`U+000C`" -- "), or\ncarriage return (`U+000D`).\n\n" -- "[Unicode whitespace](@) is a sequence of one or " -- "more\n[Unicode whitespace characters].\n\n" +- "\nA [Unicode whitespace character](@)" +- " is a character in the Unicode `Zs` general\n" +- "category, or a tab (`U+0009`" +- "), line feed (`U+000A`), form feed (" +- "`U+000C`), or\ncarriage return (" +- "`U+000D`).\n\n" +- "[Unicode whitespace](@)" +- " is a sequence of one or more\n[Unicode whitespace characters].\n\n" - "A [tab](@) is `U+0009`.\n" - "\nA [space](@) is `U+0020`.\n" -- "\n" -- "An [ASCII control character](@) is a character between `" -- "U+0000–1F` (both\nincluding) or " +- "\nAn [ASCII control character](@) is a character between " +- "`U+0000–1F` (both\nincluding) or " - "`U+007F`.\n\n" -- "An [ASCII punctuation character](@)\n" -- "is `!`, `\"`, `#`, `$`" -- ", `%`, `&`, `'`, `(`" -- ", `)`,\n`*`, `+`, `,`" -- ", `-`, `.`, `/`" +- "An [ASCII punctuation character](@)\nis " +- "`!`, `\"`, `#`, `$`, " +- "`%`, `&`, `'`, `(`, " +- "`)`,\n`*`, `+`, `,`, " +- "`-`, `.`, `/`" - " (U+0021–2F), \n`:`, " - "`;`, `<`, `=`, `>`, " - "`?`, `@`" - " (U+003A–0040),\n`[`, " - "`\\`, `]`, `^`, `_`, " - "`` ` `` (U+005B–0060), \n" -- "`{`, `|`, `}`, or `~` " -- "(U+007B–007E).\n\n" -- "A [Unicode punctuation character](@) is a " -- "character in the Unicode `P`\n(puncuation) or " -- "`S` (symbol) general categories.\n\n" +- "`{`, `|`, `}`, or `~`" +- " (U+007B–007E).\n\n" +- "A [Unicode punctuation character](@)" +- " is a character in the Unicode `P`\n" +- "(puncuation) or `S` (symbol) general categories.\n\n" - "## Tabs\n\n" - "Tabs in lines are not expanded to [spaces]. However,\n" - "in contexts where spaces help to define block structure,\n" @@ -316,12 +329,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````\n\n" - "Normally the `>` that begins a block quote may be followed\n" - "optionally by a space, which is not considered part of the\n" -- "content. In the following case `>`" -- " is followed by a tab,\n" +- "content. In the following case `>` is followed by a tab,\n" - "which is treated as if it were expanded into three spaces.\n" - "Since one of these spaces is considered part of the\ndelimiter, " -- "`foo`" -- " is considered to be indented six spaces\n" +- "`foo` is considered to be indented six spaces\n" - "inside the block quote context, so we get an indented\n" - "code block starting with two spaces.\n\n" - "````````````````" @@ -369,8 +380,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## Insecure characters\n\n" -- "For security reasons, the Unicode character `U+0000` must " -- "be replaced\nwith the REPLACEMENT CHARACTER (" +- "For security reasons, the Unicode character `U+0000`" +- " must be replaced\n" +- with the REPLACEMENT CHARACTER ( - "`U+FFFD`).\n\n\n" - "## Backslash escapes\n\n" - "Any ASCII punctuation character may be backslash-escaped:\n" @@ -513,18 +525,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Valid HTML entity references and numeric character references\n" - "can be used in place of the corresponding Unicode character,\n" - "with the following exceptions:\n\n" -- "- Entity and character references are not recognized in code\n" -- " blocks and code spans.\n\n" -- "- Entity and character references cannot stand in place of\n" -- " special characters that define structural elements in\n CommonMark. For example, although " +- "- " +- "Entity and character references are not recognized in code\n blocks and code spans.\n\n" +- "- " +- "Entity and character references cannot stand in place of\n " +- "special characters that define structural elements in\n CommonMark. For example, although " - "`*` can be used\n in place of a literal " - "`*` character, `*` cannot replace\n `*`" - " in emphasis delimiters, bullet list markers, or thematic\n breaks.\n\n" - "Conforming CommonMark parsers need not store information about\n" - "whether a particular character was represented in the source\n" - "using a Unicode character or an entity reference.\n\n" -- "[Entity references](@) consist of `&` + any " -- "of the valid\nHTML5 entity names + `;`. The\ndocument " +- "[Entity references](@) consist of `&`" +- " + any of the valid\nHTML5 entity names + `;`" +- ". The\ndocument " - "\nis used as an authoritative source for the valid entity\n" - "references and their corresponding code points.\n\n" @@ -539,16 +553,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "∲ ≧̸

\n" - "````````````````" - "````````````````\n\n\n" -- "[Decimal numeric character\nreferences](@)\n" -- "consist of `&#` + a string of 1--7 " -- "arabic digits + `;`" -- ". A\nnumeric character reference is parsed as the corresponding\n" +- "[Decimal numeric character\nreferences](@)\nconsist of " +- "`&#` + a string of 1--" +- "7 arabic digits + `;`. A\n" +- "numeric character reference is parsed as the corresponding\n" - "Unicode character. " - "Invalid Unicode code points will be replaced by\n" -- "the REPLACEMENT CHARACTER (`U+" -- "FFFD`). For security reasons,\nthe code point " -- "`U+0000` will also be replaced by `U+" -- "FFFD`.\n\n" +- the REPLACEMENT CHARACTER ( +- "`U+FFFD`). For security reasons,\nthe code point " +- "`U+0000` will also be replaced by " +- "`U+FFFD`.\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -556,10 +570,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "0;\n.\n

# Ӓ Ϡ �

\n" - "````````````````" - "````````````````\n\n\n" -- "[Hexadecimal numeric character\n" -- "references](@) consist of `&#` +\neither `X`" -- " or `x` + a string of 1-6 hexadecimal digits " -- "+ `;`" +- "[Hexadecimal numeric character\nreferences](@) consist of " +- "`&#` +\neither `X` or `x`" +- " + a string of 1-6 hexadecimal digits + `;`" - ".\nThey too are parsed as the corresponding Unicode character (this\n" - time specified with a hexadecimal numeral instead of decimal) - ".\n\n" @@ -588,8 +601,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "Although HTML5 does accept some entity references\n" -- "without a trailing semicolon (such as `©`), these " -- "are not\nrecognized here, because it makes the grammar too ambiguous:\n\n" +- "without a trailing semicolon (such as `©`" +- "), these are not\nrecognized here, because it makes the grammar too ambiguous:\n\n" - "````````````````" - "```````````````` " - "example\n©\n.\n

&copy

\n" @@ -707,18 +720,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "# Blocks and inlines\n\n" -- "We can think of a document as a sequence of\n" -- "[blocks](@)" +- "We can think of a document as a sequence of\n[blocks](@)" - "---structural elements like paragraphs, block\n" - "quotations, lists, headings, rules, and code blocks. " - "Some blocks (like\n" - "block quotes and list items) contain other blocks; others (like\n" -- "headings and paragraphs) contain [inline](@) content-" -- "--text,\n" +- "headings and paragraphs) contain [inline](@) content" +- "---text,\n" - "links, emphasized text, images, code spans, and so on.\n\n" - "## Precedence\n\n" -- "Indicators of block structure always take precedence over indicators\nof inline structure. " -- "So, for example, the following is a list with\n" +- "Indicators of block structure always take precedence over indicators\n" +- "of inline structure. So, for example, the following is a list with\n" - "two items, not a list with one item containing a code span:\n\n" - "````````````````" - "```````````````` " @@ -738,8 +750,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "one block element does not affect the inline parsing of any other.\n\n" - "## Container blocks and leaf blocks\n\n" - "We can divide blocks into two types:\n" -- "[container blocks](#container-blocks)" -- ",\nwhich can contain other blocks, and " +- "[container blocks](#container-blocks),\n" +- "which can contain other blocks, and " - "[leaf blocks](#leaf-blocks),\nwhich cannot.\n\n" - "# Leaf blocks\n\n" - "This section describes the different kinds of leaf block that make up a\n" @@ -747,8 +759,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Thematic breaks\n\n" - "A line consisting of optionally up to three spaces of indentation, followed " - "by a\nsequence of three or more matching `-`, `_`" -- ", or `*`" -- " characters, each followed\n" +- ", or `*` characters, each followed\n" - "optionally by any number of spaces or tabs, forms a\n" - "[thematic break](@).\n\n" - "````````````````" @@ -919,8 +930,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## ATX headings\n\n" - "An [ATX heading](@)\n" - "consists of a string of characters, parsed as inline content, between an\n" -- "opening sequence of 1--6 unescaped `#` characters and an " -- "optional\nclosing sequence of any number of unescaped `#`" +- "opening sequence of 1--6 unescaped `#`" +- " characters and an optional\nclosing sequence of any number of unescaped `#`" - " characters.\nThe opening sequence of `#`" - " characters must be followed by spaces or tabs, or\n" - "by the end of line. The optional closing sequence of `#`" @@ -956,8 +967,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

####### foo

\n" - "````````````````" - "````````````````\n\n\n" -- "At least one space or tab is required between the `#` characters and " -- "the\nheading's contents, unless the heading is empty. Note that many\n" +- "At least one space or tab is required between the `#`" +- " characters and the\nheading'" +- "s contents, unless the heading is empty. Note that many\n" - "implementations currently do not require the space. However, the\n" - "space was required by the\n" - "[original ATX implementation](http://" @@ -1128,13 +1140,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "thematic breaks],\n[list item][list items], or [" - "HTML block][HTML blocks].\n\n" - "A [setext heading underline](@) is a sequence of\n" -- "`=` characters or a sequence of `-` characters, with no more " -- "than 3\n" +- "`=` characters or a sequence of `-`" +- " characters, with no more than 3\n" - "spaces of indentation and any number of trailing spaces or tabs.\n\n" -- "The heading is a level 1 heading if `=` characters are used in\n" -- "the [setext heading underline], and a level 2 heading if " -- "`-`" -- "\ncharacters are used. The contents of the heading are the result\n" +- "The heading is a level 1 heading if `=` characters are used in\nthe " +- "[setext heading underline], and a level 2 heading if " +- "`-`\ncharacters are used. The contents of the heading are the result\n" - "of parsing the preceding lines of text as CommonMark inline\ncontent.\n\n" - "In general, a setext heading need not be preceded or followed by a\n" - "blank line. However, it cannot interrupt a paragraph, so when a\n" @@ -1267,8 +1278,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

\n" - "````````````````" - "````````````````\n\n\n" -- "The setext heading underline cannot be a [lazy continuation\n" -- "line] in a list item or block quote:\n\n" +- "The setext heading underline cannot be a [lazy continuation\nline]" +- " in a list item or block quote:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -1355,8 +1366,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
\n" - "````````````````" - "````````````````\n\n\n" -- "If you want a heading with `> foo` as its literal text, " -- "you can\nuse backslash escapes:\n\n" +- "If you want a heading with `> foo`" +- " as its literal text, you can\nuse backslash escapes:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -1417,10 +1428,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## Indented code blocks\n\n" -- "An [indented code block](@) is composed of one or " -- "more\n[indented chunks] separated by blank lines.\nAn " -- "[indented chunk](@) is a sequence of non-blank " -- "lines,\n" +- "An [indented code block](@)" +- " is composed of one or more\n[indented chunks]" +- " separated by blank lines.\nAn [indented chunk](@)" +- " is a sequence of non-blank lines,\n" - "each preceded by four or more spaces of indentation. " - "The contents of the code\nblock are the literal contents of the lines, including trailing\n" - "[line endings], minus four spaces of indentation.\n" @@ -1544,11 +1555,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````\n\n\n\n" - "## Fenced code blocks\n\n" - "A [code fence](@) is a sequence\n" -- "of at least three consecutive backtick characters (`` ` ``" -- ") or\ntildes (`~`" +- "of at least three consecutive backtick characters (`` ` ``) or\n" +- "tildes (`~`" - "). (Tildes and backticks cannot be mixed.)\nA " -- "[fenced code block](@)" -- "\n" +- "[fenced code block](@)\n" - "begins with a code fence, preceded by up to three spaces of indentation" - ".\n\n" - "The line with the opening code fence may optionally contain some text\n" @@ -1559,8 +1569,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "characters. (The reason for this restriction is that otherwise\n" - "some inline code would be incorrectly interpreted as the\n" - "beginning of a fenced code block.)\n\n" -- "The content of the code block consists of all subsequent lines, until\n" -- "a closing [code fence] of the same type as the code block\n" +- "The content of the code block consists of all subsequent lines, until\na closing [" +- "code fence] of the same type as the code block\n" - "began with (backticks or tildes), and with at least as " - "many backticks\n" - "or tildes as the opening code fence. If the leading code fence is\n" @@ -1843,8 +1853,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo

\n" - "````````````````" - "````````````````\n\n\n" -- "[Info strings] for tilde code blocks can contain backticks and " -- "tildes:\n\n" +- "[Info strings]" +- " for tilde code blocks can contain backticks and tildes:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -1865,69 +1875,74 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "## HTML blocks\n\n" -- "An [HTML block](@) is a group of lines that " -- "is treated\n" +- "An [HTML block](@)" +- " is a group of lines that is treated\n" - as raw HTML (and will not be escaped in HTML output - ").\n\n" -- "There are seven kinds of [HTML block], which can be defined " -- "by their\nstart and end conditions. The block begins with a line that meets a\n" -- "[start condition](@) (after up to three optional spaces of " -- "indentation).\nIt ends with the first subsequent line that meets a matching\n" -- "[end condition](@), or the last line of the document, " -- "or the last line of\nthe [container block](#container-blocks)" -- " containing the current HTML\nblock, if no line is encountered that meets the " -- "[end condition]. If\nthe first line meets both the [start condition]" -- " and the [end\ncondition], the block will contain just that line.\n\n" +- "There are seven kinds of [HTML block]" +- ", which can be defined by their\n" +- "start and end conditions. The block begins with a line that meets a\n" +- "[start condition](@)" +- " (after up to three optional spaces of indentation).\n" +- "It ends with the first subsequent line that meets a matching\n" +- "[end condition](@)" +- ", or the last line of the document, or the last line of\nthe " +- "[container block](#container-blocks) containing the current HTML\n" +- "block, if no line is encountered that meets the [end condition]. If\n" +- "the first line meets both the [start condition] and the [end\ncondition]" +- ", the block will contain just that line.\n\n" - "1. " - "**Start condition:** line begins with the string ``" +- "``" - ", or the end of the line.\\\n**End condition:**" - " line contains an end tag\n`
`, " -- "``, ``, or `" -- "` (case-insensitive; it\n" +- "``, ``, or " +- "`` (case-insensitive; it\n" - "need not match the start tag).\n\n" - "2. " - "**Start condition:** line begins with the string ``.\n\n" - "3. " -- "**Start condition:** line begins with the string ``.\n\n" - "4. " - "**Start condition:** line begins with the string ``.\n\n" -- "5. **Start condition:** line begins with the string\n" +- "5. " +- "**Start condition:** line begins with the string\n" - "``" - ".\n\n" - "6. " - "**Start condition:** line begins with the string `<` or " -- "``, or\nthe string `/>`.\\\n" - "**End condition:** line is followed by a [blank line]" @@ -1935,13 +1950,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "7. " - "**Start condition:** line begins with a complete [open tag]\n" - "(with any [tag name] other than `pre`, `script`" -- ",\n`style`, or `textarea`" -- ") or a complete [closing tag],\n" +- ",\n`style`, or `textarea`) or a complete [" +- "closing tag],\n" - "followed by zero or more spaces and tabs, followed by the end of the " -- "line.\\\n**End condition:**" -- " line is followed by a [blank line].\n\n" -- "HTML blocks continue until they are closed by their appropriate\n" -- "[end condition], or the last line of the document or other " +- "line.\\\n**End condition:** line is followed by a [" +- "blank line].\n\n" +- "HTML blocks continue until they are closed by their appropriate\n[end condition]" +- ", or the last line of the document or other " - "[container\nblock](#container-blocks)" - ". This means any HTML " - "**within an HTML\nblock**" @@ -1949,8 +1964,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "be ignored by the parser and passed through as-is, without changing\n" - "the parser's state.\n\n" - "For instance, `
` within an HTML block started by "
-- "``"
-- " will not affect\n"
+- "`
` will not affect\n" - "the parser state; as the HTML block was started in by start " - "condition 6, it\nwill end at any blank line. This can be surprising:\n\n" - "````````````````" @@ -1967,8 +1981,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n" - "In this case, the HTML block is terminated by the blank line — " -- "the `**Hello**`" -- "\n" +- "the `**Hello**`\n" - "text remains verbatim — and regular parsing resumes, with a paragraph" - ",\nemphasised `world` and inline and block HTML following.\n\n" - "All types of [HTML blocks] except type 7 may interrupt\n" @@ -2102,8 +2115,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "int x = 33;\n```\n" - "````````````````" - "````````````````\n\n\n" -- "To start an [HTML block] with a tag that is *not" -- "* in the\n" +- "To start an [HTML block] with a tag that is " +- "*not* in the\n" - "list of block-level tags in (6), you must put the tag " - "by\nitself on the first line (and it must be complete):\n\n" - "````````````````" @@ -2141,8 +2154,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`` tag is a nice example. We can surround content with\n" - "``" - " tags in three different ways. In this case, we get a raw\n" -- "HTML block, because the `` tag is on a " -- "line by itself:\n\n" +- "HTML block, because the ``" +- " tag is on a line by itself:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -2150,9 +2163,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo*\n\n" - "````````````````" - "````````````````\n\n\n" -- "In this case, we get a raw HTML block that just includes\n" -- "the ``" -- " tag (because it ends with the following blank\n" +- "In this case, we get a raw HTML block that just includes\nthe " +- "`` tag (because it ends with the following blank\n" - "line). So the contents get interpreted as CommonMark:\n\n" - "````````````````" - "```````````````` " @@ -2162,8 +2174,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n" -- "Finally, in this case, the `` tags are interpreted\n" -- "as [raw HTML] *inside*" +- "Finally, in this case, the `` tags are interpreted\nas " +- "[raw HTML] *inside*" - " the CommonMark paragraph. (Because\n" - "the tag is not on a line by itself, we get inline " - "HTML\nrather than an [HTML block].)\n\n" @@ -2175,10 +2187,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "/del>

\n" - "````````````````" - "````````````````\n\n\n" -- "HTML tags designed to contain literal content\n" -- "(`pre`, `script`, `style`, `" -- "textarea`), comments, processing instructions,\n" -- "and declarations are treated somewhat differently.\n" +- "HTML tags designed to contain literal content\n(`pre`, " +- "`script`, `style`, `textarea`" +- "), comments, processing instructions,\nand declarations are treated somewhat differently.\n" - "Instead of ending at the first blank line, these blocks\n" - "end at the first line containing a corresponding end tag.\n" - "As a result, these blocks can contain blank lines:\n\n" @@ -2283,8 +2294,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

baz

\n" - "````````````````" - "````````````````\n\n\n" -- "Note that anything on the last line after the\n" -- "end tag will be included in the [HTML block]:\n\n" +- "Note that anything on the last line after the\nend tag will be included in the " +- "[HTML block]:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -2355,8 +2366,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n" -- "An HTML block of types 1--6 can interrupt a paragraph, " -- "and need not be\npreceded by a blank line.\n\n" +- An HTML block of types 1-- +- "6 can interrupt a paragraph, and need not be\npreceded by a blank line.\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -2387,22 +2398,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````\n\n\n" - "This rule differs from John Gruber's original Markdown syntax\n" - "specification, which says:\n\n" -- "> The only restrictions are that block-level HTML elements —\n" -- "> e.g. `
`, `
" -- "`, `
`, `

`, etc. " -- "— must be separated from\n> " +- "> " +- "The only restrictions are that block-level HTML elements —\n> " +- "e.g. `

`, `
`" +- ", `
`, `

`" +- ", etc. — must be separated from\n> " - "surrounding content by blank lines, and the start and end tags of the\n> " - "block should not be indented with spaces or tabs.\n\n" -- "In some ways Gruber's rule is more restrictive than the one " -- "given\nhere:\n\n" +- "In some ways Gruber'" +- "s rule is more restrictive than the one given\nhere:\n\n" - "- It requires that an HTML block be preceded by a blank line.\n" - "- It does not allow the start tag to be indented.\n" -- "- It requires a matching end tag, which it also does not allow to\n" -- " be indented.\n\n" -- "Most Markdown implementations (including some of Gruber's own) " -- "do not\nrespect all of these restrictions.\n\n" -- "There is one respect, however, in which Gruber's rule is " -- "more liberal\nthan the one given here, since it allows blank lines to occur inside\n" +- "- It requires a matching end tag, which it also does not allow to\n " +- "be indented.\n\n" +- "Most Markdown implementations (including some of Gruber'" +- "s own) do not\nrespect all of these restrictions.\n\n" +- "There is one respect, however, in which Gruber'" +- "s rule is more liberal\n" +- "than the one given here, since it allows blank lines to occur inside\n" - "an HTML block. " - "There are two reasons for disallowing them here.\n" - "First, it removes the need to parse balanced tags, which is\n" @@ -2431,14 +2444,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````\n\n\n" - "Some Markdown implementations have adopted a convention of\n" - "interpreting content inside tags as text if the open tag has\nthe attribute " -- "`markdown=1`" -- ". The rule given above seems a simpler and\n" +- "`markdown=1`. The rule given above seems a simpler and\n" - "more elegant way of achieving the same expressive power, which is also\n" - "much simpler to parse.\n\n" - "The main potential drawback is that one can no longer paste HTML\n" - "blocks into Markdown documents with 100% reliability. However,\n" -- "*in most cases*" -- " this will work fine, because the blank lines in\n" +- "*in most cases* this will work fine, because the blank lines in\n" - "HTML are usually followed by HTML block tags. For example:\n\n" - "````````````````" - "```````````````` " @@ -2450,8 +2461,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "There are problems, however, if the inner tags are indented\n" -- "*and*" -- " separated by spaces, as then they will be interpreted as\n" +- "*and* separated by spaces, as then they will be interpreted as\n" - "an indented code block:\n\n" - "````````````````" - "```````````````` " @@ -2465,15 +2475,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

\n" - "````````````````" - "````````````````\n\n\n" -- "Fortunately, blank lines are usually not necessary and can be\ndeleted. " -- "The exception is inside `
`"
-- " tags, but as described\n[above][HTML blocks]"
+- "Fortunately, blank lines are usually not necessary and can be\n"
+- "deleted.  The exception is inside `
` tags, but as described\n"
+- "[above][HTML blocks]"
 - ", raw HTML blocks starting with `
`\n*can*"
 - " contain blank lines.\n\n"
 - "## Link reference definitions\n\n"
-- "A [link reference definition](@)\n"
-- "consists of a [link label], optionally preceded by up to three spaces "
-- "of\nindentation, followed\nby a colon (`:`"
+- "A [link reference definition](@)\nconsists of a [link label]"
+- ", optionally preceded by up to three spaces of\nindentation, followed\n"
+- "by a colon (`:`"
 - "), optional spaces or tabs (including up to one\n[line ending]"
 - "), a [link destination],\n"
 - "optional spaces or tabs (including up to one\n[line ending]"
@@ -2773,8 +2783,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
 - "````````````````\n\n\n"
 - "## Paragraphs\n\n"
 - "A sequence of non-blank lines that cannot be interpreted as other\n"
-- "kinds of blocks forms a [paragraph](@)"
-- ".\nThe contents of the paragraph are the result of parsing the\nparagraph'"
+- "kinds of blocks forms a [paragraph](@).\n"
+- "The contents of the paragraph are the result of parsing the\nparagraph'"
 - "s raw content as inlines.  The paragraph's raw content\n"
 - "is formed by concatenating the lines and removing initial and final\n"
 - "spaces or tabs.\n\nA simple example with two paragraphs:\n"
@@ -2862,51 +2872,52 @@ input_file: tests/inputs/markdown/commonmark_spec.md
 - "````````````````"
 - "````````````````\n\n\n\n"
 - "# Container blocks\n\n"
-- "A [container block](#container-blocks) is a block that has "
-- "other\nblocks as its contents.  There are two basic kinds of container blocks:\n["
+- "A [container block](#container-blocks)"
+- " is a block that has other\n"
+- "blocks as its contents.  There are two basic kinds of container blocks:\n["
 - "block quotes] and [list items].\n[Lists]"
 - " are meta-containers for [list items].\n\n"
 - "We define the syntax for container blocks recursively.  The general\n"
 - "form of the definition is:\n\n"
-- "> If X is a sequence of blocks, then the result of\n"
-- "> transforming X in such-and-such a way is a container of type "
-- "Y\n> with these blocks as its content.\n\n"
+- "> "
+- "If X is a sequence of blocks, then the result of\n> "
+- transforming X in such-and-such a way is a container of type Y
+- "\n> with these blocks as its content.\n\n"
 - "So, we explain what counts as a block quote or list item by explaining\n"
-- how these can be *generated*
-- " from their contents. This should suffice\n"
-- "to define the syntax, although it does not give a recipe for *parsing"
-- "*\nthese constructions.  (A recipe is provided below in the section entitled\n"
+- "how these can be *generated* from their contents. This should suffice\n"
+- "to define the syntax, although it does not give a recipe for "
+- "*parsing*\n"
+- "these constructions.  (A recipe is provided below in the section entitled\n"
 - "[A parsing strategy](#appendix-a-parsing"
 - "-strategy).)\n\n"
 - "## Block quotes\n\n"
 - "A [block quote marker](@),\n"
 - "optionally preceded by up to three spaces of indentation,\n"
-- "consists of (a) the character `>`"
-- " together with a following space of\n"
-- "indentation, or (b) a single character `>` not followed "
-- "by a space of\nindentation.\n\n"
+- "consists of (a) the character `>` together with a following space of\n"
+- "indentation, or (b) a single character `>`"
+- " not followed by a space of\nindentation.\n\n"
 - "The following rules define [block quotes]:\n\n"
-- 1.  **Basic case.
-- "**  If a string of lines *Ls* constitute a sequence\n    of blocks "
-- "*Bs*"
+- "1.  "
+- "**Basic case.**  If a string of lines *Ls*"
+- " constitute a sequence\n    of blocks *Bs*"
 - ", then the result of prepending a [block quote\n    marker]"
 - " to the beginning of each line in *Ls*\n    is a "
 - "[block quote](#block-quotes) containing *Bs*.\n\n"
-- 2.  **Laziness.
-- "**  If a string of lines *Ls* constitute a "
-- "[block\n    quote](#block-quotes) with contents *Bs*"
-- ", then the result of deleting\n    the initial [block quote marker]"
-- " from one or\n    "
+- "2.  "
+- "**Laziness.**  If a string of lines "
+- "*Ls* constitute a [block\n    quote](#block-quotes)"
+- " with contents *Bs*, then the result of deleting\n    the initial ["
+- "block quote marker] from one or\n    "
 - "more lines in which the next character other than a space or tab after the\n    "
 - "[block quote marker] is [paragraph continuation\n    text] is a block quote with "
 - "*Bs* as its content.\n    "
-- "[Paragraph continuation text](@)"
-- " is text\n    "
+- "[Paragraph continuation text](@) is text\n    "
 - "that will be parsed as part of the content of a paragraph, but does\n    "
 - "not occur at the beginning of the paragraph.\n\n"
-- 3.  **Consecutiveness.
-- "**  A document cannot contain two [block\n    quotes]"
-- " in a row unless there is a [blank line] between them.\n\n"
+- "3.  "
+- "**Consecutiveness.**  A document cannot contain two ["
+- "block\n    quotes] in a row unless there is a [blank line]"
+- " between them.\n\n"
 - "Nothing else counts as a [block quote](#block-quotes).\n"
 - "\nHere is a simple example:\n"
 - "\n"
@@ -2928,8 +2939,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
 - "baz

\n\n" - "````````````````" - "````````````````\n\n\n" -- "The `>` characters can be preceded by up to three spaces of " -- "indentation:\n\n" +- "The `>`" +- " characters can be preceded by up to three spaces of indentation:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -2949,8 +2960,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
\n" - "````````````````" - "````````````````\n\n\n" -- "The Laziness clause allows us to omit the `>` before\n" -- "[paragraph continuation text]:\n\n" +- "The Laziness clause allows us to omit the `>` before\n[" +- "paragraph continuation text]:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -2996,8 +3007,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
  • bar
  • \n\n" - "````````````````" - "````````````````\n\n\n" -- "For the same reason, we can't omit the `> ` in " -- "front of\nsubsequent lines of an indented or fenced code block:\n\n" +- "For the same reason, we can't omit the `> `" +- " in front of\nsubsequent lines of an indented or fenced code block:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -3028,9 +3039,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````\n\n\n" - "To see why, note that in\n" - "\n```markdown\n> foo\n> - bar\n```" -- "\n\n" -- "the `- bar` is indented too far to start a list, " -- "and can't\n" +- "\n\nthe `- bar`" +- " is indented too far to start a list, and can't\n" - "be an indented code block because indented code blocks cannot\n" - "interrupt paragraphs, so it is [paragraph continuation text].\n\n" - "A block quote can be empty:\n" @@ -3067,9 +3077,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n" -- "(Most current Markdown implementations, including John Gruber's\n" -- "original `Markdown.pl`, will parse this example as a " -- "single block quote\n" +- "(Most current Markdown implementations, including John Gruber's\noriginal " +- "`Markdown.pl`" +- ", will parse this example as a single block quote\n" - "with two paragraphs. But it seems better to allow the author to decide\n" - "whether two block quotes or one are wanted.)\n\n" - "Consecutiveness means that if we put these block quotes together,\n" @@ -3138,9 +3148,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    baz

    \n" - "````````````````" - "````````````````\n\n\n" -- "It is a consequence of the Laziness rule that any number\n" -- "of initial `>`" -- "s may be omitted on a continuation line of a\nnested block quote:\n\n" +- "It is a consequence of the Laziness rule that any number\nof initial " +- "`>`s may be omitted on a continuation line of a\n" +- "nested block quote:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -3161,10 +3171,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n" - "````````````````" - "````````````````\n\n\n" -- "When including an indented code block in a block quote,\n" -- "remember that the [block quote marker] includes\nboth the `>`" -- " and a following space of indentation. So *five spaces*" -- " are needed\nafter the `>`:\n\n" +- "When including an indented code block in a block quote,\nremember that the " +- "[block quote marker] includes\nboth the `>`" +- " and a following space of indentation. So *five spaces* are needed\n" +- "after the `>`:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -3175,38 +3185,37 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "## List items\n\n" -- "A [list marker](@) is a\n" -- "[bullet list marker] or an [ordered list marker].\n\n" -- "A [bullet list marker](@)\n" -- "is a `-`, `+`, or `*` character.\n\n" -- "An [ordered list marker](@)\n" -- "is a sequence of 1--9 arabic digits (`0-" -- "9`), followed by either a\n`.` character or a " -- "`)`" +- "A [list marker](@) is a\n[bullet list marker]" +- " or an [ordered list marker].\n\n" +- "A [bullet list marker](@)\nis a `-`, " +- "`+`, or `*` character.\n\n" +- "An [ordered list marker](@)\nis a sequence of 1--" +- "9 arabic digits (`0-9`" +- "), followed by either a\n`.` character or a `)`" - " character. (The reason for the length\n" - "limit is that with 10 digits we start seeing integer overflows\n" - "in some browsers.)\n\nThe following rules define [list items]:\n\n" -- 1. **Basic case. -- "** If a sequence of lines *Ls* constitute a sequence of\n blocks " -- "*Bs* starting with a character other than a space or tab, " -- "and *M* is\n a list marker of width *W*" -- " followed by 1 ≤ *N*" +- "1. " +- "**Basic case.** If a sequence of lines *Ls*" +- " constitute a sequence of\n blocks *Bs*" +- " starting with a character other than a space or tab, and *M*" +- " is\n a list marker of width *W* followed by 1 ≤ *N*" - " ≤ 4 spaces of indentation,\n then the result of prepending " - "*M* and the following spaces to the first line\n of *Ls*" -- ", and indenting subsequent lines of *Ls* by *W + " -- "N* spaces, is a\n list item with *Bs*" +- ", and indenting subsequent lines of *Ls* by " +- "*W + N* spaces, is a\n list item with *Bs*" - " as its contents. The type of the list item\n " - "(bullet or ordered) is determined by the type of its list marker.\n " - "If the list item is ordered, then it is also assigned a start\n " - "number, based on the ordered list marker.\n\n Exceptions:\n\n " -- "1. When the first list item in a [list] interrupts\n" -- " a paragraph---that is, when it starts on a line that would\n " +- "1. When the first list item in a [list] interrupts\n a paragraph" +- "---that is, when it starts on a line that would\n " - "otherwise count as [paragraph continuation text]---then (a)\n " -- "the lines *Ls* must not begin with a blank line, and (" -- "b) if\n the list item is ordered, the start number must be 1.\n " -- "2. " -- "If any line is a [thematic break][thematic breaks] then\n " -- "that line is not a list item.\n\n" +- the lines *Ls* +- " must not begin with a blank line, and (b) if\n " +- "the list item is ordered, the start number must be 1.\n " +- "2. If any line is a [thematic break][thematic breaks" +- "] then\n that line is not a list item.\n\n" - "For example, let *Ls* be the lines\n" - "\n" - "````````````````" @@ -3220,8 +3229,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n" -- "And let *M* be the marker `1.`, and *N" -- "* = 2. Then rule #1 says\n" +- "And let *M* be the marker `1.`, and " +- "*N* = 2. Then rule #1 says\n" - "that the following is an ordered list item with start number 1,\n" - "and the same contents as *Ls*:\n\n" - "````````````````" @@ -3297,14 +3306,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n" -- "Here `two` occurs in the same column as the list marker `1." -- "`,\nbut is actually contained in the list item, because there is\n" +- "Here `two` occurs in the same column as the list marker " +- "`1.`,\n" +- "but is actually contained in the list item, because there is\n" - "sufficient indentation after the last containing blockquote marker.\n\n" -- "The converse is also possible. " -- "In the following example, the word `two`" -- "\noccurs far to the right of the initial text of the list item, " -- "`one`" -- ", but\n" +- "The converse is also possible. In the following example, the word " +- "`two`\n" +- "occurs far to the right of the initial text of the list item, " +- "`one`, but\n" - "it is not considered part of the list item, because it is not " - "indented\nfar enough past the blockquote marker:\n\n" - "````````````````" @@ -3404,15 +3413,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    -1. not ok

    \n" - "````````````````" - "````````````````\n\n\n\n" -- 2. **Item starting with indented code. -- "** If a sequence of lines *Ls*\n constitute a sequence of blocks " +- 2. **Item starting with indented code.** +- " If a sequence of lines *Ls*\n constitute a sequence of blocks " - "*Bs* starting with an indented code\n block, and " -- "*M* is a list marker of width *W*" -- " followed by\n one space of indentation, then the result of prepending " +- "*M* is a list marker of width *W* followed by\n " +- "one space of indentation, then the result of prepending " - "*M* and the\n following space to the first line of *Ls*" - ", and indenting subsequent lines\n of *Ls* by " -- "*W + 1* spaces, is a list item with *Bs* " -- "as its contents.\n " +- "*W + 1* spaces, is a list item with *Bs*" +- " as its contents.\n " - "If a line is empty, then it need not be indented. " - "The type of the\n " - "list item (bullet or ordered) is determined by the type of its list\n " @@ -3442,9 +3451,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n" - "````````````````" - "````````````````\n\n\n" -- "If the *first* block in the list item is an indented code " -- "block,\nthen by rule #2, the contents must be preceded by " -- "*one* space of indentation\nafter the list marker:\n\n" +- If the *first* +- " block in the list item is an indented code block,\n" +- "then by rule #2, the contents must be preceded by *one*" +- " space of indentation\nafter the list marker:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -3512,15 +3522,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n" -- 3. **Item starting with a blank line. -- "** If a sequence of lines *Ls*" -- "\n starting with a single [blank line] constitute a (possibly empty)\n " -- "sequence of blocks *Bs*, and *M* is a list marker " -- "of width *W*,\n then the result of prepending *M*" -- " to the first line of *Ls*, and\n preceding subsequent lines of " -- "*Ls* by *W + 1* spaces of indentation, " -- "is a\n list item with *Bs*" -- " as its contents.\n " +- 3. **Item starting with a blank line.** +- " If a sequence of lines *Ls*\n starting with a single [blank line" +- "] constitute a (possibly empty)\n sequence of blocks *Bs*, and " +- "*M* is a list marker of width *W*,\n " +- "then the result of prepending *M* to the first line of " +- "*Ls*, and\n preceding subsequent lines of *Ls* by " +- "*W + 1* spaces of indentation, is a\n list item with " +- "*Bs* as its contents.\n " - "If a line is empty, then it need not be indented. " - "The type of the\n " - "list item (bullet or ordered) is determined by the type of its list\n " @@ -3550,8 +3559,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "A list item can begin with at most one blank line.\n" -- "In the following example, `foo`" -- " is not part of the list\nitem:\n\n" +- "In the following example, `foo` is not part of the list\n" +- "item:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -3610,9 +3619,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*

    \n

    foo\n1.

    \n" - "````````````````" - "````````````````\n\n\n" -- 4. **Indentation. -- "** If a sequence of lines *Ls*" -- " constitutes a list item\n " +- "4. **Indentation.** If a sequence of lines " +- "*Ls* constitutes a list item\n " - "according to rule #1, #2, or #3, then the result " - "of preceding each line\n of *Ls*" - " by up to three spaces of indentation (the same for each line) " @@ -3674,10 +3682,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n" - "````````````````" - "````````````````\n\n\n\n" -- 5. **Laziness. -- "** If a string of lines *Ls* constitute a " -- "[list\n item](#list-items) with contents *Bs*" -- ", then the result of deleting\n " +- "5. **Laziness.** If a string of lines " +- "*Ls* constitute a [list\n item](#list-items)" +- " with contents *Bs*, then the result of deleting\n " - "some or all of the indentation from one or more lines in which the\n " - "next character other than a space or tab after the indentation is\n [" - "paragraph continuation text] is a\n " @@ -3732,13 +3739,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````\n\n\n\n" -- "6. **That's all." -- "** Nothing that is not counted as a list item by rules\n #1" -- "--5 counts as a [list item](#list-items)" -- ".\n\n" -- "The rules for sublists follow from the general rules\n" -- "[above][List items]" -- ". A sublist must be indented the same number\n" +- "6. **That's all.**" +- " Nothing that is not counted as a list item by rules\n #1--" +- "5 counts as a [list item](#list-items).\n\n" +- "The rules for sublists follow from the general rules\n[above][List items" +- "]. A sublist must be indented the same number\n" - "of spaces of indentation a paragraph would need to be in order to be " - "included\nin the list item.\n\n" - "So, in this case we need two spaces indent:\n" @@ -3821,29 +3826,32 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "### Motivation\n\n" -- "John Gruber's Markdown spec says the following about list items" -- ":\n\n" +- "John Gruber'" +- "s Markdown spec says the following about list items:\n\n" - "1. " -- "\"List markers typically start at the left margin, but may be indented\n" -- " by up to three spaces. List markers must be followed by one or more\n " +- "\"List markers typically start at the left margin, but may be indented\n " +- "by up to three spaces. List markers must be followed by one or more\n " - "spaces or a tab.\"\n\n" - "2. " - "\"To make lists look nice, you can wrap items with hanging indents" - "....\n But if you don't want to, you don'" - "t have to.\"\n\n" -- "3. \"List items may consist of multiple paragraphs. Each subsequent\n" -- " paragraph in a list item must be indented by either 4 spaces or one\n " +- "3. " +- "\"List items may consist of multiple paragraphs. Each subsequent\n " +- "paragraph in a list item must be indented by either 4 spaces or one\n " - "tab.\"\n\n" - "4. " -- "\"It looks nice if you indent every line of the subsequent paragraphs,\n" -- " but here again, Markdown will allow you to be lazy.\"\n\n" +- "\"It looks nice if you indent every line of the subsequent paragraphs,\n " +- "but here again, Markdown will allow you to be lazy.\"\n\n" - "5. " -- "\"To put a blockquote within a list item, the " -- "blockquote's `>`" -- "\n delimiters need to be indented.\"\n\n" +- "\"" +- "To put a blockquote within a list item, the " +- "blockquote's `>`\n " +- "delimiters need to be indented.\"\n\n" - "6. " -- "\"To put a code block within a list item, the code block needs to " -- "be\n indented twice — 8 spaces or two tabs.\"\n\n" +- "\"" +- "To put a code block within a list item, the code block needs to be\n " +- "indented twice — 8 spaces or two tabs.\"\n\n" - "These rules specify that a paragraph under a list item must be indented\n" - "four spaces (presumably, from the left margin, rather than the start of\n" - "the list marker, but this is not said), and that code under a " @@ -3852,14 +3860,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that a block quote must be indented, but not by how much; " - "however, the\nexample given has four spaces indentation. Although nothing is said\n" - "about other kinds of block-level content, it is certainly reasonable to\n" -- infer that *all* -- " block elements under a list item, including other\n" +- "infer that *all* block elements under a list item, including other\n" - "lists, must be indented four spaces. This principle has been called the\n" - "*four-space rule*.\n\n" -- "The four-space rule is clear and principled, and if the reference\n" -- "implementation `Markdown.pl`" -- " had followed it, it probably would have\nbecome the standard. However, " -- "`Markdown.pl`" +- "The four-space rule is clear and principled, and if the reference\nimplementation " +- "`Markdown.pl` had followed it, it probably would have\n" +- "become the standard. However, `Markdown.pl`" - " allowed paragraphs and\n" - "sublists to start with only two spaces indentation, at least on the\n" - "outer level. Worse, its behavior was inconsistent: a sublist of an\n" @@ -3868,17 +3874,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "implementations of Markdown have developed very different rules for\n" - "determining what comes under a list item. " - "(Pandoc and python-Markdown,\n" -- "for example, stuck with Gruber's syntax description and the four-" -- "space\n" +- "for example, stuck with Gruber'" +- "s syntax description and the four-space\n" - "rule, while discount, redcarpet, marked, PHP Markdown, " -- "and others\nfollowed `Markdown.pl`" -- "'s behavior more closely.)\n\n" +- "and others\nfollowed `Markdown.pl`'" +- "s behavior more closely.)\n\n" - "Unfortunately, given the divergences between implementations, there\n" - "is no way to give a spec for list items that will be guaranteed not\n" - "to break any existing documents. However, the spec given here should\n" - "correctly handle lists formatted with either the four-space rule or\n" -- "the more forgiving `Markdown.pl` behavior, provided they " -- "are laid out\nin a way that is natural for a human to read.\n\n" +- "the more forgiving `Markdown.pl`" +- " behavior, provided they are laid out\n" +- "in a way that is natural for a human to read.\n\n" - "The strategy here is to let the width and indentation of the list marker\n" - "determine the indentation necessary for blocks to fall under the list\n" - "item, rather than having a fixed and arbitrary number. The writer can\n" @@ -3893,14 +3900,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "unnatural. It is quite unintuitive that\n\n" - "``` markdown\n- foo\n\n bar\n\n - baz\n```" - "\n\nshould be parsed as two lists with an intervening paragraph,\n" -- "\n``` html\n
      \n" -- "
    • foo
    • \n
    \n" -- "

    bar

    \n
      \n" +- "\n``` html\n" +- "
        \n
      • foo
      • \n" +- "
      \n

      bar

      \n
        \n" - "
      • baz
      • \n
      \n" - "```\n\n" - "as the four-space rule demands, rather than a single list,\n" -- "\n``` html\n
        \n
      • \n" -- "

        foo

        \n" +- "\n``` html\n" +- "
          \n
        • \n

          foo

          \n" - "

          bar

          \n
            \n" - "
          • baz
          • \n
          \n" - "
        • \n
        \n```\n\n" @@ -3909,20 +3916,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Would it help to adopt a two-space rule? The problem is that such\n" - "a rule, together with the rule allowing up to three spaces of indentation " - "for\nthe initial list marker, allows text that is indented " -- "*less than*" -- " the\noriginal list marker to be included in the list item. For example,\n" +- "*less than* the\n" +- "original list marker to be included in the list item. For example,\n" - "`Markdown.pl` parses\n\n" - "``` markdown\n - one\n\n two\n```" - "\n\nas a single list item, with `two` a continuation paragraph:\n" -- "\n``` html\n
          \n
        • \n" -- "

          one

          \n

          two

          \n" -- "
        • \n
        \n```\n\nand similarly\n" -- "\n``` markdown\n> - one\n>\n> two\n```\n\nas\n" -- "\n``` html\n
        \n
          \n" -- "
        • \n

          one

          \n" +- "\n``` html\n" +- "
            \n
          • \n

            one

            \n" - "

            two

            \n
          • \n" -- "
          \n
        \n```\n\n" -- "This is extremely unintuitive.\n" +- "
      \n```\n\nand similarly\n" +- "\n``` markdown\n> - one\n>\n> two\n```\n\nas\n" +- "\n``` html\n" +- "
      \n
        \n
      • \n" +- "

        one

        \n

        two

        \n" +- "
      • \n
      \n
      \n" +- "```\n\nThis is extremely unintuitive.\n" - "\nRather than requiring a fixed indent from the margin, we could require\n" - "a fixed indent (say, two spaces, or even one space) from " - "the list marker (which\n" @@ -3933,18 +3941,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`bar`\nis not indented as far as the first paragraph " - "`foo`:\n\n" - "``` markdown\n 10. foo\n\n bar \n```" -- "\n\n" -- "Arguably this text does read like a list item with `bar` " -- "as a subparagraph,\n" +- "\n\nArguably this text does read like a list item with `bar`" +- " as a subparagraph,\n" - "which may count in favor of the proposal. " - "However, on this proposal indented\n" - "code would have to be indented six spaces after the list marker. " - "And this\nwould break a lot of existing Markdown, which has the pattern:\n\n" - "``` markdown\n1. foo\n\n indented code\n```" -- "\n\nwhere the code is indented eight spaces. " +- "\n\n" +- "where the code is indented eight spaces. " - "The spec above, by contrast, will\n" -- "parse this text as expected, since the code block's indentation " -- "is measured\nfrom the beginning of `foo`.\n\n" +- "parse this text as expected, since the code block'" +- "s indentation is measured\nfrom the beginning of `foo`.\n\n" - "The one case that needs special treatment is a list item that *starts*\n" - "with indented code. " - "How much indentation is required in that case, since\nwe don'" @@ -3956,29 +3964,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "four-space rule in cases where the list marker plus its initial indentation\n" - "takes four spaces (a common case), but diverge in other cases.\n\n" - "## Lists\n\n" -- "A [list](@) is a sequence of one or more\n" -- "list items [of the same type]. The list items\n" +- "A [list](@) is a sequence of one or more\nlist items " +- "[of the same type]. The list items\n" - "may be separated by any number of blank lines.\n\n" - "Two list items are [of the same type](@)\n" - "if they begin with a [list marker] of the same type.\n" - "Two list markers are of the\n" - "same type if (a) they are bullet list markers using the same character\n(" -- "`-`, `+`, or `*`) or (b" -- ") they are ordered list numbers with the same\ndelimiter (either " -- "`.` or `)`).\n\n" +- "`-`, `+`, or `*`" +- ") or (b) they are ordered list numbers with the same\n" +- "delimiter (either `.` or `)`).\n\n" - "A list is an [ordered list](@)\n" - "if its constituent list items begin with\n[ordered list markers], and a\n" -- "[bullet list](@)" -- " if its constituent list\nitems begin with [bullet list markers].\n\n" -- "The [start number](@)\n" -- "of an [ordered list] is determined by the list number of\n" +- "[bullet list](@) if its constituent list\nitems begin with [" +- "bullet list markers].\n\n" +- "The [start number](@)\nof an [ordered list]" +- " is determined by the list number of\n" - "its initial list item. The numbers of subsequent list items are\n" - "disregarded.\n\n" - "A list is [loose](@) if any of its constituent\n" - "list items are separated by blank lines, or if any of its constituent\n" - "list items directly contain two block-level elements with a blank line\n" -- "between them. Otherwise a list is [tight](@)" -- ".\n" +- "between them. Otherwise a list is [tight](@).\n" - "(The difference in HTML output is that paragraphs in a loose list " - "are\nwrapped in `

      `" - " tags, while paragraphs in a tight list are not.)\n\n" @@ -4015,32 +4022,38 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    • baz
    • \n
    \n" - "````````````````" - "````````````````\n\n" -- "`Markdown.pl` does not allow this, through fear of " -- "triggering a list\nvia a numeral in a hard-wrapped line:\n\n" -- "``` markdown\nThe number of windows in my house is\n14. " +- "`Markdown.pl`" +- " does not allow this, through fear of triggering a list\n" +- "via a numeral in a hard-wrapped line:\n\n" +- "``` markdown\n" +- "The number of windows in my house is\n14. " - "The number of doors is 6.\n```\n\n" -- "Oddly, though, `Markdown.pl` *does* allow " -- "a blockquote to\ninterrupt a paragraph, even though the same considerations might\n" -- "apply.\n\n" +- "Oddly, though, `Markdown.pl` *does*" +- " allow a blockquote to\n" +- "interrupt a paragraph, even though the same considerations might\napply.\n\n" - "In CommonMark, we do allow lists to interrupt paragraphs, for\n" - "two reasons. First, it is natural and not uncommon for people\n" - "to start lists without blank lines:\n\n" -- "``` markdown\nI need to buy\n- new shoes\n- a coat\n" -- "- a plane ticket\n```\n\nSecond, we are attracted to a\n\n" -- "> [principle of uniformity](@):\n" -- "> if a chunk of text has a certain\n> " +- "``` markdown\n" +- "I need to buy\n- new shoes\n- a coat\n- a plane ticket\n" +- "```\n\nSecond, we are attracted to a\n\n" +- "> " +- "[principle of uniformity](@):\n> " +- "if a chunk of text has a certain\n> " - "meaning, it will continue to have the same meaning when put into a\n> " - "container block (such as a list item or blockquote).\n\n" -- "(Indeed, the spec for [list items] and [block quotes] " -- "presupposes\nthis principle.) This principle implies that if\n\n" -- "``` markdown\n * I need to buy\n - new shoes\n - a coat\n" -- " - a plane ticket\n```\n\n" +- "(Indeed, the spec for [list items] and [block quotes]" +- " presupposes\nthis principle.) This principle implies that if\n\n" +- "``` markdown\n" +- " * I need to buy\n - new shoes\n - a coat\n - a plane ticket\n" +- "```\n\n" - "is a list item containing a paragraph followed by a nested sublist,\n" - "as all Markdown implementations agree it is (though the paragraph\n" - "may be rendered without `

    ` tags, since the list is \"" - "tight\"),\nthen\n\n" -- "``` markdown\nI need to buy\n- new shoes\n- a coat\n" -- "- a plane ticket\n```\n\n" +- "``` markdown\n" +- "I need to buy\n- new shoes\n- a coat\n- a plane ticket\n" +- "```\n\n" - "by itself should be a paragraph followed by a nested sublist.\n" - "\nSince it is well established Markdown practice to allow lists to\n" - "interrupt paragraphs inside list items, the [principle of\nuniformity]" @@ -4050,8 +4063,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "takes a different approach, requiring blank lines before lists\n" - "even inside other list items.)\n\n" - "In order to solve the problem of unwanted lists in paragraphs with\n" -- "hard-wrapped numerals, we allow only lists starting with `1` " -- "to\ninterrupt paragraphs. Thus,\n\n" +- "hard-wrapped numerals, we allow only lists starting with `1`" +- " to\ninterrupt paragraphs. Thus,\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4168,9 +4181,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- e\n\n" - "````````````````" - "````````````````\n\n" -- "And here, `3. c` is treated as in indented code " -- "block,\nbecause it is indented four spaces and preceded by a\n" -- "blank line.\n\n" +- "And here, `3. c`" +- " is treated as in indented code block,\n" +- "because it is indented four spaces and preceded by a\nblank line.\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4347,18 +4360,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">\n" - "````````````````" - "````````````````\n\n" -- "`hi` is parsed as code, leaving the backtick at the end " -- "as a literal\nbacktick.\n\n\n\n" +- "`hi`" +- " is parsed as code, leaving the backtick at the end as a literal\n" +- "backtick.\n\n\n\n" - "## Code spans\n\n" - "A [backtick string](@)\n" - "is a string of one or more backtick characters (`` ` ``" - ") that is neither\npreceded nor followed by a backtick.\n\n" -- "A [code span](@) begins with a backtick string and ends " -- "with\na backtick string of equal length. The contents of the code span are\n" +- "A [code span](@)" +- " begins with a backtick string and ends with\n" +- "a backtick string of equal length. The contents of the code span are\n" - "the characters between these two backtick strings, normalized in the\nfollowing ways:\n\n" - "- First, [line endings] are converted to [spaces].\n" -- "- If the resulting string both begins *and* ends with a [space]\n" -- " character, but does not consist entirely of [space]\n characters, a single [" +- "- If the resulting string both begins *and* ends with a [space]\n " +- "character, but does not consist entirely of [space]\n characters, a single [" - "space] character is removed from the\n " - "front and back. This allows you to include code that begins\n " - "or ends with backtick characters, which must be separated by\n " @@ -4409,8 +4424,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    a

    \n" - "````````````````" - "````````````````\n\n" -- "Only [spaces], and not [unicode whitespace] in general" -- ", are\nstripped in this way:\n\n" +- "Only [spaces], and not [unicode whitespace]" +- " in general, are\nstripped in this way:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4454,8 +4469,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "/p>\n" - "````````````````" - "````````````````\n\n" -- "Note that browsers will typically collapse consecutive spaces\n" -- "when rendering ``" +- "Note that browsers will typically collapse consecutive spaces\nwhen rendering ``" - " elements, so it is recommended that\nthe following CSS be used:\n\n " - "code{white-space: pre-wrap;}\n" - "\n\nNote that backslash escapes do not work in code spans. All backslashes\n" @@ -4468,9 +4482,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "/p>\n" - "````````````````" - "````````````````\n\n\n" -- "Backslash escapes are never needed, because one can always choose a\n" -- "string of *n* backtick characters as delimiters, where the " -- "code does\nnot contain any strings of exactly *n* backtick characters.\n\n" +- "Backslash escapes are never needed, because one can always choose a\nstring of " +- "*n* backtick characters as delimiters, where the code does\n" +- "not contain any strings of exactly *n* backtick characters.\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4490,8 +4504,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Code span backticks have higher precedence than any other inline\n" - "constructs except HTML tags and autolinks. " - "Thus, for example, this is\n" -- "not parsed as emphasized text, since the second `*` is part of " -- "a code\nspan:\n\n" +- "not parsed as emphasized text, since the second `*`" +- " is part of a code\nspan:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4580,23 +4594,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## Emphasis and strong emphasis\n\n" -- "John Gruber's original [Markdown syntax\n" -- "description](https://daringfireball.net/projects/" -- "markdown/syntax#em) says:\n\n" -- "> Markdown treats asterisks (`*`) and " -- "underscores (`_`" -- ") as indicators of\n> emphasis. Text wrapped with one `*` or " -- "`_` will be wrapped with an HTML\n> " -- "`` tag; double `*`'s or `_" -- "`'s will be wrapped with an HTML ``" -- "\n> tag.\n\n" +- "John Gruber's original " +- "[Markdown syntax\ndescription" +- "](https://daringfireball.net/projects/markdown" +- "/syntax#em) says:\n\n" +- "> " +- "Markdown treats asterisks (`*`" +- ") and underscores (`_`) as indicators of\n> " +- "emphasis. Text wrapped with one `*` or `_`" +- " will be wrapped with an HTML\n> ``" +- " tag; double `*`'s or `_`'" +- "s will be wrapped with an HTML ``\n> " +- "tag.\n\n" - "This is enough for most users, but these rules leave much undecided,\n" - "especially when it comes to nested emphasis. The original\n" -- "`Markdown.pl` test suite makes it clear that triple `*" -- "**` and\n`___`" +- "`Markdown.pl` test suite makes it clear that triple " +- "`***` and\n`___`" - " delimiters can be used for strong emphasis, and most\n" - "implementations have also allowed the following patterns:\n\n" -- "``` markdown\n***strong emph***\n" +- "``` markdown\n" +- "***strong emph***\n" - "***strong** in emph*\n" - "***emph* in strong**\n" - "**in strong *emph***\n" @@ -4604,55 +4621,57 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The following patterns are less widely supported, but the intent\n" - "is clear and they are useful (especially in contexts like bibliography\n" - "entries):\n\n" -- "``` markdown\n*emph *with emph* in it*\n" +- "``` markdown\n" +- "*emph *with emph* in it*\n" - "**strong **with strong** in it**\n```\n\n" -- "Many implementations have also restricted intraword emphasis to\n" -- "the `*`" +- "Many implementations have also restricted intraword emphasis to\nthe `*`" - " forms, to avoid unwanted emphasis in words containing\n" - "internal underscores. (It is best practice to put these in code\n" - "spans, but users often do not.)\n\n" -- "``` markdown\ninternal emphasis: foo*bar*baz\n" +- "``` markdown\n" +- "internal emphasis: foo*bar*baz\n" - "no emphasis: foo_bar_baz\n```\n\n" - "The rules given below capture all of these patterns, while allowing\n" - "for efficient parsing strategies that do not backtrack.\n\n" -- "First, some definitions. " -- "A [delimiter run](@)" +- "First, some definitions. A [delimiter run](@)" - " is either\na sequence of one or more `*`" - " characters that is not preceded or\nfollowed by a non-backslash-escaped " - "`*` character, or a sequence\nof one or more `_`" - " characters that is not preceded or followed by\na non-backslash-escaped " - "`_` character.\n\n" -- "A [left-flanking delimiter run](@) is\n" -- "a [delimiter run] that is (1) not followed by [" +- "A [left-flanking delimiter run](@) is\na " +- "[delimiter run] that is (1) not followed by [" - "Unicode whitespace],\n" -- "and either (2a) not followed by a [Unicode " -- "punctuation character], or\n(2b) followed by a [" -- "Unicode punctuation character] and\npreceded by [" -- "Unicode whitespace] or a [Unicode punctuation " -- "character].\nFor purposes of this definition, the beginning and the end of\n" +- "and either (2a) not followed by a [" +- "Unicode punctuation character], or\n" +- "(2b) followed by a [Unicode punctuation character" +- "] and\npreceded by [Unicode whitespace] or a [" +- "Unicode punctuation character].\n" +- "For purposes of this definition, the beginning and the end of\n" - "the line count as Unicode whitespace.\n\n" -- "A [right-flanking delimiter run](@) is\n" -- "a [delimiter run] that is (1) not preceded by [" +- "A [right-flanking delimiter run](@) is\na " +- "[delimiter run] that is (1) not preceded by [" - "Unicode whitespace],\n" -- "and either (2a) not preceded by a [Unicode " -- "punctuation character], or\n(2b) preceded by a [" -- "Unicode punctuation character] and\nfollowed by [" -- "Unicode whitespace] or a [Unicode punctuation " -- "character].\nFor purposes of this definition, the beginning and the end of\n" +- "and either (2a) not preceded by a [" +- "Unicode punctuation character], or\n" +- "(2b) preceded by a [Unicode punctuation character" +- "] and\nfollowed by [Unicode whitespace] or a [" +- "Unicode punctuation character].\n" +- "For purposes of this definition, the beginning and the end of\n" - "the line count as Unicode whitespace.\n\n" - "Here are some examples of delimiter runs.\n\n" -- " - left-flanking but not right-flanking:\n\n ```\n" -- " ***abc\n _abc\n " +- " - left-flanking but not right-flanking:\n" +- "\n ```\n ***abc\n _abc\n " - "**\"abc\"\n _\"abc\"\n ```\n\n" -- " - right-flanking but not left-flanking:\n\n ```\n" -- " abc***\n abc_\n " +- " - right-flanking but not left-flanking:\n" +- "\n ```\n abc***\n abc_\n " - "\"abc\"**\n \"abc\"_\n ```\n\n" -- " - Both left and right-flanking:\n\n ```\n" -- " abc***def\n \"abc\"_\"def\"\n" -- " ```\n\n" -- " - Neither left nor right-flanking:\n\n ```\n" -- " abc *** def\n a _ b\n ```\n\n" -- "(The idea of distinguishing left-flanking and right-flanking\n" +- " - Both left and right-flanking:\n" +- "\n ```\n abc***def\n " +- "\"abc\"_\"def\"\n ```\n\n" +- " - Neither left nor right-flanking:\n" +- "\n ```\n abc *** def\n a _ b\n ```" +- "\n\n(The idea of distinguishing left-flanking and right-flanking\n" - "delimiter runs based on the character before and the character\n" - "after comes from Roopesh Chander's\n" - "[vfmd](https://web.archive.org" @@ -4664,47 +4683,53 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " and its rules for distinguishing left- and right-flanking runs\n" - "are a bit more complex than the ones given here.)\n\n" - "The following rules define emphasis and strong emphasis:\n\n" -- "1. A single `*` character [can open emphasis](@)\n" -- " iff (if and only if) it is part of a [left-" -- "flanking delimiter run].\n\n" -- "2. A single `_` character [can open emphasis] iff\n" -- " it is part of a [left-flanking delimiter run]\n " -- "and either (a) not part of a [right-flanking " -- "delimiter run]\n or (b) part of a [" -- "right-flanking delimiter run]\n preceded by a [" +- "1. " +- "A single `*` character [can open emphasis](@)\n " +- "iff (if and only if) it is part of a [" +- "left-flanking delimiter run].\n\n" +- "2. " +- "A single `_` character [can open emphasis] iff\n " +- "it is part of a [left-flanking delimiter run]\n " +- "and either (a) not part of a [" +- "right-flanking delimiter run]\n or (b) part of a " +- "[right-flanking delimiter run]\n preceded by a [" - "Unicode punctuation character].\n\n" -- "3. A single `*` character [can close emphasis](@)\n" -- " iff it is part of a [right-flanking delimiter run" +- "3. " +- "A single `*` character [can close emphasis](@)\n " +- "iff it is part of a [right-flanking delimiter run" - "].\n\n" -- "4. A single `_` character [can close emphasis] iff\n" -- " it is part of a [right-flanking delimiter run]\n " -- "and either (a) not part of a [left-flanking " -- "delimiter run]\n or (b) part of a [" -- "left-flanking delimiter run]\n followed by a [" +- "4. " +- "A single `_` character [can close emphasis] iff\n " +- "it is part of a [right-flanking delimiter run]\n " +- "and either (a) not part of a [" +- "left-flanking delimiter run]\n or (b) part of a " +- "[left-flanking delimiter run]\n followed by a [" - "Unicode punctuation character].\n\n" - "5. " -- "A double `**` [can open strong emphasis](@)\n" -- " iff it is part of a [left-flanking delimiter run" +- "A double `**` [can open strong emphasis](@)\n " +- "iff it is part of a [left-flanking delimiter run" - "].\n\n" -- "6. A double `__` [can open strong emphasis] iff\n" -- " it is part of a [left-flanking delimiter run]\n " -- "and either (a) not part of a [right-flanking " -- "delimiter run]\n or (b) part of a [" -- "right-flanking delimiter run]\n preceded by a [" +- "6. " +- "A double `__` [can open strong emphasis] iff\n " +- "it is part of a [left-flanking delimiter run]\n " +- "and either (a) not part of a [" +- "right-flanking delimiter run]\n or (b) part of a " +- "[right-flanking delimiter run]\n preceded by a [" - "Unicode punctuation character].\n\n" - "7. " -- "A double `**` [can close strong emphasis](@)\n" -- " iff it is part of a [right-flanking delimiter run" +- "A double `**` [can close strong emphasis](@)\n " +- "iff it is part of a [right-flanking delimiter run" - "].\n\n" -- "8. A double `__` [can close strong emphasis] iff\n" -- " it is part of a [right-flanking delimiter run]\n " -- "and either (a) not part of a [left-flanking " -- "delimiter run]\n or (b) part of a [" -- "left-flanking delimiter run]\n followed by a [" +- "8. " +- "A double `__` [can close strong emphasis] iff\n " +- "it is part of a [right-flanking delimiter run]\n " +- "and either (a) not part of a [" +- "left-flanking delimiter run]\n or (b) part of a " +- "[left-flanking delimiter run]\n followed by a [" - "Unicode punctuation character].\n\n" - "9. " -- "Emphasis begins with a delimiter that [can open emphasis] and " -- "ends\n with a delimiter that [can close emphasis]" +- "Emphasis begins with a delimiter that [can open emphasis]" +- " and ends\n with a delimiter that [can close emphasis]" - ", and that uses the same\n character (`_` or `*`" - ") as the opening delimiter. The\n " - "opening and closing delimiters must belong to separate\n [delimiter runs" @@ -4712,10 +4737,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "open and close emphasis, then the sum of the lengths of the\n " - "delimiter runs containing the opening and closing delimiters\n " - "must not be a multiple of 3 unless both lengths are\n multiples of 3.\n\n" -- "10. Strong emphasis begins with a delimiter that\n" -- " [can open strong emphasis] and ends with a delimiter that\n [" -- "can close strong emphasis], and that uses the same character\n (`_`" -- " or `*`" +- "10. " +- "Strong emphasis begins with a delimiter that\n [can open strong emphasis]" +- " and ends with a delimiter that\n [can close strong emphasis]" +- ", and that uses the same character\n (`_` or `*`" - ") as the opening delimiter. The\n " - "opening and closing delimiters must belong to separate\n [delimiter runs" - "]. If one of the delimiters can both open\n " @@ -4723,37 +4748,45 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the delimiter runs containing the opening and closing\n " - "delimiters must not be a multiple of 3 unless both lengths\n " - "are multiples of 3.\n\n" -- "11. A literal `*` character cannot occur at the beginning or end of\n" -- " `*`-delimited emphasis or `**`-" -- "delimited strong emphasis, unless it\n is backslash-escaped.\n\n" -- "12. A literal `_` character cannot occur at the beginning or end of\n" -- " `_`-delimited emphasis or `__`-" -- "delimited strong emphasis, unless it\n is backslash-escaped.\n\n" +- "11. " +- "A literal `*` character cannot occur at the beginning or end of\n " +- "`*`-delimited emphasis or `**`" +- "-delimited strong emphasis, unless it\n is backslash-escaped.\n\n" +- "12. " +- "A literal `_` character cannot occur at the beginning or end of\n " +- "`_`-delimited emphasis or `__`" +- "-delimited strong emphasis, unless it\n is backslash-escaped.\n\n" - "Where rules 1--12 above are compatible with multiple parsings,\n" - "the following principles resolve ambiguity:\n\n" -- "13. The number of nestings should be minimized. " -- "Thus, for example,\n an interpretation " -- "`...` is always preferred to\n " +- "13. " +- "The number of nestings should be minimized. Thus, for example,\n " +- "an interpretation `...`" +- " is always preferred to\n " - "`...`.\n\n" - "14. " -- "An interpretation `..." -- "` is always\n preferred to " +- "An interpretation " +- "`...` is always\n preferred to " - "`...`.\n\n" -- "15. When two potential emphasis or strong emphasis spans overlap,\n" -- " so that the second begins before the first ends and ends after\n " +- "15. " +- "When two potential emphasis or strong emphasis spans overlap,\n " +- "so that the second begins before the first ends and ends after\n " - "the first ends, the first takes precedence. Thus, for example,\n " -- "`*foo _bar* baz_` is parsed as `" -- "foo _bar baz_` rather\n " -- "than `*foo bar* baz" -- "`.\n\n" -- "16. When there are two potential emphasis or strong emphasis spans\n" -- " with the same closing delimiter, the shorter one (the one that\n " +- "`*foo _bar* baz_` is parsed as " +- "`foo _bar baz_`" +- " rather\n than " +- "`*foo bar* baz`" +- ".\n\n" +- "16. " +- "When there are two potential emphasis or strong emphasis spans\n " +- "with the same closing delimiter, the shorter one (the one that\n " - "opens later) takes precedence. Thus, for example,\n " -- "`**foo **bar baz**` is parsed " -- "as `**foo bar baz" -- "`\n rather than " +- "`**foo **bar baz**`" +- " is parsed as " +- "`**foo bar baz`\n " +- "rather than " - "`foo **bar baz`" - ".\n\n" - "17. " @@ -4776,8 +4809,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "This is not emphasis, because the opening `*` is followed by\n" -- "whitespace, and hence not part of a [left-flanking " -- "delimiter run]:\n\n" +- "whitespace, and hence not part of a [" +- "left-flanking delimiter run]:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -4938,8 +4971,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````\n\n\n" - "This is not emphasis, because the second `*` is\n" - "preceded by punctuation and followed by an alphanumeric\n" -- "(hence it is not part of a [right-flanking delimiter " -- "run]:\n\n" +- "(hence it is not part of a [" +- "right-flanking delimiter run]:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5402,9 +5435,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    \n" - "````````````````" - "````````````````\n\n\n" -- "When the lengths of the interior closing and opening\n" -- delimiter runs are *both* -- " multiples of 3, though,\nthey can match to create emphasis:\n\n" +- "When the lengths of the interior closing and opening\ndelimiter runs are " +- "*both* multiples of 3, though,\n" +- "they can match to create emphasis:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5634,8 +5667,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "Note that when delimiters do not match evenly, Rule 11 determines\n" -- "that the excess literal `*`" -- " characters will appear outside of the\nemphasis, rather than inside it:\n\n" +- "that the excess literal `*` characters will appear outside of the\n" +- "emphasis, rather than inside it:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5741,8 +5774,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "Note that when delimiters do not match evenly, Rule 12 determines\n" -- "that the excess literal `_`" -- " characters will appear outside of the\nemphasis, rather than inside it:\n\n" +- "that the excess literal `_` characters will appear outside of the\n" +- "emphasis, rather than inside it:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -5980,73 +6013,81 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n\n" - "## Links\n\n" -- "A link contains [link text] (the visible text), a [link " -- "destination]\n" +- "A link contains [link text] (the visible text), a [" +- "link destination]\n" - "(the URI that is the link destination), and optionally a [" - "link title].\nThere are two basic kinds of links in Markdown. In " - "[inline links] the\n" - "destination and title are given immediately after the link text. In\n[reference links]" - " the destination and title are defined elsewhere in\nthe document.\n\n" - "A [link text](@) consists of a sequence of zero or more\n" -- "inline elements enclosed by square brackets (`[` and `]`)" -- ". The\nfollowing rules apply:\n\n" -- "- Links may not contain other links, at any level of nesting. If\n" -- " multiple otherwise valid link definitions appear nested inside each\n " +- "inline elements enclosed by square brackets (`[` and `]`" +- "). The\nfollowing rules apply:\n\n" +- "- " +- "Links may not contain other links, at any level of nesting. If\n " +- "multiple otherwise valid link definitions appear nested inside each\n " - "other, the inner-most definition is used.\n\n" -- "- Brackets are allowed in the [link text] only if (a" -- ") they\n " +- "- " +- "Brackets are allowed in the [link text]" +- " only if (a) they\n " - "are backslash-escaped or (b) they appear as a matched pair of " - "brackets,\n with an open bracket `[`" - ", a sequence of zero or more inlines, and\n a close bracket " - "`]`.\n\n" -- "- Backtick [code spans], [autolinks], and raw " -- "[HTML tags] bind more tightly\n " +- "- " +- "Backtick [code spans], [autolinks], and raw [" +- "HTML tags] bind more tightly\n " - "than the brackets in link text. Thus, for example,\n " -- "`` [foo`]` `` could not be a link text" -- ", since the second `]`\n is part of a code span.\n\n" -- "- The brackets in link text bind more tightly than markers for\n" -- " [emphasis and strong emphasis]. Thus, for example, " +- "`` [foo`]` ``" +- " could not be a link text, since the second `]`\n " +- "is part of a code span.\n\n" +- "- " +- "The brackets in link text bind more tightly than markers for\n [emphasis and strong emphasis" +- "]. Thus, for example, " - "`*[foo*](url)` is a link.\n\n" - "A [link destination](@) consists of either\n\n" -- "- a sequence of zero or more characters between an opening `<` and a\n" -- " closing `>` that contains no line endings or unescaped\n `<`" -- " or `>` characters, or\n\n" -- "- a nonempty sequence of characters that does not start with `<`" -- ",\n does not include [ASCII control characters][" -- "ASCII control character]\n or [space]" +- "- " +- "a sequence of zero or more characters between an opening `<` and a\n closing " +- "`>` that contains no line endings or unescaped\n `<` or " +- "`>` characters, or\n\n" +- "- " +- "a nonempty sequence of characters that does not start with `<`,\n " +- "does not include [ASCII control characters][ASCII control character" +- "]\n or [space]" - " character, and includes parentheses only if (a) they are\n " - "backslash-escaped or (b) they are part of a balanced pair of\n " - "unescaped parentheses.\n " - "(Implementations may impose limits on parentheses nesting to\n " - "avoid performance issues, but at least three levels of nesting\n should be supported.)\n\n" - "A [link title](@) consists of either\n\n" -- "- a sequence of zero or more characters between straight double-quote\n" -- " characters (`\"`), including a `\"` character only if it " -- "is\n backslash-escaped, or\n\n" -- "- a sequence of zero or more characters between straight single-quote\n" -- " characters (`'`), including a `'` character only if it " -- "is\n backslash-escaped, or\n\n" -- "- a sequence of zero or more characters between matching parentheses\n" -- " (`(...)`), including a `(` or " +- "- " +- "a sequence of zero or more characters between straight double-quote\n characters (" +- "`\"`), including a `\"` character only if it is\n " +- "backslash-escaped, or\n\n" +- "- " +- "a sequence of zero or more characters between straight single-quote\n characters (" +- "`'`), including a `'` character only if it is\n " +- "backslash-escaped, or\n\n" +- "- " +- "a sequence of zero or more characters between matching parentheses\n (" +- "`(...)`), including a `(` or " - "`)` character only if it is\n backslash-escaped.\n\n" -- "Although [link titles] may span multiple lines, they may not contain\n" -- "a [blank line].\n\n" -- "An [inline link](@) consists of a [link text] " -- "followed immediately\nby a left parenthesis `(`" -- ", an optional [link destination], an optional\n[link title]" -- ", and a right parenthesis `)`" -- ".\n" +- "Although [link titles] may span multiple lines, they may not contain\na [" +- "blank line].\n\n" +- "An [inline link](@) consists of a [link text]" +- " followed immediately\nby a left parenthesis `(`, an optional [link destination" +- "], an optional\n[link title], and a right parenthesis " +- "`)`.\n" - "These four components may be separated by spaces, tabs, and up to one " - "line\nending.\nIf both [link destination] and [link title]" -- " are present, they *must*" -- " be\nseparated by spaces, tabs, and up to one line ending.\n\n" -- "The link's text consists of the inlines contained\n" -- "in the [link text] (excluding the enclosing square brackets)" -- ".\nThe link'" +- " are present, they *must* be\n" +- "separated by spaces, tabs, and up to one line ending.\n\n" +- "The link's text consists of the inlines contained\nin the [link text" +- "] (excluding the enclosing square brackets).\nThe link'" - "s URI consists of the link destination, excluding enclosing\n" -- "`<...>` if present, with backslash-escapes in " -- "effect as described\nabove. The link'" -- "s title consists of the link title, excluding its\n" +- "`<...>`" +- " if present, with backslash-escapes in effect as described\nabove. The link" +- "'s title consists of the link title, excluding its\n" - "enclosing delimiters, with backslash-escapes in effect " - "as described\nabove.\n\nHere is a simple inline link:\n" - "\n" @@ -6341,20 +6382,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "link

    \n" - "````````````````" - "````````````````\n\n\n" -- "(Note: `Markdown.pl` did allow double quotes inside a " -- "double-quoted\ntitle, and its test suite included a test demonstrating this.\n" +- "(Note: `Markdown.pl`" +- " did allow double quotes inside a double-quoted\n" +- "title, and its test suite included a test demonstrating this.\n" - "But it is hard to see a good rationale for the extra complexity this\n" - "brings, since there are already many ways---backslash escaping,\n" - "entity and numeric character references, or using a different\n" - "quote type for the enclosing title---to write titles containing\n" -- "double quotes. `Markdown.pl`" -- "'s handling of titles has a number\n" +- "double quotes. `Markdown.pl`'" +- "s handling of titles has a number\n" - "of other strange features. For example, it allows single-quoted\n" - "titles in inline links, but not reference links. And, in\n" - "reference links but not inline links, it allows a title to begin\nwith " - "`\"` and end with `)`. " -- "`Markdown.pl`" -- " 1.0.1 even allows\n" +- "`Markdown.pl` 1.0.1 even allows\n" - "titles with no closing quotation mark, though 1.0.2b8 " - "does not.\nIt seems preferable to adopt a simple, rational rule that works\n" - "the same way in inline links and link reference definitions.)\n\n" @@ -6528,14 +6569,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "There are three kinds of [reference link](@)s:\n" -- "[full](#full-reference-link), [collapsed](" -- "#collapsed-reference-link),\nand " +- "[full](#full-reference-link), " +- "[collapsed](#collapsed-reference-link),\nand " - "[shortcut](#shortcut-reference-link).\n\n" -- "A [full reference link](@)\n" -- "consists of a [link text] immediately followed by a [link label]\nthat " -- "[matches] a [link reference definition] elsewhere in the document.\n\n" -- "A [link label](@) begins with a left bracket (`[" -- "`) and ends\nwith the first right bracket (`]`" +- "A [full reference link](@)\nconsists of a [link text]" +- " immediately followed by a [link label]\nthat [matches] a [" +- "link reference definition] elsewhere in the document.\n\n" +- "A [link label](@) begins with a left bracket (" +- "`[`) and ends\nwith the first right bracket (`]`" - ") that is not backslash-escaped.\n" - "Between these brackets there must be at least one character that is not a space,\n" - "tab, or line ending.\n" @@ -6545,8 +6586,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "One label [matches](@)\n" - "another just in case their normalized forms are equal. To normalize a\n" - "label, strip off the opening and closing brackets,\nperform the " -- "*Unicode case fold*" -- ", strip leading and trailing\n" +- "*Unicode case fold*, strip leading and trailing\n" - "spaces, tabs, and line endings, and collapse consecutive internal\n" - "spaces, tabs, and line endings to a single space. " - "If there are multiple\nmatching reference link definitions, the one that comes first in the\n" @@ -6564,8 +6604,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\"title\">foo

    \n" - "````````````````" - "````````````````\n\n\n" -- "The rules for the [link text] are the same as with\n" -- "[inline links]. Thus:\n\n" +- "The rules for the [link text] are the same as with\n[" +- "inline links]. Thus:\n\n" - "The link text may contain balanced brackets, but not unbalanced ones,\n" - "unless they are escaped:\n\n" - "````````````````" @@ -6751,12 +6791,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "]. If whitespace is allowed between the\n" - "link text and the link label, then in the following we will have\n" - "a single reference link, not two shortcut reference links, as\nintended:\n\n" -- "``` markdown\n[foo]\n[bar]\n\n" -- "[foo]: /url1\n" +- "``` markdown\n" +- "[foo]\n[bar]\n\n[foo]: /url1\n" - "[bar]: /url2\n```\n\n" - "(Note that [shortcut reference links] were introduced by Gruber\n" -- "himself in a beta version of `Markdown.pl`, but never " -- "included\nin the official syntax description. Without shortcut reference\n" +- "himself in a beta version of `Markdown.pl`" +- ", but never included\nin the official syntax description. Without shortcut reference\n" - "links, it is harmless to allow space between the link text and\n" - "link label; but once shortcut references are introduced, it is\n" - "too dangerous to allow this, as it frequently leads to\n" @@ -6835,8 +6875,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\\

    \n" - "````````````````" - "````````````````\n\n\n" -- "A [link label] must contain at least one character that is not a space" -- ", tab, or\nline ending:\n\n" +- "A [link label]" +- " must contain at least one character that is not a space, tab, or\n" +- "line ending:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -6853,14 +6894,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "]: /uri

    \n" - "````````````````" - "````````````````\n\n\n" -- "A [collapsed reference link](@)\n" -- "consists of a [link label] that [matches] a\n[link reference definition" -- "] elsewhere in the\ndocument, followed by the string `[]`" -- ".\nThe contents of the link label are parsed as inlines,\n" +- "A [collapsed reference link](@)\nconsists of a [link label]" +- " that [matches] a\n[link reference definition] elsewhere in the\n" +- "document, followed by the string `[]`.\n" +- "The contents of the link label are parsed as inlines,\n" - "which are used as the link's text. The link'" - "s URI and title are\nprovided by the matching reference link definition. Thus,\n" -- "`[foo][]` is equivalent to `[foo]" -- "[foo]`.\n\n" +- "`[foo][]` is equivalent to " +- "`[foo][foo]`.\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -6902,15 +6943,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\"title\">foo\n[]

    \n" - "````````````````" - "````````````````\n\n\n" -- "A [shortcut reference link](@)\n" -- "consists of a [link label] that [matches] a\n[link reference definition" -- "] elsewhere in the\ndocument and is not followed by `[]`" -- " or a link label.\n" +- "A [shortcut reference link](@)\nconsists of a [link label" +- "] that [matches] a\n[link reference definition] elsewhere in the\n" +- "document and is not followed by `[]` or a link label.\n" - "The contents of the link label are parsed as inlines,\n" - "which are used as the link's text. The link'" - "s URI and title\nare provided by the matching link reference definition.\nThus, " -- "`[foo]` is equivalent to `[foo][]" -- "`.\n\n" +- "`[foo]` is equivalent to " +- "`[foo][]`.\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -7024,8 +7064,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo(not a link)

    \n" - "````````````````" - "````````````````\n\n" -- "In the following case `[bar][baz]` is parsed " -- "as a reference,\n`[foo]` as normal text:\n\n" +- "In the following case `[bar][baz]`" +- " is parsed as a reference,\n`[foo]`" +- " as normal text:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -7035,8 +7076,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "url\">bar

    \n" - "````````````````" - "````````````````\n\n\n" -- "Here, though, `[foo][bar]` is parsed " -- "as a reference, since\n`[bar]` is defined:\n\n" +- "Here, though, `[foo][bar]`" +- " is parsed as a reference, since\n`[bar]` is defined:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -7048,9 +7089,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "url1\">baz

    \n" - "````````````````" - "````````````````\n\n\n" -- "Here `[foo]` is not parsed as a shortcut reference" -- ", because it\nis followed by a link label (even though " -- "`[bar]` is not defined):\n\n" +- "Here `[foo]`" +- " is not parsed as a shortcut reference, because it\n" +- "is followed by a link label (even though `[bar]`" +- " is not defined):\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -7064,11 +7106,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Images\n\n" - "Syntax for images is like the syntax for links, with one\n" - "difference. Instead of [link text], we have an\n" -- "[image description](@)" -- ". The rules for this are the\nsame as for [link text]" -- ", except that (a) an\nimage description starts with `![`" -- " rather than `[`" -- ", and\n(b) an image description may contain links.\n" +- "[image description](@). The rules for this are the\n" +- "same as for [link text], except that (a) an\n" +- "image description starts with `![` rather than `[`, and\n" +- "(b) an image description may contain links.\n" - "An image description has inline elements\n" - "as its contents. When an image is rendered to HTML,\n" - "this is standardly used as the image's `alt` attribute.\n\n" @@ -7113,8 +7154,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Though this spec is concerned with parsing, not rendering, it is\n" - "recommended that in rendering to HTML, only the plain string content\nof the " - "[image description] be used. Note that in\n" -- "the above example, the alt attribute's value is `foo bar" -- "`, not `foo\n[bar](/url)` or " +- "the above example, the alt attribute's value is " +- "`foo bar`, not " +- "`foo\n[bar](/url)` or " - "`foo bar" - "`. Only the plain string\n" - "content is rendered, without formatting.\n\n" @@ -7282,8 +7324,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "=\"Foo\" title=\"title\" />

    \n" - "````````````````" - "````````````````\n\n\n" -- "If you just want a literal `!" -- "` followed by bracketed text, you can\nbackslash-escape the opening " +- "If you just want a literal `!`" +- " followed by bracketed text, you can\nbackslash-escape the opening " - "`[`:\n\n" - "````````````````" - "```````````````` " @@ -7293,8 +7335,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    ![foo]

    \n" - "````````````````" - "````````````````\n\n\n" -- "If you want a link after a literal `!" -- "`, backslash-escape the\n`!`:\n\n" +- "If you want a link after a literal `!`" +- ", backslash-escape the\n`!`:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -7305,28 +7347,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## Autolinks\n\n" -- "[Autolink](@)s are absolute URIs and email addresses " -- "inside\n`<` and `>`" +- "[Autolink](@)" +- "s are absolute URIs and email addresses inside\n`<` and " +- "`>`" - ". They are parsed as links, with the URL or email address\n" - "as the link label.\n\n" -- "A [URI autolink](@) consists of `<`, " -- "followed by an\n[absolute URI] followed by `>`" +- "A [URI autolink](@) consists of `<`" +- ", followed by an\n[absolute URI] followed by `>`" - ". It is parsed as\n" -- "a link to the URI, with the URI as the link's " -- "label.\n\n" +- "a link to the URI, with the URI as the link'" +- "s label.\n\n" - "An [absolute URI](@),\n" - "for these purposes, consists of a [scheme] followed by a colon (" -- "`:`" -- ")\nfollowed by zero or more characters other than [ASCII control\ncharacters]" -- "[ASCII control character], [space], `<`, " -- "and `>`" -- ".\nIf the URI includes these characters, they must be percent-encoded\n" +- "`:`)\nfollowed by zero or more characters other than [" +- "ASCII control\ncharacters][ASCII control character], [space" +- "], `<`, and `>`.\n" +- "If the URI includes these characters, they must be percent-encoded\n" - "(e.g. `%20` for a space).\n\n" -- "For purposes of this spec, a [scheme](@) is any " -- "sequence\nof 2--32 characters beginning with an ASCII letter and followed\n" +- "For purposes of this spec, a [scheme](@)" +- " is any sequence\nof 2--" +- "32 characters beginning with an ASCII letter and followed\n" - "by any combination of ASCII letters, digits, or the symbols plus\n(" -- "\"+\"), period (\".\"), or " -- "hyphen (\"-\").\n\n" +- "\"+\"), period (\".\"" +- "), or hyphen (\"-\").\n\n" - "Here are some valid autolinks:\n" - "\n" - "````````````````" @@ -7436,18 +7479,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "p>\n" - "````````````````" - "````````````````\n\n\n" -- "An [email autolink](@)\n" -- "consists of `<`, followed by an [email address],\nfollowed by " -- "`>`" +- "An [email autolink](@)\nconsists of `<`" +- ", followed by an [email address],\nfollowed by `>`" - ". The link's label is the email address,\nand the URL is " - "`mailto:` followed by the email address.\n\n" - "An [email address](@),\n" - "for these purposes, is anything that matches\nthe " -- "[non-normative regex from the HTML5\n" -- "spec](https://" -- html.spec.whatwg.org/multipage/ -- "forms.html#e-mail-state-(type=email))" -- ":\n\n " +- "[non-normative regex from the HTML5\nspec" +- "](https://html.spec.whatwg.org" +- "/multipage/forms.html#e-mail-state-(type" +- "=email)):\n\n " - "/^[a-zA-Z0-9.!" - "#$%&'*+/=?" - "^_`{|}~-]+@[a-zA" @@ -7537,8 +7578,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````\n\n\n" - "## Raw HTML\n\n" -- "Text between `<` and `>` that looks like an HTML " -- "tag is parsed as a\n" +- "Text between `<` and `>`" +- " that looks like an HTML tag is parsed as a\n" - "raw HTML tag and will be rendered in HTML without escaping.\n" - "Tag and attribute names are not limited to current HTML tags,\n" - "so custom tags (and even, say, DocBook tags) may be " @@ -7546,11 +7587,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nA [tag name](@) consists of an ASCII letter\n" - "followed by zero or more ASCII letters, digits, or\n" - "hyphens (`-`).\n\n" -- "An [attribute](@) consists of spaces, tabs, and up " -- "to one line ending,\nan [attribute name], and an optional\n[" -- "attribute value specification].\n\n" -- "An [attribute name](@)\n" -- "consists of an ASCII letter, `_`, or `:`" +- "An [attribute](@)" +- " consists of spaces, tabs, and up to one line ending,\nan [" +- "attribute name], and an optional\n[attribute value specification].\n\n" +- "An [attribute name](@)\nconsists of an ASCII letter, " +- "`_`, or `:`" - ", followed by zero or more ASCII\nletters, digits, `_`" - ", `.`, `:`, or `-`" - ". (Note: This is the XML\n" @@ -7558,46 +7599,47 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "HTML5 is laxer.)\n\n" - "An [attribute value specification](@)\n" - "consists of optional spaces, tabs, and up to one line ending,\na " -- "`=` character, optional spaces, tabs, and up to one line " -- "ending,\nand an [attribute value].\n\n" -- "An [attribute value](@)\n" -- "consists of an [unquoted attribute value],\na [" -- "single-quoted attribute value], or a [double-quoted attribute value]" -- ".\n\n" +- "`=`" +- " character, optional spaces, tabs, and up to one line ending,\n" +- "and an [attribute value].\n\n" +- "An [attribute value](@)\nconsists of an [" +- "unquoted attribute value],\na [single-quoted attribute value]" +- ", or a [double-quoted attribute value].\n\n" - "An [unquoted attribute value](@)\n" - "is a nonempty string of characters not\n" - "including spaces, tabs, line endings, `\"`, `'`" -- ", `=`, `<`, `>`, or `` " -- "` ``.\n\n" -- "A [single-quoted attribute value](@)\n" -- "consists of `'`, zero or more\ncharacters not including `'`" -- ", and a final `'`.\n\n" -- "A [double-quoted attribute value](@)\n" -- "consists of `\"`, zero or more\ncharacters not including `\"`" -- ", and a final `\"`.\n\n" -- "An [open tag](@) consists of a `<` character, " -- "a [tag name],\nzero or more [attributes]" +- ", `=`, `<`, `>`, or " +- "`` ` ``.\n\n" +- "A [single-quoted attribute value](@)\nconsists of `'`" +- ", zero or more\ncharacters not including `'`, and a final " +- "`'`.\n\n" +- "A [double-quoted attribute value](@)\nconsists of `\"`" +- ", zero or more\ncharacters not including `\"`, and a final " +- "`\"`.\n\n" +- "An [open tag](@) consists of a `<`" +- " character, a [tag name],\nzero or more [attributes]" - ", optional spaces, tabs, and up to one line ending,\nan optional " - "`/` character, and a `>` character.\n\n" - "A [closing tag](@) consists of the string ``.\n\n" -- "An [HTML comment](@) consists of ``, ``, or `<" -- "!--`, a string of\ncharacters not including the string " -- "`-->`, and `-->` (see the\n" +- "An [HTML comment](@) consists of " +- "``, ``" +- ", or ``, and " +- "`-->` (see the\n" - "[HTML spec](https://" - html.spec.whatwg.org/multipage/ - "parsing.html#markup-declaration-open-state)).\n\n" -- "A [processing instruction](@)\nconsists of the string ``" +- "A [processing instruction](@)\nconsists of the string ``" - ", and the string\n`?>`.\n\n" -- "A [declaration](@) consists of the string ``, and the character `>`.\n\n" -- "A [CDATA section](@) consists of\n" -- "the string ``" - ", and the string `]]>`.\n\n" - "An [HTML tag](@) consists of an [open tag" @@ -7819,9 +7861,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A line ending (not in a code span or HTML tag) that " - "is preceded\n" - "by two or more spaces and does not occur at the end of a block\n" -- "is parsed as a [hard line break](@)" -- " (rendered\nin HTML as a `
    `" -- " tag):\n\n" +- "is parsed as a [hard line break](@) (rendered\n" +- "in HTML as a `
    ` tag):\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -7829,8 +7870,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "baz

    \n" - "````````````````" - "````````````````\n\n\n" -- "For a more visible alternative, a backslash before the\n" -- "[line ending] may be used instead of two or more spaces:\n\n" +- "For a more visible alternative, a backslash before the\n[line ending]" +- " may be used instead of two or more spaces:\n\n" - "````````````````" - "```````````````` " - "example\n" @@ -7999,7 +8040,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "In this appendix we describe some features of the parsing strategy\n" - "used in the CommonMark reference implementations.\n\n" - "## Overview\n\nParsing has two phases:\n\n" -- "1. In the first phase, lines of input are consumed and the block\n" +- "1. " +- "In the first phase, lines of input are consumed and the block\n" - "structure of the document---its division into paragraphs, block quotes,\n" - "list items, and so on---" - "is constructed. Text is assigned to these\n" @@ -8013,17 +8055,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "At each point in processing, the document is represented as a tree of\n" - "**blocks**. The root of the tree is a `document`" - " block. The `document`\nmay have any number of other blocks as " -- "**children**" -- ". These children\n" +- "**children**. These children\n" - "may, in turn, have other blocks as children. " - "The last child of a block\nis normally considered **open**" - ", meaning that subsequent lines of input\n" - "can alter its contents. (Blocks that are not open are " -- "**closed**" -- ".)\n" +- "**closed**.)\n" - "Here, for example, is a possible document tree, with the open blocks\n" - "marked by arrows:\n\n" -- "``` tree\n-> document\n -> block_quote\n paragraph\n" +- "``` tree\n" +- "-> document\n -> block_quote\n paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n" - " -> list (type=bullet tight=true bullet_char=-" - ")\n list_item\n paragraph\n" @@ -8034,32 +8075,35 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Each line that is processed has an effect on this tree. The line is\n" - "analyzed and, depending on its contents, the document may be altered\n" - "in one or more of the following ways:\n\n" -- "1. One or more open blocks may be closed.\n2. " -- "One or more new blocks may be created as children of the\n last open block.\n" -- "3. Text may be added to the last (deepest) open block remaining\n" -- " on the tree.\n\n" +- "1. One or more open blocks may be closed.\n" +- "2. One or more new blocks may be created as children of the\n " +- "last open block.\n" +- "3. Text may be added to the last (deepest) open block remaining\n " +- "on the tree.\n\n" - "Once a line has been incorporated into the tree in this way,\n" - "it can be discarded, so input can be read in a stream.\n\n" - "For each line, we follow this procedure:\n\n" -- "1. First we iterate through the open blocks, starting with the\n" +- "1. " +- "First we iterate through the open blocks, starting with the\n" - "root document, and descending through last children down to the last\n" - "open block. Each block imposes a condition that the line must satisfy\n" - "if the block is to remain open. For example, a block quote requires a\n" -- "`>`" -- " character. A paragraph requires a non-blank line.\n" +- "`>` character. A paragraph requires a non-blank line.\n" - "In this phase we may match all or just some of the open\n" - "blocks. " - "But we cannot close unmatched blocks yet, because we may have a\n[" - "lazy continuation line].\n\n" -- "2. Next, after consuming the continuation markers for existing\n" -- "blocks, we look for new block starts (e.g. `>` " -- "for a block quote).\n" +- "2. " +- "Next, after consuming the continuation markers for existing\n" +- "blocks, we look for new block starts (e.g. `>`" +- " for a block quote).\n" - "If we encounter a new block start, we close any blocks unmatched\n" - "in step 1 before creating the new block as a child of the last\n" - "matched container block.\n\n" -- "3. Finally, we look at the remainder of the line (after block\n" -- "markers like `>`, list markers, and indentation have been consumed" -- ").\nThis is text that can be incorporated into the last open\n" +- "3. " +- "Finally, we look at the remainder of the line (after block\nmarkers like " +- "`>`, list markers, and indentation have been consumed).\n" +- "This is text that can be incorporated into the last open\n" - "block (a paragraph, code block, heading, or raw HTML)" - ".\n\n" - "Setext headings are formed when we see a line of a paragraph\n" @@ -8069,8 +8113,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "one or more reference link definitions. Any remainder becomes a\nnormal paragraph.\n\n" - "We can see how this works by considering how the tree above is\n" - "generated by four lines of Markdown:\n\n" -- "``` markdown\n> Lorem ipsum dolor\n" -- "sit amet.\n" +- "``` markdown\n" +- "> Lorem ipsum dolor\nsit amet.\n" - "> - Qui *quodsi iracundia*\n" - "> - aliquando id\n```\n\n" - "At the outset, our document model is just\n" @@ -8078,41 +8122,42 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\nThe first line of our text,\n" - "\n``` markdown\n> Lorem ipsum dolor\n```" - "\n\ncauses a `block_quote` block to be created as a child of our\n" -- "open `document` block, and a `paragraph`" -- " block as a child of\nthe `block_quote`" -- ". Then the text is added to the last open\nblock, the `paragraph`" -- ":\n\n" -- "``` tree\n-> document\n -> block_quote\n -> paragraph\n" +- "open `document` block, and a `paragraph` block as a child of\n" +- "the `block_quote`. Then the text is added to the last open\n" +- "block, the `paragraph`:\n\n" +- "``` tree\n" +- "-> document\n -> block_quote\n -> paragraph\n" - " \"Lorem ipsum dolor\"\n```\n\nThe next line,\n" - "\n``` markdown\nsit amet.\n```" -- "\n\n" -- "is a \"lazy continuation\" of the open `paragraph`, so it gets " -- "added\nto the paragraph's text:\n\n" -- "``` tree\n-> document\n -> block_quote\n -> paragraph\n" +- "\n\nis a \"lazy continuation\" of the open `paragraph`" +- ", so it gets added\nto the paragraph's text:\n\n" +- "``` tree\n" +- "-> document\n -> block_quote\n -> paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n" - "```\n\nThe third line,\n" - "\n``` markdown\n" - "> - Qui *quodsi iracundia*\n" - "```\n\n" -- "causes the `paragraph` block to be closed, and a new `list` " -- "block\nopened as a child of the `block_quote`. A " +- "causes the `paragraph` block to be closed, and a new `list`" +- " block\nopened as a child of the `block_quote`. A " - "`list_item` is also\nadded as a child of the `list`" - ", and a `paragraph` as a child of\nthe `list_item`" - ". The text is then added to the new `paragraph`:\n\n" -- "``` tree\n-> document\n -> block_quote\n paragraph\n" +- "``` tree\n" +- "-> document\n -> block_quote\n paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n" - " -> list (type=bullet tight=true bullet_char=-" - ")\n -> list_item\n -> paragraph\n" - " \"Qui *quodsi iracundia*\"\n" - "```\n\nThe fourth line,\n" - "\n``` markdown\n> - aliquando id\n```" -- "\n\n" -- "causes the `list_item` (and its child the `paragraph`) " -- "to be closed,\nand a new `list_item`" -- " opened up as child of the `list`. A `paragraph`" -- "\nis added as a child of the new `list_item`" +- "\n\ncauses the `list_item` (and its child the `paragraph`" +- ") to be closed,\nand a new `list_item`" +- " opened up as child of the `list`. A `paragraph`\n" +- "is added as a child of the new `list_item`" - ", to contain the text.\nWe thus obtain the final tree:\n\n" -- "``` tree\n-> document\n -> block_quote\n paragraph\n" +- "``` tree\n" +- "-> document\n -> block_quote\n paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n" - " -> list (type=bullet tight=true bullet_char=-" - ")\n list_item\n paragraph\n" @@ -8125,16 +8170,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "string contents of paragraphs and headings as inlines. At this\n" - "point we have seen all the link reference definitions, so we can\n" - "resolve reference links as we go.\n\n" -- "``` tree\ndocument\n block_quote\n paragraph\n" -- " str \"Lorem ipsum dolor\"\n softbreak\n" -- " str \"sit amet.\"\n" +- "``` tree\n" +- "document\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"\n" +- " softbreak\n str \"sit amet.\"\n" - " list (type=bullet tight=true bullet_char=-)\n" - " list_item\n paragraph\n str \"Qui \"\n emph\n" - " str \"quodsi iracundia\"\n list_item\n paragraph\n" - " str \"aliquando id\"\n```\n\n" - "Notice how the [line ending] in the first paragraph has\n" -- "been parsed as a `softbreak`, and the asterisks " -- "in the first list item\nhave become an `emph`.\n\n" +- "been parsed as a `softbreak`" +- ", and the asterisks in the first list item\nhave become an " +- "`emph`.\n\n" - "### An algorithm for parsing nested emphasis and links\n\n" - "By far the trickiest part of inline parsing is handling emphasis,\n" - "strong emphasis, links, and images. This is done using the following\nalgorithm.\n\n" @@ -8142,47 +8188,53 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- a run of `*` or `_` characters, or\n" - "- a `[` or `![`\n\n" - "we insert a text node with these symbols as its literal content, and we\n" -- "add a pointer to this text node to the [delimiter stack]" -- "(@).\n\n" +- "add a pointer to this text node to the " +- "[delimiter stack](@).\n\n" - "The [delimiter stack] is a doubly linked list. Each\n" - "element contains a pointer to a text node, plus information about\n\n" -- "- the type of delimiter (`[`, `![" -- "`, `*`, `_`)\n" +- "- the type of delimiter (`[`, `![`" +- ", `*`, `_`)\n" - "- the number of delimiters,\n" -- "- whether the delimiter is \"active\" (all are active to start" -- "), and\n" -- "- whether the delimiter is a potential opener, a potential closer,\n" -- " or both (which depends on what sort of characters precede\n " +- "- whether the delimiter is \"active\"" +- " (all are active to start), and\n" +- "- whether the delimiter is a potential opener, a potential closer,\n " +- "or both (which depends on what sort of characters precede\n " - "and follow the delimiters).\n\n" -- "When we hit a `]` character, we call the *look for link " -- "or image*\nprocedure (see below).\n\n" +- "When we hit a `]` character, we call the " +- "*look for link or image*\nprocedure (see below).\n\n" - "When we hit the end of the input, we call the *process emphasis*\n" -- "procedure (see below), with `stack_bottom` = NULL" -- ".\n\n" +- "procedure (see below), with `stack_bottom`" +- " = NULL.\n\n" - "#### *look for link or image*\n\n" - "Starting at the top of the delimiter stack, we look backwards\n" - "through the stack for an opening `[` or `![`" - " delimiter.\n\n" - "- If we don't find one, we return a literal text node `" - "]`.\n\n" -- "- If we do find one, but it's not *active*, " -- "we remove the inactive\n " +- "- " +- "If we do find one, but it's not *active*" +- ", we remove the inactive\n " - "delimiter from the stack, and return a literal text node `]`" - ".\n\n" -- "- If we find one and it's active, then we parse ahead " -- "to see if\n " +- "- " +- "If we find one and it'" +- "s active, then we parse ahead to see if\n " - "we have an inline link/image, reference link/image, collapsed reference\n " - "link/image, or shortcut reference link/image.\n\n " -- "+ If we don't, then we remove the opening delimiter from " -- "the\n delimiter stack and return a literal text node `]`.\n\n " -- "+ If we do, then\n\n" -- " * We return a link or image node whose children are the inlines\n" -- " after the text node pointed to by the opening delimiter.\n\n " -- "* We run *process emphasis* on these inlines, with the `[" -- "` opener\n as `stack_bottom`.\n\n " +- "+ " +- "If we don't, then we remove the opening delimiter from the\n " +- "delimiter stack and return a literal text node `]`.\n\n " +- "+ If we do, then\n\n " +- "* " +- "We return a link or image node whose children are the inlines\n " +- "after the text node pointed to by the opening delimiter.\n\n " +- "* " +- "We run *process emphasis* on these inlines, with the `[`" +- " opener\n as `stack_bottom`.\n\n " - "* We remove the opening delimiter.\n\n" -- " * If we have a link (and not an image), we also set " -- "all\n `[` delimiters before the opening delimiter to " +- " * " +- "If we have a link (and not an image), we also set all\n " +- "`[` delimiters before the opening delimiter to " - "*inactive*. (This\n will prevent us from getting links within links.)\n\n" - "#### *process emphasis*\n\n" - "Parameter `stack_bottom` sets a lower bound to how far we\n" @@ -8190,45 +8242,55 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ". If it is NULL, we can\n" - "go all the way to the bottom. Otherwise, we stop before\nvisiting " - "`stack_bottom`.\n\n" -- "Let `current_position` point to the element on the [delimiter " -- "stack]\njust above `stack_bottom` (or the first element if " -- "`stack_bottom`\nis NULL).\n\n" +- "Let `current_position` point to the element on the [" +- "delimiter stack]\njust above `stack_bottom`" +- " (or the first element if `stack_bottom`\n" +- "is NULL).\n\n" - "We keep track of the `openers_bottom` for each delimiter\n" -- "type (`*`, `_`), indexed to the length " -- "of the closing delimiter run\n" +- "type (`*`, `_`" +- "), indexed to the length of the closing delimiter run\n" - "(modulo 3) and to whether the closing delimiter can also " - "be an\nopener. Initialize this to `stack_bottom`.\n\n" - "Then we repeat the following until we run out of potential\nclosers:\n\n" -- "- Move `current_position` forward in the delimiter stack (if " -- "needed)\n until we find the first potential closer with delimiter `*`" -- " or `_`" -- ".\n (This will be the potential closer closest\n to the beginning of the input " -- "-- the first one in parse order.)\n\n" -- "- Now, look back in the stack (staying above `stack_bottom` " -- "and\n the `openers_bottom`" -- " for this delimiter type) for the\n first matching potential opener (\"matching" -- "\" means same delimiter).\n\n- If one is found:\n\n " -- "+ Figure out whether we have emphasis or strong emphasis:\n" -- " if both closer and opener spans have length >= 2, we have\n " +- "- " +- "Move `current_position`" +- " forward in the delimiter stack (if needed)\n " +- "until we find the first potential closer with delimiter `*` or " +- "`_`.\n (This will be the potential closer closest\n " +- to the beginning of the input -- +- " the first one in parse order.)\n\n" +- "- " +- "Now, look back in the stack (staying above `stack_bottom` and\n " +- "the `openers_bottom` for this delimiter type) for the\n " +- "first matching potential opener (\"matching\" means same delimiter).\n\n" +- "- If one is found:\n\n " +- "+ " +- "Figure out whether we have emphasis or strong emphasis:\n " +- "if both closer and opener spans have length >= 2, we have\n " - "strong, otherwise regular.\n\n " -- "+ Insert an emph or strong emph node accordingly, after\n" -- " the text node corresponding to the opener.\n\n " -- "+ Remove any delimiters between the opener and closer from\n" -- " the delimiter stack.\n\n " -- "+ Remove 1 (for regular emph) or 2 (for strong " +- "+ " +- "Insert an emph or strong emph node accordingly, after\n " +- "the text node corresponding to the opener.\n\n " +- "+ " +- "Remove any delimiters between the opener and closer from\n " +- "the delimiter stack.\n\n " +- "+ " +- "Remove 1 (for regular emph) or 2 (for strong " - "emph) delimiters\n " - "from the opening and closing text nodes. If they become empty\n " - "as a result, remove them and remove the corresponding element\n " - "of the delimiter stack. If the closing node is removed, reset\n " - "`current_position` to the next element in the stack.\n\n" -- "- If none is found:\n\n" -- " + Set `openers_bottom` to the element before `current_position" -- "`.\n " +- "- If none is found:\n\n " +- "+ " +- "Set `openers_bottom` to the element before `current_position`" +- ".\n " - "(We know that there are no openers for this kind of closer up to " - "and\n including this point, so this puts a lower bound on future searches.)\n\n " -- "+ If the closer at `current_position` is not a potential opener,\n" -- " remove it from the delimiter stack (since we know it can't\n " +- "+ " +- "If the closer at `current_position` is not a potential opener,\n " +- "remove it from the delimiter stack (since we know it can't\n " - "be a closer either).\n\n " - "+ Advance `current_position` to the next element in the stack.\n\n" -- "After we're done, we remove all delimiters above `" -- "stack_bottom` from the\ndelimiter stack.\n" +- "After we're done, we remove all delimiters above " +- "`stack_bottom` from the\ndelimiter stack.\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-2.snap index 6e20ee4..32193e7 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md-2.snap @@ -6,30 +6,35 @@ input_file: tests/inputs/markdown/github_flavored.md - "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n\n" - "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n\n" - "# Lists\n\n" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n" +- "```\n" +- "1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n" - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n" - "⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n" - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n\n" - "# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------\n\n" - "# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------\n\n" - "# Links\n\n" -- "```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n" -- "[link text itself]: http://www.reddit.com\n```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n" +- "```\n" +- "[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n" +- "```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n" - "\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------\n\n" - "# Images\n\n" -- "```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n" +- "```\n" +- "Here's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n" - "[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n" - "\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n\n# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n\n" - "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```" - "\n\n```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```" - "\n\n```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```" -- "\n\n```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n" +- "\n\n```php\n" +- "require_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n" - " $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n\n" - "# Tables\n\n" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n" -- "| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n" -- "| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n" -- "\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n\n" +- "```\n" +- "Colons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n" +- "| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n" +- "\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n" +- "\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n\n" - "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n" - "\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n\n# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n```\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n------\n\n" - "# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap index 8f15614..9c604aa 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown@github_flavored.md.snap @@ -4,9 +4,9 @@ expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- - "# Headers\n\n" -- "```\n# h1 Heading 8-)\n" -- "## h2 Heading\n### h3 Heading\n" -- "#### h4 Heading\n" +- "```\n" +- "# h1 Heading 8-)\n## h2 Heading\n" +- "### h3 Heading\n#### h4 Heading\n" - "##### h5 Heading\n" - "###### h6 Heading\n\n" - "Alternatively, for H1 and H2, an underline-ish style" @@ -38,15 +38,16 @@ input_file: tests/inputs/markdown/github_flavored.md - "_underscores_.\n\n" - "Strong emphasis, aka bold, with **asterisks** or " - "__underscores__.\n\n" -- Combined emphasis with **asterisks and _underscores_* -- "*.\n\n" -- Strikethrough uses two tildes. ~~Scratch this. -- "~~\n\n**This is bold text**\n" -- "\n__This is bold text__\n\n*This is italic text*\n" -- "\n_This is italic text_\n\n~~Strikethrough~~\n\n" -- "------\n\n" +- "Combined emphasis with " +- "**asterisks and _underscores_**.\n\n" +- "Strikethrough uses two tildes. " +- "~~Scratch this.~~\n\n" +- "**This is bold text**\n\n__This is bold text__\n" +- "\n*This is italic text*\n\n_This is italic text_\n" +- "\n~~Strikethrough~~\n\n------\n\n" - "# Lists\n\n" -- "```\n1. First ordered list item\n2. Another item\n" +- "```\n" +- "1. First ordered list item\n2. Another item\n" - "⋅⋅* Unordered sub-list.\n1. " - "Actual numbers don't matter, just that it's a number\n" - "⋅⋅1. Ordered sub-list\n4. And another item.\n\n" @@ -77,14 +78,14 @@ input_file: tests/inputs/markdown/github_flavored.md - "+ Very easy!\n```\n\n" - "1. First ordered list item\n" - "2. Another item\n⋅⋅* Unordered sub-list.\n" -- "1. " -- "Actual numbers don't matter, just that it's a number\n" -- "⋅⋅1. Ordered sub-list\n4. And another item.\n\n" +- "1. Actual numbers don't matter, just that it'" +- "s a number\n⋅⋅1. Ordered sub-list\n" +- "4. And another item.\n\n" - ⋅⋅⋅You can have properly indented paragraphs within list items - ". " - "Notice the blank line above, and the leading spaces (at least one, " -- "but we'll use three here to also align the raw Markdown)" -- ".\n\n" +- "but we'" +- "ll use three here to also align the raw Markdown).\n\n" - "⋅⋅⋅To have a line break without a paragraph, you will need " - "to use two trailing spaces.⋅⋅\n" - "⋅⋅⋅Note that this line is separate, but within the same paragraph" @@ -93,21 +94,24 @@ input_file: tests/inputs/markdown/github_flavored.md - "where trailing spaces are not required.)\n\n" - "* Unordered list can use asterisks\n- Or minuses\n" - "+ Or pluses\n\n" -- "1. Make my changes\n 1. Fix bug\n 2. " -- "Improve formatting\n - Make the headings bigger\n" +- 1. Make my changes +- "\n 1. Fix bug\n" +- " 2. Improve formatting\n - Make the headings bigger\n" - "2. Push my commits to GitHub\n" -- "3. Open a pull request\n * Describe my changes\n" +- "3. Open a pull request\n " +- " * Describe my changes\n" - " * Mention all the members of my team\n * Ask for feedback\n\n" - "+ Create a list by starting a line with `+`, `-" - "`, or `*`\n" -- "+ Sub-lists are made by indenting 2 spaces:\n" -- " - Marker character change forces new list start:\n" -- " * Ac tristique libero volutpat at\n " +- "+ Sub-lists are made by indenting 2 spaces:\n " +- "- Marker character change forces new list start:" +- "\n * Ac tristique libero volutpat at\n " - "+ Facilisis in pretium nisl aliquet\n " - "- Nulla volutpat aliquam velit\n" - "+ Very easy!\n\n------\n\n" - "# Task lists\n\n" -- "```\n- [x] Finish my changes\n" +- "```\n" +- "- [x] Finish my changes\n" - "- [ ] Push my commits to GitHub\n" - "- [ ] Open a pull request\n" - "- [x] @mentions, #refs, [links]()" @@ -119,12 +123,14 @@ input_file: tests/inputs/markdown/github_flavored.md - "- [x] Finish my changes\n" - "- [ ] Push my commits to GitHub\n" - "- [ ] Open a pull request\n" -- "- [x] @mentions, #refs, [links]()" -- ", **formatting**, and tags supported\n" -- "- [x] list syntax required (any unordered or ordered list supported" -- ")\n- [ ] this is a complete item\n" -- "- [ ] this is an incomplete item\n\n------\n\n" +- "- " +- "[x] @mentions, #refs, [links](), " +- "**formatting**, and tags" +- " supported\n" +- "- " +- "[x] list syntax required (any unordered or ordered list supported)\n" +- "- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n" +- "------\n\n" - "# Ignoring Markdown formatting\n\n" - "You can tell GitHub to ignore (or escape) Markdown " - "formatting by using \\ before the Markdown character.\n\n" @@ -171,8 +177,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "\n" - URLs and URLs in angle brackets will automatically get turned into links - ".\nhttp://www.example.com or " -- "" -- " and sometimes\n" +- " and sometimes\n" - "example.com (but not on Github, for example).\n\n" - "Some text to show that the reference links can follow later.\n" - "\n" @@ -206,24 +211,27 @@ input_file: tests/inputs/markdown/github_flavored.md - octodex.github.com/images/ - "dojocat.jpg \"The Dojocat\"\n```\n\n" - "Here's our logo (hover to see the title text):\n" -- "\nInline-style:\n![" -- "alt text](https://github.com/" -- adam-p/markdown-here/raw/master/src -- "/common/images/icon48.png \"Logo Title Text 1" -- "\")\n\nReference-style:\n![alt text][logo]\n" +- "\nInline-style:\n" +- "![" +- alt text +- "](https://github.com/adam-p" +- /markdown-here/raw/master/src/common/images +- "/icon48.png \"Logo Title Text 1\")\n\n" +- "Reference-style:\n![alt text][logo]\n" - "\n" - "[logo]: https://github.com/adam" - "-p/markdown-here/raw/master/src/common" - "/images/icon48.png \"Logo Title Text 2\"\n\n" - "![" -- "Minion](https://" -- octodex.github.com/images/ -- "minion.png)\n" +- Minion +- "](https://octodex.github.com" +- "/images/minion.png)\n" - "![" -- "Stormtroopocat](https://" -- octodex.github.com/images/ -- "stormtroopocat.jpg \"The Stormtroopocat" -- "\")\n\nLike links, Images also have a footnote style syntax\n" +- Stormtroopocat +- "](https://octodex.github.com" +- "/images/stormtroopocat.jpg \"The " +- "Stormtroopocat\")\n\n" +- "Like links, Images also have a footnote style syntax\n" - "\n![Alt text][id]\n" - "\nWith a reference later in the document defining the URL location:\n" - "\n" @@ -231,9 +239,11 @@ input_file: tests/inputs/markdown/github_flavored.md - octodex.github.com/images/ - "dojocat.jpg \"The Dojocat\"\n\n" - "------\n\n" -- "# [Footnotes](https://github.com/" +- "# " +- "[Footnotes](https://github.com/" - "markdown-it/markdown-it-footnote)\n\n" -- "```\nFootnote 1 link[^first].\n\n" +- "```\n" +- "Footnote 1 link[^first].\n\n" - "Footnote 2 link[^second].\n\n" - "Inline footnote^[Text of inline footnote] definition.\n\n" - "Duplicated footnote reference[^second].\n\n" @@ -251,7 +261,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "Inline `code` has `back-ticks around` it.\n" - "```\n\n" - "Inline `code` has `back-ticks around` it.\n" -- "\n```c#\nusing System.IO.Compression;\n\n" +- "\n```c#\n" +- "using System.IO.Compression;\n\n" - "#pragma warning disable 414, 3021\n\n" - "namespace MyApplication\n{\n" - " [Obsolete(\"...\")]\n" @@ -262,7 +273,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "\");\n" - " return new List(new int[] { 1, " - "2, 3 })\n }\n }\n}\n```\n\n" -- "```css\n@font-face {\n" +- "```css\n" +- "@font-face {\n" - " font-family: Chunkfive; src: url('" - "Chunkfive.otf');\n}\n\n" - "body, .usertext {\n" @@ -313,7 +325,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "__halt_compiler () ; datahere\ndatahere\ndatahere */\n" - "datahere\n```\n\n------\n\n" - "# Tables\n\n" -- "```\nColons can be used to align columns.\n\n" +- "```\n" +- "Colons can be used to align columns.\n\n" - "| Tables | Are | Cool |\n" - "| ------------- |:" - "-------------:| -" @@ -345,7 +358,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "| Name | Character |\n| --- | --- |\n" - "| Backtick | ` |\n| Pipe | \\| |\n```\n\n" - "Colons can be used to align columns.\n\n" -- "| Tables | Are | Cool |\n" +- "| Tables | Are | Cool " +- "|\n" - "| ------------- |:" - "-------------:| -" - "----:|\n" @@ -353,15 +367,16 @@ input_file: tests/inputs/markdown/github_flavored.md - "| col 2 is | centered | $12 |\n" - "| zebra stripes | are neat | $1 |\n" - "\nThere must be at least 3 dashes separating each header cell.\n" -- "The outer pipes (|) are optional, and you don't need to " -- "make the\n" +- "The outer pipes (|) are optional, and you don'" +- "t need to make the\n" - "raw Markdown line up prettily. " - "You can also use inline Markdown.\n\n" -- "Markdown | Less | Pretty\n" -- "--- | --- | ---\n" +- Markdown | Less | Pretty +- "\n--- | --- | ---\n" - "*Still* | `renders` | **nicely**\n" - "1 | 2 | 3\n\n" -- "| First Header | Second Header |\n" +- "| First Header | Second Header " +- "|\n" - "| ------------- | -" - "------------ |\n" - "| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n" @@ -370,9 +385,10 @@ input_file: tests/inputs/markdown/github_flavored.md - "| git diff | Show file differences that haven't been staged |\n\n" - "| Command | Description |\n| --- | --- |\n" - "| `git status` | List all *new or modified* files |\n" -- "| `git diff` | Show file differences that **" -- "haven't been** staged |\n\n" -- "| Left-aligned | Center-aligned | Right-aligned |\n" +- "| `git diff` |" +- " Show file differences that **haven't been** staged |\n\n" +- "| Left-aligned | Center-aligned | Right-aligned " +- "|\n" - "| :--- | :---: | ---: " - "|\n| git status | git status | git status |\n" - "| git diff | git diff | git diff |\n\n" @@ -393,19 +409,23 @@ input_file: tests/inputs/markdown/github_flavored.md - ">> ...by using additional greater-than signs right next to each " - "other...\n> > > ...or with spaces between arrows.\n" - "```\n\n" -- "> Blockquotes are very handy in email to emulate reply text.\n" -- "> This line is part of the same quote.\n\nQuote break.\n\n" -- "> This is a very long line that will still be quoted properly when it wraps" -- ". Oh boy let'" +- "> " +- "Blockquotes are very handy in email to emulate reply text.\n> " +- "This line is part of the same quote.\n\nQuote break.\n\n" +- "> " +- "This is a very long line that will still be quoted properly when it wraps. " +- "Oh boy let'" - "s keep writing to make sure this is long enough to actually wrap for everyone. " -- "Oh, you can *put* **Markdown** into a " -- "blockquote.\n\n" -- "> Blockquotes can also be nested...\n" -- ">> ...by using additional greater-than signs right next to each " -- "other...\n> > > ...or with spaces between arrows.\n\n" +- "Oh, you can *put* **Markdown**" +- " into a blockquote.\n\n" +- "> Blockquotes can also be nested...\n>" +- "> " +- "...by using additional greater-than signs right next to each other" +- "...\n> > > ...or with spaces between arrows.\n\n" - "------\n\n" - "# Inline HTML\n\n" -- "```\n
    \n" +- "```\n" +- "
    \n" - "
    Definition list
    \n" - "
    Is something people use sometimes.
    \n\n" - "
    Markdown in HTML
    \n" @@ -420,8 +440,9 @@ input_file: tests/inputs/markdown/github_flavored.md - Use HTML tags.\n
    \n\n------\n\n" - "# Horizontal Rules\n\n" -- "```\nThree or more...\n\n---\n\nHyphens\n\n" -- "***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\n" +- "```\n" +- "Three or more...\n\n---\n\nHyphens\n\n***\n\n" +- "Asterisks\n\n___\n\nUnderscores\n```\n\n" - "Three or more...\n\n---\n\nHyphens\n\n***\n" - "\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n" - "# YouTube Videos\n\n" @@ -448,7 +469,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "jpg\" alt=\"IMAGE ALT TEXT " - "HERE\" width=\"240\" height=\"180\" border=" - "\"10\">\n\n" -- "\n```\n[![" +- "\n```\n" +- "[![" - "IMAGE ALT TEXT HERE](http:/" - /img.youtube.com/vi/ - YOUTUBE_VIDEO_ID_HERE/0. @@ -457,10 +479,10 @@ input_file: tests/inputs/markdown/github_flavored.md - "v=YOUTUBE_VIDEO_ID_HERE)\n" - "```\n\n" - "[![" -- "IMAGE ALT TEXT HERE](https:/" -- /upload.wikimedia.org/wikipedia/ -- commons/thumb/e/ef/YouTube_logo_2015. -- svg/1200px-YouTube_logo_2015. -- "svg.png)](https://" -- www.youtube.com/watch? +- IMAGE ALT TEXT HERE +- "](https://upload.wikimedia.org/" +- wikipedia/commons/thumb/e/ef/ +- YouTube_logo_2015.svg/1200px- +- "YouTube_logo_2015.svg.png)](https" +- "://www.youtube.com/watch?" - "v=ciawICBvQoE)\n" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md-2.snap index f33309a..643c68a 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md-2.snap @@ -10,11 +10,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The point can be illustrated by comparing a sample of\n[AsciiDoc](https://asciidoc.org/) with\nan equivalent sample of Markdown. Here is a sample of\nAsciiDoc from the AsciiDoc manual:\n\n```\n1. List item one.\n+\nList item one continued with a second paragraph followed by an\nIndented block.\n+\n.................\n$ ls *.sh\n$ mv *.sh ~/tmp\n.................\n+\nList item continued with a third paragraph.\n\n2. List item two continued with an open block.\n+\n--\nThis paragraph is part of the preceding list item.\n\na. This list is nested and does not require explicit item\ncontinuation.\n+\nThis paragraph is part of the preceding list item.\n\nb. List item b.\n\nThis paragraph belongs to item two of the outer list.\n--\n```\n\nAnd here is the equivalent in Markdown:" - "```\n1. List item one.\n\n List item one continued with a second paragraph followed by an\n Indented block.\n\n $ ls *.sh\n $ mv *.sh ~/tmp\n\n List item continued with a third paragraph.\n\n2. List item two continued with an open block.\n\n This paragraph is part of the preceding list item.\n\n 1. This list is nested and does not require explicit item continuation.\n\n This paragraph is part of the preceding list item.\n\n 2. List item b.\n\n This paragraph belongs to item two of the outer list.\n```\n\nThe AsciiDoc version is, arguably, easier to write. You don't need\nto worry about indentation. But the Markdown version is much easier\nto read. The nesting of list items is apparent to the eye in the\nsource, not just in the processed document." - "## Why is a spec needed?\n\nJohn Gruber's [canonical description of Markdown's\nsyntax](https://daringfireball.net/projects/markdown/syntax)\ndoes not specify the syntax unambiguously. Here are some examples of\nquestions it does not answer:" -- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See [this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)\n\n2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank" -- " lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```\n\n4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)" -- "5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```\n\n7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```" -- "9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```\n\n11. Can list items include section headings? (`Markdown.pl` does not\n allow this, but does allow blockquotes to include headings.)\n\n ``` markdown\n - # Heading\n ```\n\n12. Can list items be empty?\n\n ``` markdown\n * a\n *\n * b\n ```\n\n13. Can link references be defined inside block quotes or list items?\n\n ``` markdown\n > Blockquote [foo].\n >\n > [foo]: /url\n ```" -- "14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n ``` markdown\n [foo]: /url1\n [foo]: /url2\n\n [foo][]\n ```\n\nIn the absence of a spec, early implementers consulted `Markdown.pl`\nto resolve these ambiguities. But `Markdown.pl` was quite buggy, and\ngave manifestly bad results in many cases, so it was not a\nsatisfactory replacement for a spec.\n\nBecause there is no unambiguous spec, implementations have diverged\nconsiderably. As a result, users are often surprised to find that\na document that renders one way on one system (say, a GitHub wiki)\nrenders differently on another (say, converting to docbook using\npandoc). To make matters worse, because nothing in Markdown counts\nas a \"syntax error,\" the divergence often isn't discovered right away." +- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See [this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)" +- "2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank\n lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```" +- "4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)\n\n5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```" +- "7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```\n\n9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```" +- "11. Can list items include section headings? (`Markdown.pl` does not\n allow this, but does allow blockquotes to include headings.)\n\n ``` markdown\n - # Heading\n ```\n\n12. Can list items be empty?\n\n ``` markdown\n * a\n *\n * b\n ```\n\n13. Can link references be defined inside block quotes or list items?\n\n ``` markdown\n > Blockquote [foo].\n >\n > [foo]: /url\n ```\n\n14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n ``` markdown\n [foo]: /url1\n [foo]: /url2\n\n [foo][]\n ```" +- "In the absence of a spec, early implementers consulted `Markdown.pl`\nto resolve these ambiguities. But `Markdown.pl` was quite buggy, and\ngave manifestly bad results in many cases, so it was not a\nsatisfactory replacement for a spec.\n\nBecause there is no unambiguous spec, implementations have diverged\nconsiderably. As a result, users are often surprised to find that\na document that renders one way on one system (say, a GitHub wiki)\nrenders differently on another (say, converting to docbook using\npandoc). To make matters worse, because nothing in Markdown counts\nas a \"syntax error,\" the divergence often isn't discovered right away." - "## About this document\n\nThis document attempts to specify Markdown syntax unambiguously.\nIt contains many examples with side-by-side Markdown and\nHTML. These are intended to double as conformance tests. An\naccompanying script `spec_tests.py` can be used to run the tests\nagainst any Markdown program:\n\n python test/spec_tests.py --spec spec.txt --program PROGRAM\n\nSince this document describes how Markdown is to be parsed into\nan abstract syntax tree, it would have made sense to use an abstract\nrepresentation of the syntax tree instead of HTML. But HTML is capable\nof representing the structural distinctions we need to make, and the\nchoice of HTML for the tests makes it possible to run the tests against\nan implementation without writing an abstract syntax tree renderer." - "Note that not every feature of the HTML samples is mandated by\nthe spec. For example, the spec says what counts as a link\ndestination, but it doesn't mandate that non-ASCII characters in\nthe URL be percent-encoded. To use the automatic tests,\nimplementers will need to provide a renderer that conforms to\nthe expectations of the spec examples (percent-encoding\nnon-ASCII characters in URLs). But a conforming implementation\ncan use a different renderer and may choose not to\npercent-encode non-ASCII characters in URLs.\n\nThis document is generated from a text file, `spec.txt`, written\nin Markdown with a small extension for the side-by-side tests.\nThe script `tools/makespec.py` can be used to convert `spec.txt` into\nHTML or CommonMark (which can then be converted into other formats).\n\nIn the examples, the `→` character is used to represent tabs." - "# Preliminaries" @@ -117,10 +118,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n``` aa ```\nfoo\n.\n

    aa\nfoo

    \n````````````````````````````````\n\n\n[Info strings] for tilde code blocks can contain backticks and tildes:\n\n```````````````````````````````` example\n~~~ aa ``` ~~~\nfoo\n~~~\n.\n
    foo\n
    \n````````````````````````````````\n\n\nClosing code fences cannot have [info strings]:" - "```````````````````````````````` example\n```\n``` aaa\n```\n.\n
    ``` aaa\n
    \n````````````````````````````````" - "## HTML blocks\n\nAn [HTML block](@) is a group of lines that is treated\nas raw HTML (and will not be escaped in HTML output).\n\nThere are seven kinds of [HTML block], which can be defined by their\nstart and end conditions. The block begins with a line that meets a\n[start condition](@) (after up to three optional spaces of indentation).\nIt ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks) containing the current HTML\nblock, if no line is encountered that meets the [end condition]. If\nthe first line meets both the [start condition] and the [end\ncondition], the block will contain just that line." -- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n5. **Start condition:** line begins with the string" -- "``." -- "6. **Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:** line is followed by a [blank line].\n\n7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line]." +- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``." +- "5. **Start condition:** line begins with the string\n``." +- "6." +- "**Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:** line is followed by a [blank line].\n\n7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line]." - "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\nblock** that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state.\n\nFor instance, `
    ` within an HTML block started by `` will not affect\nthe parser state; as the HTML block was started in by start condition 6, it\nwill end at any blank line. This can be surprising:"
     - "```````````````````````````````` example\n
    \n
    \n**Hello**,\n\n_world_.\n
    \n
    \n.\n
    \n
    \n**Hello**,\n

    world.\n

    \n
    \n````````````````````````````````\n\nIn this case, the HTML block is terminated by the blank line — the `**Hello**`\ntext remains verbatim — and regular parsing resumes, with a paragraph,\nemphasised `world` and inline and block HTML following." - "All types of [HTML blocks] except type 7 may interrupt\na paragraph. Blocks of type 7 may not interrupt a paragraph.\n(This restriction is intended to prevent unwanted interpretation\nof long tags inside a wrapped paragraph as starting HTML blocks.)\n\nSome simple examples follow. Here are some basic HTML blocks\nof type 6:\n\n```````````````````````````````` example\n\n \n \n \n
    \n hi\n
    \n\nokay.\n.\n\n \n \n \n
    \n hi\n
    \n

    okay.

    \n````````````````````````````````" @@ -193,8 +195,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n>>> foo\n> bar\n>>baz\n.\n
    \n
    \n
    \n

    foo\nbar\nbaz

    \n
    \n
    \n
    \n````````````````````````````````\n\n\nWhen including an indented code block in a block quote,\nremember that the [block quote marker] includes\nboth the `>` and a following space of indentation. So *five spaces* are needed\nafter the `>`:" - "```````````````````````````````` example\n> code\n\n> not code\n.\n
    \n
    code\n
    \n
    \n
    \n

    not code

    \n
    \n````````````````````````````````" - "## List items\n\nA [list marker](@) is a\n[bullet list marker] or an [ordered list marker].\n\nA [bullet list marker](@)\nis a `-`, `+`, or `*` character.\n\nAn [ordered list marker](@)\nis a sequence of 1--9 arabic digits (`0-9`), followed by either a\n`.` character or a `)` character. (The reason for the length\nlimit is that with 10 digits we start seeing integer overflows\nin some browsers.)\n\nThe following rules define [list items]:" -- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n\n 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2." -- "If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\nFor example, let *Ls* be the lines\n\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    \n````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:" +- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:" +- " 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item." +- "For example, let *Ls* be the lines\n\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    \n````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:" - "```````````````````````````````` example\n1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n````````````````````````````````\n\n\nThe most important thing to notice is that the position of\nthe text after the list marker determines how much indentation\nis needed in subsequent blocks in the list item. If the list\nmarker takes up two spaces of indentation, and there are three spaces between\nthe list marker and the next character other than a space or tab, then blocks\nmust be indented five spaces in order to fall under the list\nitem." - "Here are some examples showing how far content must be indented to be\nput under the list item:\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • one
    • \n
    \n

    two

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n````````````````````````````````" - "```````````````````````````````` example\n - one\n\n two\n.\n
      \n
    • one
    • \n
    \n
     two\n
    \n````````````````````````````````\n\n\n```````````````````````````````` example\n - one\n\n two\n.\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n````````````````````````````````" @@ -283,12 +286,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs." - " - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n - Both left and right-flanking:\n\n ```\n abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```" - "(The idea of distinguishing left-flanking and right-flanking\ndelimiter runs based on the character before and the character\nafter comes from Roopesh Chander's\n[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\" and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:" -- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)" -- " iff it is part of a [left-flanking delimiter run].\n\n6. A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." +- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." +- "5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n6. A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." - "9. Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are\n multiples of 3.\n\n10. Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing\n delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3." - "11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n12. A literal `_` character cannot occur at the beginning or end of\n `_`-delimited emphasis or `__`-delimited strong emphasis, unless it\n is backslash-escaped.\n\nWhere rules 1--12 above are compatible with multiple parsings,\nthe following principles resolve ambiguity:" -- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example," -- " `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`." +- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`." +- "16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`." - "These rules can be illustrated through a series of examples.\n\nRule 1:\n\n```````````````````````````````` example\n*foo bar*\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na * foo bar*\n.\n

    a * foo bar*

    \n````````````````````````````````" - "This is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na*\"foo\"*\n.\n

    a*"foo"*

    \n````````````````````````````````\n\n\nUnicode nonbreaking spaces count as whitespace, too:\n\n```````````````````````````````` example\n* a *\n.\n

    * a *

    \n````````````````````````````````\n\n\nUnicode symbols count as punctuation, too:" - "```````````````````````````````` example\n*$*alpha.\n\n*£*bravo.\n\n*€*charlie.\n.\n

    *$*alpha.

    \n

    *£*bravo.

    \n

    *€*charlie.

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is permitted:\n\n```````````````````````````````` example\nfoo*bar*\n.\n

    foobar

    \n````````````````````````````````" @@ -490,5 +493,6 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "#### *look for link or image*\n\nStarting at the top of the delimiter stack, we look backwards\nthrough the stack for an opening `[` or `![` delimiter." - "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`.\n\n- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image.\n\n + If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`.\n\n + If we do, then\n\n * We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)" - "#### *process emphasis*\n\nParameter `stack_bottom` sets a lower bound to how far we\ndescend in the [delimiter stack]. If it is NULL, we can\ngo all the way to the bottom. Otherwise, we stop before\nvisiting `stack_bottom`.\n\nLet `current_position` point to the element on the [delimiter stack]\njust above `stack_bottom` (or the first element if `stack_bottom`\nis NULL).\n\nWe keep track of the `openers_bottom` for each delimiter\ntype (`*`, `_`), indexed to the length of the closing delimiter run\n(modulo 3) and to whether the closing delimiter can also be an\nopener. Initialize this to `stack_bottom`.\n\nThen we repeat the following until we run out of potential\nclosers:" -- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. If the closing node is removed, reset" -- " `current_position` to the next element in the stack.\n\n- If none is found:\n\n + Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack." +- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter)." +- "- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:" +- "+ Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack." diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md.snap index c6e1c0c..03ebc32 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@commonmark_spec.md.snap @@ -3,8 +3,9 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/commonmark_spec.md --- -- "---\ntitle: CommonMark Spec" -- "author: John MacFarlane\nversion: '0.31.2'" +- "---" +- "title: CommonMark Spec\nauthor: John MacFarlane" +- "version: '0.31.2'" - "date: '2024-01-28'" - "license: '[CC-BY-SA 4.0](https" - "://creativecommons.org/licenses/by-sa/" @@ -17,8 +18,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - help from Aaron Swartz) and released in 2004 in the form of a - "[syntax description](https://daringfireball.net/projects" - "/markdown/syntax)\nand a Perl script (" -- "`Markdown.pl`" -- ) for converting Markdown to +- "`Markdown.pl`) for converting Markdown to" - "HTML. In the next decade, dozens of implementations were" - developed in many languages. Some extended the original - "Markdown syntax with conventions for footnotes, tables, and" @@ -31,8 +31,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - What distinguishes Markdown from many other lightweight markup - "syntaxes, which are often easier to write, is its readability." - "As Gruber writes:" -- "> The overriding design goal for Markdown's formatting syntax is" -- "> to make it as readable as possible. The idea is that a\n>" +- ">" +- "The overriding design goal for Markdown's formatting syntax is\n>" +- "to make it as readable as possible. The idea is that a\n>" - "Markdown-formatted document should be publishable as-is, as\n>" - "plain text, without looking like it's been marked up with tags\n>" - "or formatting instructions.\n> (" @@ -43,7 +44,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - asciidoc.org/) with - an equivalent sample of Markdown. Here is a sample of - "AsciiDoc from the AsciiDoc manual:" -- "```\n1. List item one.\n+" +- "```" +- "1. List item one.\n+" - "List item one continued with a second paragraph followed by an\nIndented block." - + - "................" @@ -57,7 +59,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This paragraph is part of the preceding list item.\n\nb. List item b." - "This paragraph belongs to item two of the outer list.\n--\n```" - "And here is the equivalent in Markdown:" -- "```\n1. List item one." +- "```" +- 1. List item one. - " List item one continued with a second paragraph followed by an\n Indented block." - $ ls *.sh - $ mv *.sh ~/tmp @@ -73,44 +76,49 @@ input_file: tests/inputs/markdown/commonmark_spec.md - to read. The nesting of list items is apparent to the eye in the - "source, not just in the processed document." - "## Why is a spec needed?" -- "John Gruber's [canonical description of Markdown's" -- "syntax](https://daringfireball.net/projects/" -- markdown/syntax) +- "John Gruber's" +- "[canonical description of Markdown's\nsyntax" +- "](https://daringfireball.net/projects/markdown" +- /syntax) - does not specify the syntax unambiguously. Here are some examples of - "questions it does not answer:" -- 1. How much indentation is needed for a sublist? +- "1." +- How much indentation is needed for a sublist? - The spec says that - "continuation paragraphs need to be indented four spaces, but is" - not fully explicit about sublists. It is natural to think that -- "they, too, must be indented four spaces, but `" -- "Markdown.pl` does\n not require that. This is hardly a \"" -- "corner case,\" and divergences" +- "they, too, must be indented four spaces, but" +- "`Markdown.pl` does\n not require that. This is hardly a" +- "\"corner case,\" and divergences" - between implementations on this issue often lead to surprises for - users in real documents. (See -- "[this comment by John" -- "Gruber](https://web.archive.org/web" -- "/20170611172104/http://" +- "[this comment by John\n Gruber" +- "](https://web.archive.org/web/" +- "20170611172104/http://" - article.gmane.org/ - gmane.text.markdown.general/1997).) -- 2. Is a blank line needed before a block quote or heading? +- "2." +- Is a blank line needed before a block quote or heading? - "Most implementations do not require the blank line. However," - "this can lead to unexpected results in hard-wrapped text, and" - also to ambiguities in parsing (note that some implementations - "put the heading inside the blockquote, while others do not)." - (John Gruber has also spoken -- "[in favor of requiring the blank" -- "lines](https://web.archive.org/web/" +- "[in favor of requiring the blank\n lines" +- "](https://web.archive.org/web/" - "20170611172104/http://" - article.gmane.org/ - gmane.text.markdown.general/2146).) -- 3. Is a blank line needed before an indented code block? -- "(`Markdown.pl` requires it, but this is not mentioned" -- "in the\n documentation, and some implementations do not require it.)" +- "3." +- "Is a blank line needed before an indented code block?\n (" +- "`Markdown.pl`" +- "requires it, but this is not mentioned in the" +- "documentation, and some implementations do not require it.)" - "``` markdown\n paragraph\n code?\n ```" -- 4. What is the exact rule for determining when list items get -- "wrapped in `

    `" -- " tags? Can a list be partially \"loose\" and partially\n \"tight\"" -- "? What should we do with a list like this?" +- "4." +- "What is the exact rule for determining when list items get\n wrapped in" +- "`

    ` tags? Can a list be partially \"loose\"" +- " and partially\n \"tight\"? What should we do with a list like this?" - "``` markdown\n 1. one\n\n 2. two\n 3. three" - " ```\n\n Or this?" - " ``` markdown\n 1. one\n - a\n\n - b\n 2. two" @@ -120,52 +128,57 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "20170611172104/http://" - article.gmane.org/ - gmane.text.markdown.general/2554).) -- 5. Can list markers be indented? +- "5." +- Can list markers be indented? - Can ordered list markers be right-aligned? - "``` markdown\n 8. item 1\n 9. item 2" -- " 10. item 2a\n ```" -- "6. Is this one list with a thematic break in its second item," +- "10. item 2a\n ```" +- "6." +- "Is this one list with a thematic break in its second item," - or two lists separated by a thematic break? - "``` markdown\n * a\n * * * * *\n * b" - "```" -- "7. When list markers change from numbers to bullets, do we have" +- "7." +- "When list markers change from numbers to bullets, do we have" - "two lists or one? (The Markdown syntax description suggests two," - but the perl scripts and many other implementations produce one.) - "``` markdown\n 1. fee\n 2. fie\n - foe" -- " - fum\n ```" -- 8. What are the precedence rules for the markers of inline structure? +- "- fum\n ```" +- "8." +- What are the precedence rules for the markers of inline structure? - "For example, is the following a valid link, or does the code span" - take precedence ? - "``` markdown" - "[a backtick (`)](/url) and [another" - "backtick (`)](/url).\n ```" -- "9. What are the precedence rules for markers of emphasis and strong\n emphasis?" -- "For example, how should the following be parsed?" +- "9." +- What are the precedence rules for markers of emphasis and strong +- "emphasis? For example, how should the following be parsed?" - "``` markdown\n *foo *bar* baz*" - "```" -- 10. What are the precedence rules between block-level and inline-level +- "10." +- What are the precedence rules between block-level and inline-level - "structure? For example, how should the following be parsed?" - "``` markdown" - "- `a long code span can contain a hyphen like this" - " - and it can screw things up`\n ```" -- 11. Can list items include section headings? -- "(`Markdown.pl`" +- "11." +- "Can list items include section headings? (`Markdown.pl`" - does not - "allow this, but does allow blockquotes to include headings.)" - "``` markdown\n - # Heading\n ```" -- "12. Can list items be empty?\n\n ``` markdown\n * a\n *" -- " * b\n ```" +- 12. Can list items be empty? +- " ``` markdown\n * a\n *\n * b\n ```" - 13. Can link references be defined inside block quotes or list items? - " ``` markdown\n > Blockquote [foo].\n >" -- " > [foo]: /url\n ```" +- "> [foo]: /url\n ```" - "14. If there are multiple definitions for the same reference, which takes\n precedence?" - " ``` markdown\n [foo]: /url1" -- " [foo]: /url2\n\n [foo][]" +- "[foo]: /url2\n\n [foo][]" - "```" -- "In the absence of a spec, early implementers consulted `" -- "Markdown.pl`\nto resolve these ambiguities. But" -- "`Markdown.pl`" -- "was quite buggy, and" +- "In the absence of a spec, early implementers consulted" +- "`Markdown.pl`\nto resolve these ambiguities. But" +- "`Markdown.pl` was quite buggy, and" - "gave manifestly bad results in many cases, so it was not a" - satisfactory replacement for a spec. - "Because there is no unambiguous spec, implementations have diverged" @@ -174,14 +187,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - GitHub wiki) - "renders differently on another (say, converting to docbook using" - "pandoc). To make matters worse, because nothing in Markdown counts" -- "as a \"syntax error,\" the divergence often isn't discovered right" -- away. +- "as a \"syntax error,\" the divergence often isn'" +- t discovered right away. - "## About this document" - This document attempts to specify Markdown syntax unambiguously. - It contains many examples with side-by-side Markdown and - HTML. These are intended to double as conformance tests. An -- "accompanying script `spec_tests.py` can be used to run" -- "the tests\nagainst any Markdown program:" +- "accompanying script `spec_tests.py`" +- " can be used to run the tests\nagainst any Markdown program:" - python test/spec_tests.py --spec - spec.txt --program PROGRAM - Since this document describes how Markdown is to be parsed into @@ -200,55 +213,57 @@ input_file: tests/inputs/markdown/commonmark_spec.md - non-ASCII characters in URLs). - "But a conforming implementation\ncan use a different renderer and may choose not to" - percent-encode non-ASCII characters in URLs. -- "This document is generated from a text file, `spec.txt" -- "`, written" +- "This document is generated from a text file," +- "`spec.txt`, written" - in Markdown with a small extension for the side-by-side tests. - "The script `tools/makespec.py` can be used to convert" -- "`spec.txt`" -- into +- "`spec.txt` into" - HTML or CommonMark (which can then be converted into other formats - ). - "In the examples, the `→` character is used to represent tabs." - "# Preliminaries" - "## Characters and lines" - "Any sequence of [characters] is a valid CommonMark\ndocument." -- "A [character](@) is a Unicode code point." -- "Although some\ncode points (for example, combining accents) do not correspond to" +- "A [character](@)" +- is a Unicode code point. Although some +- "code points (for example, combining accents) do not correspond to" - "characters in an intuitive sense, all code points count as characters" - for purposes of this spec. -- This spec does not specify an encoding; it thinks of lines as composed -- "of [characters]" -- rather than bytes. A conforming parser may be limited +- "This spec does not specify an encoding; it thinks of lines as composed\nof" +- "[characters] rather than bytes. A conforming parser may be limited" - to a certain encoding. - "A [line](@) is a sequence of zero or more [characters" - "]\nother than line feed (`U+000A`" -- ") or carriage return (`U+000D`" -- "),\nfollowed by a [line ending] or by the end of file." -- "A [line ending](@) is a line feed (`U+" -- "000A`), a carriage return\n(`U+000D`" +- ") or carriage return (`U+000D`),\nfollowed by a" +- "[line ending] or by the end of file." +- "A [line ending](@) is a line feed (" +- "`U+000A`), a carriage return\n(" +- "`U+000D`" - ") not followed by a line feed, or a carriage return and a" - following line feed. -- "A line containing no characters, or a line containing only spaces" -- "(`U+0020`) or tabs (`U+" -- "0009`), is called a [blank line](@)." +- "A line containing no characters, or a line containing only spaces\n(" +- "`U+0020`) or tabs (" +- "`U+0009`), is called a" +- "[blank line](@)." - "The following definitions of character classes will be used in this spec:" -- "A [Unicode whitespace character](@) is a character in" -- "the Unicode `Zs` general\ncategory, or a tab (" -- "`U+0009`), line feed (`U+000A" -- "`), form feed (`U+000C`" -- "), or\ncarriage return (`U+000D`)." -- "[Unicode whitespace](@) is a sequence of one or" -- "more\n[Unicode whitespace characters]." +- "A [Unicode whitespace character](@)" +- "is a character in the Unicode `Zs` general" +- "category, or a tab (`U+0009`" +- "), line feed (`U+000A`), form feed (" +- "`U+000C`), or\ncarriage return (" +- "`U+000D`)." +- "[Unicode whitespace](@)" +- " is a sequence of one or more\n[Unicode whitespace characters]." - "A [tab](@) is `U+0009`." - "A [space](@) is `U+0020`." -- "An [ASCII control character](@) is a character between `" -- "U+0000–1F` (both\nincluding) or" +- "An [ASCII control character](@) is a character between" +- "`U+0000–1F` (both\nincluding) or" - "`U+007F`." -- "An [ASCII punctuation character](@)" -- "is `!`, `\"`, `#`, `$`" -- ", `%`, `&`, `'`, `(`" -- ", `)`,\n`*`, `+`, `,`" -- ", `-`, `.`, `/`" +- "An [ASCII punctuation character](@)\nis" +- "`!`, `\"`, `#`, `$`," +- "`%`, `&`, `'`, `(`," +- "`)`,\n`*`, `+`, `,`," +- "`-`, `.`, `/`" - " (U+0021–2F), \n`:`," - "`;`, `<`, `=`, `>`," - "`?`, `@`" @@ -257,9 +272,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`` ` `` (U+005B–0060)," - "`{`, `|`, `}`, or `~`" - (U+007B–007E). -- "A [Unicode punctuation character](@) is a" -- "character in the Unicode `P`\n(puncuation) or" -- "`S` (symbol) general categories." +- "A [Unicode punctuation character](@)" +- "is a character in the Unicode `P`" +- "(puncuation) or `S` (symbol) general categories." - "## Tabs" - "Tabs in lines are not expanded to [spaces]. However," - "in contexts where spaces help to define block structure," @@ -314,12 +329,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "Normally the `>` that begins a block quote may be followed" - "optionally by a space, which is not considered part of the" -- "content. In the following case `>`" -- "is followed by a tab," +- "content. In the following case `>` is followed by a tab," - which is treated as if it were expanded into three spaces. - "Since one of these spaces is considered part of the\ndelimiter," -- "`foo`" -- is considered to be indented six spaces +- "`foo` is considered to be indented six spaces" - "inside the block quote context, so we get an indented" - code block starting with two spaces. - "````````````````" @@ -367,8 +380,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Insecure characters" -- "For security reasons, the Unicode character `U+0000` must" -- "be replaced\nwith the REPLACEMENT CHARACTER (" +- "For security reasons, the Unicode character `U+0000`" +- must be replaced +- with the REPLACEMENT CHARACTER ( - "`U+FFFD`)." - "## Backslash escapes" - "Any ASCII punctuation character may be backslash-escaped:" @@ -508,18 +522,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Valid HTML entity references and numeric character references - "can be used in place of the corresponding Unicode character," - "with the following exceptions:" -- "- Entity and character references are not recognized in code" -- blocks and code spans. -- "- Entity and character references cannot stand in place of" -- " special characters that define structural elements in\n CommonMark. For example, although" +- "-" +- "Entity and character references are not recognized in code\n blocks and code spans." +- "-" +- Entity and character references cannot stand in place of +- "special characters that define structural elements in\n CommonMark. For example, although" - "`*` can be used\n in place of a literal" - "`*` character, `*` cannot replace\n `*`" - " in emphasis delimiters, bullet list markers, or thematic\n breaks." - Conforming CommonMark parsers need not store information about - whether a particular character was represented in the source - using a Unicode character or an entity reference. -- "[Entity references](@) consist of `&` + any" -- "of the valid\nHTML5 entity names + `;`. The\ndocument" +- "[Entity references](@) consist of `&`" +- " + any of the valid\nHTML5 entity names + `;`" +- ". The\ndocument" - "\nis used as an authoritative source for the valid entity" - references and their corresponding code points. @@ -534,16 +550,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ∲ ≧̸

    - "````````````````" - "````````````````" -- "[Decimal numeric character\nreferences](@)" -- "consist of `&#` + a string of 1--7" -- "arabic digits + `;`" -- ". A\nnumeric character reference is parsed as the corresponding" +- "[Decimal numeric character\nreferences](@)\nconsist of" +- "`&#` + a string of 1--" +- "7 arabic digits + `;`. A" +- numeric character reference is parsed as the corresponding - Unicode character. - Invalid Unicode code points will be replaced by -- "the REPLACEMENT CHARACTER (`U+" -- "FFFD`). For security reasons,\nthe code point" -- "`U+0000` will also be replaced by `U+" -- "FFFD`." +- the REPLACEMENT CHARACTER ( +- "`U+FFFD`). For security reasons,\nthe code point" +- "`U+0000` will also be replaced by" +- "`U+FFFD`." - "````````````````" - "````````````````" - example @@ -551,10 +567,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "0;\n.\n

    # Ӓ Ϡ �

    " - "````````````````" - "````````````````" -- "[Hexadecimal numeric character" -- "references](@) consist of `&#` +\neither `X`" -- "or `x` + a string of 1-6 hexadecimal digits" -- "+ `;`" +- "[Hexadecimal numeric character\nreferences](@) consist of" +- "`&#` +\neither `X` or `x`" +- "+ a string of 1-6 hexadecimal digits + `;`" - ".\nThey too are parsed as the corresponding Unicode character (this" - time specified with a hexadecimal numeral instead of decimal) - "." @@ -582,8 +597,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - Although HTML5 does accept some entity references -- "without a trailing semicolon (such as `©`), these" -- "are not\nrecognized here, because it makes the grammar too ambiguous:" +- "without a trailing semicolon (such as `©`" +- "), these are not\nrecognized here, because it makes the grammar too ambiguous:" - "````````````````" - "````````````````" - "example\n©\n.\n

    &copy

    " @@ -701,18 +716,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "# Blocks and inlines" -- We can think of a document as a sequence of -- "[blocks](@)" +- "We can think of a document as a sequence of\n[blocks](@)" - "---structural elements like paragraphs, block" - "quotations, lists, headings, rules, and code blocks." - Some blocks (like - block quotes and list items) contain other blocks; others (like -- "headings and paragraphs) contain [inline](@) content-" -- "--text," +- "headings and paragraphs) contain [inline](@) content" +- "---text," - "links, emphasized text, images, code spans, and so on." - "## Precedence" -- "Indicators of block structure always take precedence over indicators\nof inline structure." -- "So, for example, the following is a list with" +- Indicators of block structure always take precedence over indicators +- "of inline structure. So, for example, the following is a list with" - "two items, not a list with one item containing a code span:" - "````````````````" - "````````````````" @@ -732,8 +746,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - one block element does not affect the inline parsing of any other. - "## Container blocks and leaf blocks" - "We can divide blocks into two types:" -- "[container blocks](#container-blocks)" -- ",\nwhich can contain other blocks, and" +- "[container blocks](#container-blocks)," +- "which can contain other blocks, and" - "[leaf blocks](#leaf-blocks),\nwhich cannot." - "# Leaf blocks" - This section describes the different kinds of leaf block that make up a @@ -741,8 +755,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Thematic breaks" - "A line consisting of optionally up to three spaces of indentation, followed" - "by a\nsequence of three or more matching `-`, `_`" -- ", or `*`" -- "characters, each followed" +- ", or `*` characters, each followed" - "optionally by any number of spaces or tabs, forms a" - "[thematic break](@)." - "````````````````" @@ -903,8 +916,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## ATX headings" - "An [ATX heading](@)" - "consists of a string of characters, parsed as inline content, between an" -- "opening sequence of 1--6 unescaped `#` characters and an" -- "optional\nclosing sequence of any number of unescaped `#`" +- "opening sequence of 1--6 unescaped `#`" +- " characters and an optional\nclosing sequence of any number of unescaped `#`" - " characters.\nThe opening sequence of `#`" - "characters must be followed by spaces or tabs, or" - "by the end of line. The optional closing sequence of `#`" @@ -938,8 +951,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    ####### foo

    " - "````````````````" - "````````````````" -- "At least one space or tab is required between the `#` characters and" -- "the\nheading's contents, unless the heading is empty. Note that many" +- "At least one space or tab is required between the `#`" +- " characters and the\nheading'" +- "s contents, unless the heading is empty. Note that many" - "implementations currently do not require the space. However, the" - space was required by the - "[original ATX implementation](http://" @@ -1100,13 +1114,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "thematic breaks],\n[list item][list items], or [" - "HTML block][HTML blocks]." - "A [setext heading underline](@) is a sequence of" -- "`=` characters or a sequence of `-` characters, with no more" -- than 3 +- "`=` characters or a sequence of `-`" +- "characters, with no more than 3" - spaces of indentation and any number of trailing spaces or tabs. -- "The heading is a level 1 heading if `=` characters are used in" -- "the [setext heading underline], and a level 2 heading if" -- "`-`" -- characters are used. The contents of the heading are the result +- "The heading is a level 1 heading if `=` characters are used in\nthe" +- "[setext heading underline], and a level 2 heading if" +- "`-`\ncharacters are used. The contents of the heading are the result" - "of parsing the preceding lines of text as CommonMark inline\ncontent." - "In general, a setext heading need not be preceded or followed by a" - "blank line. However, it cannot interrupt a paragraph, so when a" @@ -1232,8 +1245,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    " - "````````````````" - "````````````````" -- "The setext heading underline cannot be a [lazy continuation" -- "line] in a list item or block quote:" +- "The setext heading underline cannot be a [lazy continuation\nline]" +- "in a list item or block quote:" - "````````````````" - "````````````````" - example @@ -1319,8 +1332,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    " - "````````````````" - "````````````````" -- "If you want a heading with `> foo` as its literal text," -- "you can\nuse backslash escapes:" +- "If you want a heading with `> foo`" +- " as its literal text, you can\nuse backslash escapes:" - "````````````````" - "````````````````" - example @@ -1379,10 +1392,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Indented code blocks" -- "An [indented code block](@) is composed of one or" -- "more\n[indented chunks] separated by blank lines.\nAn" -- "[indented chunk](@) is a sequence of non-blank" -- "lines," +- "An [indented code block](@)" +- " is composed of one or more\n[indented chunks]" +- " separated by blank lines.\nAn [indented chunk](@)" +- "is a sequence of non-blank lines," - each preceded by four or more spaces of indentation. - "The contents of the code\nblock are the literal contents of the lines, including trailing" - "[line endings], minus four spaces of indentation." @@ -1502,8 +1515,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "## Fenced code blocks" - "A [code fence](@) is a sequence" -- "of at least three consecutive backtick characters (`` ` ``" -- ") or\ntildes (`~`" +- "of at least three consecutive backtick characters (`` ` ``) or" +- "tildes (`~`" - "). (Tildes and backticks cannot be mixed.)\nA" - "[fenced code block](@)" - "begins with a code fence, preceded by up to three spaces of indentation" @@ -1516,8 +1529,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - characters. (The reason for this restriction is that otherwise - some inline code would be incorrectly interpreted as the - beginning of a fenced code block.) -- "The content of the code block consists of all subsequent lines, until" -- "a closing [code fence] of the same type as the code block" +- "The content of the code block consists of all subsequent lines, until\na closing [" +- "code fence] of the same type as the code block" - "began with (backticks or tildes), and with at least as" - many backticks - or tildes as the opening code fence. If the leading code fence is @@ -1789,8 +1802,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - foo

    - "````````````````" - "````````````````" -- "[Info strings] for tilde code blocks can contain backticks and" -- "tildes:" +- "[Info strings]" +- "for tilde code blocks can contain backticks and tildes:" - "````````````````" - "````````````````" - example @@ -1810,42 +1823,47 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## HTML blocks" -- "An [HTML block](@) is a group of lines that" -- is treated +- "An [HTML block](@)" +- is a group of lines that is treated - as raw HTML (and will not be escaped in HTML output - ). -- "There are seven kinds of [HTML block], which can be defined" -- "by their\nstart and end conditions. The block begins with a line that meets a" -- "[start condition](@) (after up to three optional spaces of" -- "indentation).\nIt ends with the first subsequent line that meets a matching" -- "[end condition](@), or the last line of the document," -- "or the last line of\nthe [container block](#container-blocks)" -- " containing the current HTML\nblock, if no line is encountered that meets the" -- "[end condition]. If\nthe first line meets both the [start condition]" -- " and the [end\ncondition], the block will contain just that line." +- "There are seven kinds of [HTML block]" +- ", which can be defined by their" +- start and end conditions. The block begins with a line that meets a +- "[start condition](@)" +- (after up to three optional spaces of indentation). +- It ends with the first subsequent line that meets a matching +- "[end condition](@)" +- ", or the last line of the document, or the last line of\nthe" +- "[container block](#container-blocks) containing the current HTML" +- "block, if no line is encountered that meets the [end condition]. If" +- "the first line meets both the [start condition] and the [end\ncondition]" +- ", the block will contain just that line." - "1." - "**Start condition:** line begins with the string ``" +- "``" - ", or the end of the line.\\\n**End condition:**" - " line contains an end tag\n`
    `," -- "``, ``, or `" -- "` (case-insensitive; it" +- "``, ``, or" +- "`` (case-insensitive; it" - need not match the start tag). - "2." - "**Start condition:** line begins with the string ``." - "3." -- "**Start condition:** line begins with the string ``." - "4." - "**Start condition:** line begins with the string ``." -- "5. **Start condition:** line begins with the string" +- "5." +- "**Start condition:** line begins with the string" - "``" - "." @@ -1856,23 +1874,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ",\n`article`, `aside`, `base`," - "`basefont`, `blockquote`, `body`" - ",\n`caption`, `center`, `col`," -- "`colgroup`, `dd`, `details`, `" -- "dialog`,\n`dir`, `div`," -- "`dl`, `dt`, `fieldset`, `" -- "figcaption`, `figure`,\n`footer`," +- "`colgroup`, `dd`, `details`," +- "`dialog`,\n`dir`, `div`," +- "`dl`, `dt`, `fieldset`," +- "`figcaption`, `figure`,\n`footer`," - "`form`, `frame`, `frameset`," -- "`h1`, `h2`, `h3`, `" -- "h4`, `h5`, `h6`, `head" -- "`, `header`, `hr`,\n`html`," -- "`iframe`, `legend`, `li`, `" -- "link`, `main`, `menu`, `menuitem`" -- ",\n`nav`, `noframes`, `ol`" -- ", `optgroup`, `option`, `p`, `" -- "param`,\n`search`, `section`, `summary`," -- "`table`, `tbody`, `td`," -- "`tfoot`, `th`, `thead`, `title" -- "`, `tr`, `track`, `ul`" -- ", followed" +- "`h1`, `h2`, `h3`," +- "`h4`, `h5`, `h6`," +- "`head`, `header`, `hr`,\n`html`" +- ", `iframe`, `legend`, `li`," +- "`link`, `main`, `menu`," +- "`menuitem`,\n`nav`, `noframes`" +- ", `ol`, `optgroup`, `option`," +- "`p`, `param`,\n`search`, `section`" +- ", `summary`, `table`, `tbody`," +- "`td`,\n`tfoot`, `th`," +- "`thead`, `title`, `tr`," +- "`track`, `ul`, followed" - "by a space, a tab, the end of the line, the string" - "`>`, or\nthe string `/>`.\\" - "**End condition:** line is followed by a [blank line]" @@ -1880,13 +1898,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "7." - "**Start condition:** line begins with a complete [open tag]" - "(with any [tag name] other than `pre`, `script`" -- ",\n`style`, or `textarea`" -- ") or a complete [closing tag]," +- ",\n`style`, or `textarea`) or a complete [" +- "closing tag]," - "followed by zero or more spaces and tabs, followed by the end of the" -- "line.\\\n**End condition:**" -- "line is followed by a [blank line]." -- HTML blocks continue until they are closed by their appropriate -- "[end condition], or the last line of the document or other" +- "line.\\\n**End condition:** line is followed by a [" +- "blank line]." +- "HTML blocks continue until they are closed by their appropriate\n[end condition]" +- ", or the last line of the document or other" - "[container\nblock](#container-blocks)" - ". This means any HTML" - "**within an HTML\nblock**" @@ -1894,8 +1912,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "be ignored by the parser and passed through as-is, without changing" - "the parser's state." - "For instance, `
    ` within an HTML block started by"
    -- "``"
    -- will not affect
    +- "`
    ` will not affect" - the parser state; as the HTML block was started in by start - "condition 6, it\nwill end at any blank line. This can be surprising:" - "````````````````" @@ -2043,8 +2060,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "int x = 33;\n```" - "````````````````" - "````````````````" -- "To start an [HTML block] with a tag that is *not" -- "* in the" +- "To start an [HTML block] with a tag that is" +- "*not* in the" - "list of block-level tags in (6), you must put the tag" - "by\nitself on the first line (and it must be complete):" - "````````````````" @@ -2081,8 +2098,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`` tag is a nice example. We can surround content with" - "``" - "tags in three different ways. In this case, we get a raw" -- "HTML block, because the `` tag is on a" -- "line by itself:" +- "HTML block, because the ``" +- "tag is on a line by itself:" - "````````````````" - "````````````````" - example @@ -2090,9 +2107,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo*\n" - "````````````````" - "````````````````" -- "In this case, we get a raw HTML block that just includes" -- "the ``" -- tag (because it ends with the following blank +- "In this case, we get a raw HTML block that just includes\nthe" +- "`` tag (because it ends with the following blank" - "line). So the contents get interpreted as CommonMark:" - "````````````````" - "````````````````" @@ -2102,8 +2118,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "Finally, in this case, the `` tags are interpreted" -- "as [raw HTML] *inside*" +- "Finally, in this case, the `` tags are interpreted\nas" +- "[raw HTML] *inside*" - the CommonMark paragraph. (Because - "the tag is not on a line by itself, we get inline" - "HTML\nrather than an [HTML block].)" @@ -2115,10 +2131,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - /del>

    - "````````````````" - "````````````````" -- HTML tags designed to contain literal content -- "(`pre`, `script`, `style`, `" -- "textarea`), comments, processing instructions," -- and declarations are treated somewhat differently. +- "HTML tags designed to contain literal content\n(`pre`," +- "`script`, `style`, `textarea`" +- "), comments, processing instructions,\nand declarations are treated somewhat differently." - "Instead of ending at the first blank line, these blocks" - end at the first line containing a corresponding end tag. - "As a result, these blocks can contain blank lines:" @@ -2218,8 +2233,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    baz

    " - "````````````````" - "````````````````" -- Note that anything on the last line after the -- "end tag will be included in the [HTML block]:" +- "Note that anything on the last line after the\nend tag will be included in the" +- "[HTML block]:" - "````````````````" - "````````````````" - example @@ -2286,8 +2301,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "An HTML block of types 1--6 can interrupt a paragraph," -- "and need not be\npreceded by a blank line." +- An HTML block of types 1-- +- "6 can interrupt a paragraph, and need not be\npreceded by a blank line." - "````````````````" - "````````````````" - example @@ -2317,22 +2332,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "This rule differs from John Gruber's original Markdown syntax" - "specification, which says:" -- "> The only restrictions are that block-level HTML elements —" -- "> e.g. `
    `, `
    " -- "`, `
    `, `

    `, etc." -- "— must be separated from\n>" +- ">" +- "The only restrictions are that block-level HTML elements —\n>" +- "e.g. `

    `, `
    `" +- ", `
    `, `

    `" +- ", etc. — must be separated from\n>" - "surrounding content by blank lines, and the start and end tags of the\n>" - block should not be indented with spaces or tabs. -- "In some ways Gruber's rule is more restrictive than the one" -- "given\nhere:" +- "In some ways Gruber'" +- "s rule is more restrictive than the one given\nhere:" - "- It requires that an HTML block be preceded by a blank line." - "- It does not allow the start tag to be indented." - "- It requires a matching end tag, which it also does not allow to" - be indented. -- "Most Markdown implementations (including some of Gruber's own)" -- "do not\nrespect all of these restrictions." -- "There is one respect, however, in which Gruber's rule is" -- "more liberal\nthan the one given here, since it allows blank lines to occur inside" +- "Most Markdown implementations (including some of Gruber'" +- "s own) do not\nrespect all of these restrictions." +- "There is one respect, however, in which Gruber'" +- s rule is more liberal +- "than the one given here, since it allows blank lines to occur inside" - an HTML block. - There are two reasons for disallowing them here. - "First, it removes the need to parse balanced tags, which is" @@ -2360,14 +2377,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - Some Markdown implementations have adopted a convention of - "interpreting content inside tags as text if the open tag has\nthe attribute" -- "`markdown=1`" -- ". The rule given above seems a simpler and" +- "`markdown=1`. The rule given above seems a simpler and" - "more elegant way of achieving the same expressive power, which is also" - much simpler to parse. - The main potential drawback is that one can no longer paste HTML - "blocks into Markdown documents with 100% reliability. However," -- "*in most cases*" -- "this will work fine, because the blank lines in" +- "*in most cases* this will work fine, because the blank lines in" - "HTML are usually followed by HTML block tags. For example:" - "````````````````" - "````````````````" @@ -2379,8 +2394,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "There are problems, however, if the inner tags are indented" -- "*and*" -- "separated by spaces, as then they will be interpreted as" +- "*and* separated by spaces, as then they will be interpreted as" - "an indented code block:" - "````````````````" - "````````````````" @@ -2394,15 +2408,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    " - "````````````````" - "````````````````" -- "Fortunately, blank lines are usually not necessary and can be\ndeleted." -- "The exception is inside `
    `"
    -- " tags, but as described\n[above][HTML blocks]"
    +- "Fortunately, blank lines are usually not necessary and can be"
    +- "deleted.  The exception is inside `
    ` tags, but as described"
    +- "[above][HTML blocks]"
     - ", raw HTML blocks starting with `
    `\n*can*"
     - contain blank lines.
     - "## Link reference definitions"
    -- "A [link reference definition](@)"
    -- "consists of a [link label], optionally preceded by up to three spaces"
    -- "of\nindentation, followed\nby a colon (`:`"
    +- "A [link reference definition](@)\nconsists of a [link label]"
    +- ", optionally preceded by up to three spaces of\nindentation, followed"
    +- "by a colon (`:`"
     - "), optional spaces or tabs (including up to one\n[line ending]"
     - "), a [link destination],"
     - "optional spaces or tabs (including up to one\n[line ending]"
    @@ -2689,8 +2703,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "## Paragraphs"
     - A sequence of non-blank lines that cannot be interpreted as other
    -- "kinds of blocks forms a [paragraph](@)"
    -- ".\nThe contents of the paragraph are the result of parsing the\nparagraph'"
    +- "kinds of blocks forms a [paragraph](@)."
    +- "The contents of the paragraph are the result of parsing the\nparagraph'"
     - "s raw content as inlines.  The paragraph's raw content"
     - is formed by concatenating the lines and removing initial and final
     - "spaces or tabs.\n\nA simple example with two paragraphs:"
    @@ -2773,51 +2787,52 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - "````````````````"
     - "# Container blocks"
    -- "A [container block](#container-blocks) is a block that has"
    -- "other\nblocks as its contents.  There are two basic kinds of container blocks:\n["
    +- "A [container block](#container-blocks)"
    +- is a block that has other
    +- "blocks as its contents.  There are two basic kinds of container blocks:\n["
     - "block quotes] and [list items].\n[Lists]"
     - "are meta-containers for [list items]."
     - We define the syntax for container blocks recursively.  The general
     - "form of the definition is:"
    -- "> If X is a sequence of blocks, then the result of"
    -- "> transforming X in such-and-such a way is a container of type"
    -- "Y\n> with these blocks as its content."
    +- ">"
    +- "If X is a sequence of blocks, then the result of\n>"
    +- transforming X in such-and-such a way is a container of type Y
    +- "> with these blocks as its content."
     - "So, we explain what counts as a block quote or list item by explaining"
    -- how these can be *generated*
    -- from their contents. This should suffice
    -- "to define the syntax, although it does not give a recipe for *parsing"
    -- "*\nthese constructions.  (A recipe is provided below in the section entitled"
    +- how these can be *generated* from their contents. This should suffice
    +- "to define the syntax, although it does not give a recipe for"
    +- "*parsing*"
    +- these constructions.  (A recipe is provided below in the section entitled
     - "[A parsing strategy](#appendix-a-parsing"
     - "-strategy).)"
     - "## Block quotes"
     - "A [block quote marker](@),"
     - "optionally preceded by up to three spaces of indentation,"
    -- "consists of (a) the character `>`"
    -- together with a following space of
    -- "indentation, or (b) a single character `>` not followed"
    -- "by a space of\nindentation."
    +- "consists of (a) the character `>` together with a following space of"
    +- "indentation, or (b) a single character `>`"
    +- " not followed by a space of\nindentation."
     - "The following rules define [block quotes]:"
    -- 1.  **Basic case.
    -- "**  If a string of lines *Ls* constitute a sequence\n    of blocks"
    -- "*Bs*"
    +- "1."
    +- "**Basic case.**  If a string of lines *Ls*"
    +- " constitute a sequence\n    of blocks *Bs*"
     - ", then the result of prepending a [block quote\n    marker]"
     - " to the beginning of each line in *Ls*\n    is a"
     - "[block quote](#block-quotes) containing *Bs*."
    -- 2.  **Laziness.
    -- "**  If a string of lines *Ls* constitute a"
    -- "[block\n    quote](#block-quotes) with contents *Bs*"
    -- ", then the result of deleting\n    the initial [block quote marker]"
    -- from one or
    +- "2."
    +- "**Laziness.**  If a string of lines"
    +- "*Ls* constitute a [block\n    quote](#block-quotes)"
    +- " with contents *Bs*, then the result of deleting\n    the initial ["
    +- "block quote marker] from one or"
     - more lines in which the next character other than a space or tab after the
     - "[block quote marker] is [paragraph continuation\n    text] is a block quote with"
     - "*Bs* as its content."
    -- "[Paragraph continuation text](@)"
    -- is text
    +- "[Paragraph continuation text](@) is text"
     - "that will be parsed as part of the content of a paragraph, but does"
     - not occur at the beginning of the paragraph.
    -- 3.  **Consecutiveness.
    -- "**  A document cannot contain two [block\n    quotes]"
    -- "in a row unless there is a [blank line] between them."
    +- "3."
    +- "**Consecutiveness.**  A document cannot contain two ["
    +- "block\n    quotes] in a row unless there is a [blank line]"
    +- between them.
     - "Nothing else counts as a [block quote](#block-quotes)."
     - "Here is a simple example:"
     - "````````````````"
    @@ -2837,8 +2852,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "baz

    \n" - "````````````````" - "````````````````" -- "The `>` characters can be preceded by up to three spaces of" -- "indentation:" +- "The `>`" +- "characters can be preceded by up to three spaces of indentation:" - "````````````````" - "````````````````" - example @@ -2857,8 +2872,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    " - "````````````````" - "````````````````" -- "The Laziness clause allows us to omit the `>` before" -- "[paragraph continuation text]:" +- "The Laziness clause allows us to omit the `>` before\n[" +- "paragraph continuation text]:" - "````````````````" - "````````````````" - example @@ -2901,8 +2916,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
  • bar
  • \n" - "````````````````" - "````````````````" -- "For the same reason, we can't omit the `> ` in" -- "front of\nsubsequent lines of an indented or fenced code block:" +- "For the same reason, we can't omit the `> `" +- " in front of\nsubsequent lines of an indented or fenced code block:" - "````````````````" - "````````````````" - example @@ -2932,8 +2947,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "To see why, note that in" - "```markdown\n> foo\n> - bar\n```" -- "the `- bar` is indented too far to start a list," -- "and can't" +- "the `- bar`" +- "is indented too far to start a list, and can't" - be an indented code block because indented code blocks cannot - "interrupt paragraphs, so it is [paragraph continuation text]." - "A block quote can be empty:" @@ -2967,9 +2982,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "(Most current Markdown implementations, including John Gruber's" -- "original `Markdown.pl`, will parse this example as a" -- single block quote +- "(Most current Markdown implementations, including John Gruber's\noriginal" +- "`Markdown.pl`" +- ", will parse this example as a single block quote" - with two paragraphs. But it seems better to allow the author to decide - whether two block quotes or one are wanted.) - "Consecutiveness means that if we put these block quotes together," @@ -3035,9 +3050,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    baz

    " - "````````````````" - "````````````````" -- It is a consequence of the Laziness rule that any number -- "of initial `>`" -- "s may be omitted on a continuation line of a\nnested block quote:" +- "It is a consequence of the Laziness rule that any number\nof initial" +- "`>`s may be omitted on a continuation line of a" +- "nested block quote:" - "````````````````" - "````````````````" - example @@ -3058,10 +3073,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````" -- "When including an indented code block in a block quote," -- "remember that the [block quote marker] includes\nboth the `>`" -- and a following space of indentation. So *five spaces* -- " are needed\nafter the `>`:" +- "When including an indented code block in a block quote,\nremember that the" +- "[block quote marker] includes\nboth the `>`" +- and a following space of indentation. So *five spaces* are needed +- "after the `>`:" - "````````````````" - "````````````````" - example @@ -3072,38 +3087,37 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## List items" -- "A [list marker](@) is a" -- "[bullet list marker] or an [ordered list marker]." -- "A [bullet list marker](@)" -- "is a `-`, `+`, or `*` character." -- "An [ordered list marker](@)" -- "is a sequence of 1--9 arabic digits (`0-" -- "9`), followed by either a\n`.` character or a" -- "`)`" +- "A [list marker](@) is a\n[bullet list marker]" +- "or an [ordered list marker]." +- "A [bullet list marker](@)\nis a `-`," +- "`+`, or `*` character." +- "An [ordered list marker](@)\nis a sequence of 1--" +- "9 arabic digits (`0-9`" +- "), followed by either a\n`.` character or a `)`" - character. (The reason for the length - limit is that with 10 digits we start seeing integer overflows - "in some browsers.)\n\nThe following rules define [list items]:" -- 1. **Basic case. -- "** If a sequence of lines *Ls* constitute a sequence of\n blocks" -- "*Bs* starting with a character other than a space or tab," -- "and *M* is\n a list marker of width *W*" -- followed by 1 ≤ *N* +- "1." +- "**Basic case.** If a sequence of lines *Ls*" +- " constitute a sequence of\n blocks *Bs*" +- "starting with a character other than a space or tab, and *M*" +- " is\n a list marker of width *W* followed by 1 ≤ *N*" - " ≤ 4 spaces of indentation,\n then the result of prepending" - "*M* and the following spaces to the first line\n of *Ls*" -- ", and indenting subsequent lines of *Ls* by *W +" -- "N* spaces, is a\n list item with *Bs*" +- ", and indenting subsequent lines of *Ls* by" +- "*W + N* spaces, is a\n list item with *Bs*" - as its contents. The type of the list item - (bullet or ordered) is determined by the type of its list marker. - "If the list item is ordered, then it is also assigned a start" - "number, based on the ordered list marker.\n\n Exceptions:" -- "1. When the first list item in a [list] interrupts" -- "a paragraph---that is, when it starts on a line that would" +- "1. When the first list item in a [list] interrupts\n a paragraph" +- "---that is, when it starts on a line that would" - "otherwise count as [paragraph continuation text]---then (a)" -- "the lines *Ls* must not begin with a blank line, and (" -- "b) if\n the list item is ordered, the start number must be 1." -- "2." -- "If any line is a [thematic break][thematic breaks] then" -- that line is not a list item. +- the lines *Ls* +- "must not begin with a blank line, and (b) if" +- "the list item is ordered, the start number must be 1." +- "2. If any line is a [thematic break][thematic breaks" +- "] then\n that line is not a list item." - "For example, let *Ls* be the lines" - "````````````````" - "````````````````" @@ -3116,8 +3130,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "And let *M* be the marker `1.`, and *N" -- "* = 2. Then rule #1 says" +- "And let *M* be the marker `1.`, and" +- "*N* = 2. Then rule #1 says" - "that the following is an ordered list item with start number 1," - "and the same contents as *Ls*:" - "````````````````" @@ -3193,14 +3207,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "Here `two` occurs in the same column as the list marker `1." -- "`,\nbut is actually contained in the list item, because there is" +- "Here `two` occurs in the same column as the list marker" +- "`1.`," +- "but is actually contained in the list item, because there is" - sufficient indentation after the last containing blockquote marker. -- The converse is also possible. -- "In the following example, the word `two`" +- "The converse is also possible. In the following example, the word" +- "`two`" - "occurs far to the right of the initial text of the list item," -- "`one`" -- ", but" +- "`one`, but" - "it is not considered part of the list item, because it is not" - "indented\nfar enough past the blockquote marker:" - "````````````````" @@ -3295,11 +3309,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    -1. not ok

    " - "````````````````" - "````````````````" -- 2. **Item starting with indented code. -- "** If a sequence of lines *Ls*\n constitute a sequence of blocks" +- 2. **Item starting with indented code.** +- " If a sequence of lines *Ls*\n constitute a sequence of blocks" - "*Bs* starting with an indented code\n block, and" -- "*M* is a list marker of width *W*" -- " followed by\n one space of indentation, then the result of prepending" +- "*M* is a list marker of width *W* followed by" +- "one space of indentation, then the result of prepending" - "*M* and the\n following space to the first line of *Ls*" - ", and indenting subsequent lines\n of *Ls* by" - "*W + 1* spaces, is a list item with *Bs*" @@ -3332,9 +3346,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````" -- If the *first* block in the list item is an indented code -- "block,\nthen by rule #2, the contents must be preceded by" -- "*one* space of indentation\nafter the list marker:" +- If the *first* +- "block in the list item is an indented code block," +- "then by rule #2, the contents must be preceded by *one*" +- " space of indentation\nafter the list marker:" - "````````````````" - "````````````````" - example @@ -3402,15 +3417,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- 3. **Item starting with a blank line. -- "** If a sequence of lines *Ls*" -- "starting with a single [blank line] constitute a (possibly empty)" -- "sequence of blocks *Bs*, and *M* is a list marker" -- "of width *W*,\n then the result of prepending *M*" -- " to the first line of *Ls*, and\n preceding subsequent lines of" -- "*Ls* by *W + 1* spaces of indentation," -- "is a\n list item with *Bs*" -- as its contents. +- 3. **Item starting with a blank line.** +- " If a sequence of lines *Ls*\n starting with a single [blank line" +- "] constitute a (possibly empty)\n sequence of blocks *Bs*, and" +- "*M* is a list marker of width *W*," +- then the result of prepending *M* to the first line of +- "*Ls*, and\n preceding subsequent lines of *Ls* by" +- "*W + 1* spaces of indentation, is a\n list item with" +- "*Bs* as its contents." - "If a line is empty, then it need not be indented." - The type of the - list item (bullet or ordered) is determined by the type of its list @@ -3439,8 +3453,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - A list item can begin with at most one blank line. -- "In the following example, `foo`" -- " is not part of the list\nitem:" +- "In the following example, `foo` is not part of the list" +- "item:" - "````````````````" - "````````````````" - example @@ -3495,9 +3509,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*

    \n

    foo\n1.

    " - "````````````````" - "````````````````" -- 4. **Indentation. -- "** If a sequence of lines *Ls*" -- constitutes a list item +- 4. **Indentation.** If a sequence of lines +- "*Ls* constitutes a list item" - "according to rule #1, #2, or #3, then the result" - "of preceding each line\n of *Ls*" - by up to three spaces of indentation (the same for each line) @@ -3555,10 +3568,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    " - "````````````````" - "````````````````" -- 5. **Laziness. -- "** If a string of lines *Ls* constitute a" -- "[list\n item](#list-items) with contents *Bs*" -- ", then the result of deleting" +- 5. **Laziness.** If a string of lines +- "*Ls* constitute a [list\n item](#list-items)" +- "with contents *Bs*, then the result of deleting" - some or all of the indentation from one or more lines in which the - "next character other than a space or tab after the indentation is\n [" - "paragraph continuation text] is a" @@ -3610,13 +3622,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "6. **That's all." -- "** Nothing that is not counted as a list item by rules\n #1" -- "--5 counts as a [list item](#list-items)" -- "." -- The rules for sublists follow from the general rules -- "[above][List items]" -- ". A sublist must be indented the same number" +- "6. **That's all.**" +- " Nothing that is not counted as a list item by rules\n #1--" +- "5 counts as a [list item](#list-items)." +- "The rules for sublists follow from the general rules\n[above][List items" +- "]. A sublist must be indented the same number" - of spaces of indentation a paragraph would need to be in order to be - "included\nin the list item." - "So, in this case we need two spaces indent:" @@ -3693,8 +3703,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "### Motivation" -- "John Gruber's Markdown spec says the following about list items" -- ":" +- "John Gruber'" +- "s Markdown spec says the following about list items:" - "1." - "\"List markers typically start at the left margin, but may be indented" - by up to three spaces. List markers must be followed by one or more @@ -3703,19 +3713,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\"To make lists look nice, you can wrap items with hanging indents" - "....\n But if you don't want to, you don'" - "t have to.\"" -- "3. \"List items may consist of multiple paragraphs. Each subsequent" +- "3." +- "\"List items may consist of multiple paragraphs. Each subsequent" - paragraph in a list item must be indented by either 4 spaces or one - "tab.\"" - "4." - "\"It looks nice if you indent every line of the subsequent paragraphs," - "but here again, Markdown will allow you to be lazy.\"" - "5." -- "\"To put a blockquote within a list item, the" +- "\"" +- "To put a blockquote within a list item, the" - "blockquote's `>`" - "delimiters need to be indented.\"" - "6." -- "\"To put a code block within a list item, the code block needs to" -- "be\n indented twice — 8 spaces or two tabs.\"" +- "\"" +- "To put a code block within a list item, the code block needs to be" +- "indented twice — 8 spaces or two tabs.\"" - These rules specify that a paragraph under a list item must be indented - "four spaces (presumably, from the left margin, rather than the start of" - "the list marker, but this is not said), and that code under a" @@ -3724,14 +3737,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that a block quote must be indented, but not by how much;" - "however, the\nexample given has four spaces indentation. Although nothing is said" - "about other kinds of block-level content, it is certainly reasonable to" -- infer that *all* -- "block elements under a list item, including other" +- "infer that *all* block elements under a list item, including other" - "lists, must be indented four spaces. This principle has been called the" - "*four-space rule*." -- "The four-space rule is clear and principled, and if the reference" -- "implementation `Markdown.pl`" -- " had followed it, it probably would have\nbecome the standard. However," -- "`Markdown.pl`" +- "The four-space rule is clear and principled, and if the reference\nimplementation" +- "`Markdown.pl` had followed it, it probably would have" +- "become the standard. However, `Markdown.pl`" - allowed paragraphs and - "sublists to start with only two spaces indentation, at least on the" - "outer level. Worse, its behavior was inconsistent: a sublist of an" @@ -3740,17 +3751,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - implementations of Markdown have developed very different rules for - determining what comes under a list item. - "(Pandoc and python-Markdown," -- "for example, stuck with Gruber's syntax description and the four-" -- space +- "for example, stuck with Gruber'" +- s syntax description and the four-space - "rule, while discount, redcarpet, marked, PHP Markdown," -- "and others\nfollowed `Markdown.pl`" -- "'s behavior more closely.)" +- "and others\nfollowed `Markdown.pl`'" +- s behavior more closely.) - "Unfortunately, given the divergences between implementations, there" - is no way to give a spec for list items that will be guaranteed not - "to break any existing documents. However, the spec given here should" - correctly handle lists formatted with either the four-space rule or -- "the more forgiving `Markdown.pl` behavior, provided they" -- "are laid out\nin a way that is natural for a human to read." +- "the more forgiving `Markdown.pl`" +- "behavior, provided they are laid out" +- in a way that is natural for a human to read. - The strategy here is to let the width and indentation of the list marker - determine the indentation necessary for blocks to fall under the list - "item, rather than having a fixed and arbitrary number. The writer can" @@ -3765,14 +3777,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - unnatural. It is quite unintuitive that - "``` markdown\n- foo\n\n bar\n\n - baz\n```" - "should be parsed as two lists with an intervening paragraph," -- "``` html\n
      " -- "
    • foo
    • \n
    " -- "

    bar

    \n
      " +- "``` html" +- "
        \n
      • foo
      • " +- "
      \n

      bar

      \n
        " - "
      • baz
      • \n
      " - "```" - "as the four-space rule demands, rather than a single list," -- "``` html\n
        \n
      • " -- "

        foo

        " +- "``` html" +- "
          \n
        • \n

          foo

          " - "

          bar

          \n
            " - "
          • baz
          • \n
          " - "
        • \n
        \n```" @@ -3781,20 +3793,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Would it help to adopt a two-space rule? The problem is that such - "a rule, together with the rule allowing up to three spaces of indentation" - "for\nthe initial list marker, allows text that is indented" -- "*less than*" -- " the\noriginal list marker to be included in the list item. For example," +- "*less than* the" +- "original list marker to be included in the list item. For example," - "`Markdown.pl` parses" - "``` markdown\n - one\n\n two\n```" - "as a single list item, with `two` a continuation paragraph:" -- "``` html\n
          \n
        • " -- "

          one

          \n

          two

          " -- "
        • \n
        \n```\n\nand similarly" -- "``` markdown\n> - one\n>\n> two\n```\n\nas" -- "``` html\n
        \n
          " -- "
        • \n

          one

          " +- "``` html" +- "
            \n
          • \n

            one

            " - "

            two

            \n
          • " -- "
          \n
        \n```" -- This is extremely unintuitive. +- "
      \n```\n\nand similarly" +- "``` markdown\n> - one\n>\n> two\n```\n\nas" +- "``` html" +- "
      \n
        \n
      • " +- "

        one

        \n

        two

        " +- "
      • \n
      \n
      " +- "```\n\nThis is extremely unintuitive." - "Rather than requiring a fixed indent from the margin, we could require" - "a fixed indent (say, two spaces, or even one space) from" - the list marker (which @@ -3814,8 +3827,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` markdown\n1. foo\n\n indented code\n```" - where the code is indented eight spaces. - "The spec above, by contrast, will" -- "parse this text as expected, since the code block's indentation" -- "is measured\nfrom the beginning of `foo`." +- "parse this text as expected, since the code block'" +- "s indentation is measured\nfrom the beginning of `foo`." - The one case that needs special treatment is a list item that *starts* - with indented code. - "How much indentation is required in that case, since\nwe don'" @@ -3827,29 +3840,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - four-space rule in cases where the list marker plus its initial indentation - "takes four spaces (a common case), but diverge in other cases." - "## Lists" -- "A [list](@) is a sequence of one or more" -- "list items [of the same type]. The list items" +- "A [list](@) is a sequence of one or more\nlist items" +- "[of the same type]. The list items" - may be separated by any number of blank lines. - "Two list items are [of the same type](@)" - "if they begin with a [list marker] of the same type." - Two list markers are of the - "same type if (a) they are bullet list markers using the same character\n(" -- "`-`, `+`, or `*`) or (b" -- ") they are ordered list numbers with the same\ndelimiter (either" -- "`.` or `)`)." +- "`-`, `+`, or `*`" +- ) or (b) they are ordered list numbers with the same +- "delimiter (either `.` or `)`)." - "A list is an [ordered list](@)" - "if its constituent list items begin with\n[ordered list markers], and a" -- "[bullet list](@)" -- " if its constituent list\nitems begin with [bullet list markers]." -- "The [start number](@)" -- "of an [ordered list] is determined by the list number of" +- "[bullet list](@) if its constituent list\nitems begin with [" +- "bullet list markers]." +- "The [start number](@)\nof an [ordered list]" +- is determined by the list number of - its initial list item. The numbers of subsequent list items are - disregarded. - "A list is [loose](@) if any of its constituent" - "list items are separated by blank lines, or if any of its constituent" - list items directly contain two block-level elements with a blank line -- "between them. Otherwise a list is [tight](@)" -- "." +- "between them. Otherwise a list is [tight](@)." - (The difference in HTML output is that paragraphs in a loose list - "are\nwrapped in `

      `" - "tags, while paragraphs in a tight list are not.)" @@ -3885,32 +3897,38 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    • baz
    • \n
    " - "````````````````" - "````````````````" -- "`Markdown.pl` does not allow this, through fear of" -- "triggering a list\nvia a numeral in a hard-wrapped line:" -- "``` markdown\nThe number of windows in my house is\n14." +- "`Markdown.pl`" +- "does not allow this, through fear of triggering a list" +- "via a numeral in a hard-wrapped line:" +- "``` markdown" +- "The number of windows in my house is\n14." - "The number of doors is 6.\n```" -- "Oddly, though, `Markdown.pl` *does* allow" -- "a blockquote to\ninterrupt a paragraph, even though the same considerations might" -- apply. +- "Oddly, though, `Markdown.pl` *does*" +- allow a blockquote to +- "interrupt a paragraph, even though the same considerations might\napply." - "In CommonMark, we do allow lists to interrupt paragraphs, for" - "two reasons. First, it is natural and not uncommon for people" - "to start lists without blank lines:" -- "``` markdown\nI need to buy\n- new shoes\n- a coat" -- "- a plane ticket\n```\n\nSecond, we are attracted to a" -- "> [principle of uniformity](@):" -- "> if a chunk of text has a certain\n>" +- "``` markdown" +- "I need to buy\n- new shoes\n- a coat\n- a plane ticket" +- "```\n\nSecond, we are attracted to a" +- ">" +- "[principle of uniformity](@):\n>" +- "if a chunk of text has a certain\n>" - "meaning, it will continue to have the same meaning when put into a\n>" - container block (such as a list item or blockquote). - "(Indeed, the spec for [list items] and [block quotes]" -- "presupposes\nthis principle.) This principle implies that if" -- "``` markdown\n * I need to buy\n - new shoes\n - a coat" -- " - a plane ticket\n```" +- " presupposes\nthis principle.) This principle implies that if" +- "``` markdown" +- " * I need to buy\n - new shoes\n - a coat\n - a plane ticket" +- "```" - "is a list item containing a paragraph followed by a nested sublist," - as all Markdown implementations agree it is (though the paragraph - "may be rendered without `

    ` tags, since the list is \"" - "tight\"),\nthen" -- "``` markdown\nI need to buy\n- new shoes\n- a coat" -- "- a plane ticket\n```" +- "``` markdown" +- "I need to buy\n- new shoes\n- a coat\n- a plane ticket" +- "```" - by itself should be a paragraph followed by a nested sublist. - Since it is well established Markdown practice to allow lists to - "interrupt paragraphs inside list items, the [principle of\nuniformity]" @@ -3921,7 +3939,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - even inside other list items.) - In order to solve the problem of unwanted lists in paragraphs with - "hard-wrapped numerals, we allow only lists starting with `1`" -- "to\ninterrupt paragraphs. Thus," +- " to\ninterrupt paragraphs. Thus," - "````````````````" - "````````````````" - example @@ -4036,9 +4054,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- e\n" - "````````````````" - "````````````````" -- "And here, `3. c` is treated as in indented code" -- "block,\nbecause it is indented four spaces and preceded by a" -- blank line. +- "And here, `3. c`" +- "is treated as in indented code block," +- "because it is indented four spaces and preceded by a\nblank line." - "````````````````" - "````````````````" - example @@ -4210,18 +4228,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">" - "````````````````" - "````````````````" -- "`hi` is parsed as code, leaving the backtick at the end" -- "as a literal\nbacktick." +- "`hi`" +- "is parsed as code, leaving the backtick at the end as a literal" +- backtick. - "## Code spans" - "A [backtick string](@)" - "is a string of one or more backtick characters (`` ` ``" - ") that is neither\npreceded nor followed by a backtick." -- "A [code span](@) begins with a backtick string and ends" -- "with\na backtick string of equal length. The contents of the code span are" +- "A [code span](@)" +- begins with a backtick string and ends with +- a backtick string of equal length. The contents of the code span are - "the characters between these two backtick strings, normalized in the\nfollowing ways:" - "- First, [line endings] are converted to [spaces]." - "- If the resulting string both begins *and* ends with a [space]" -- " character, but does not consist entirely of [space]\n characters, a single [" +- "character, but does not consist entirely of [space]\n characters, a single [" - "space] character is removed from the" - front and back. This allows you to include code that begins - "or ends with backtick characters, which must be separated by" @@ -4268,8 +4288,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    a

    " - "````````````````" - "````````````````" -- "Only [spaces], and not [unicode whitespace] in general" -- ", are\nstripped in this way:" +- "Only [spaces], and not [unicode whitespace]" +- " in general, are\nstripped in this way:" - "````````````````" - "````````````````" - example @@ -4310,8 +4330,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - /p> - "````````````````" - "````````````````" -- Note that browsers will typically collapse consecutive spaces -- "when rendering ``" +- "Note that browsers will typically collapse consecutive spaces\nwhen rendering ``" - " elements, so it is recommended that\nthe following CSS be used:" - "code{white-space: pre-wrap;}" - Note that backslash escapes do not work in code spans. All backslashes @@ -4324,9 +4343,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - /p> - "````````````````" - "````````````````" -- "Backslash escapes are never needed, because one can always choose a" -- "string of *n* backtick characters as delimiters, where the" -- "code does\nnot contain any strings of exactly *n* backtick characters." +- "Backslash escapes are never needed, because one can always choose a\nstring of" +- "*n* backtick characters as delimiters, where the code does" +- not contain any strings of exactly *n* backtick characters. - "````````````````" - "````````````````" - example @@ -4346,8 +4365,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Code span backticks have higher precedence than any other inline - constructs except HTML tags and autolinks. - "Thus, for example, this is" -- "not parsed as emphasized text, since the second `*` is part of" -- "a code\nspan:" +- "not parsed as emphasized text, since the second `*`" +- " is part of a code\nspan:" - "````````````````" - "````````````````" - example @@ -4432,23 +4451,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Emphasis and strong emphasis" -- "John Gruber's original [Markdown syntax" -- "description](https://daringfireball.net/projects/" -- "markdown/syntax#em) says:" -- "> Markdown treats asterisks (`*`) and" -- "underscores (`_`" -- ") as indicators of\n> emphasis. Text wrapped with one `*` or" -- "`_` will be wrapped with an HTML\n>" -- "`` tag; double `*`'s or `_" -- "`'s will be wrapped with an HTML ``" -- "> tag." +- "John Gruber's original" +- "[Markdown syntax\ndescription" +- "](https://daringfireball.net/projects/markdown" +- "/syntax#em) says:" +- ">" +- "Markdown treats asterisks (`*`" +- ") and underscores (`_`) as indicators of\n>" +- "emphasis. Text wrapped with one `*` or `_`" +- " will be wrapped with an HTML\n> ``" +- "tag; double `*`'s or `_`'" +- "s will be wrapped with an HTML ``\n>" +- tag. - "This is enough for most users, but these rules leave much undecided," - especially when it comes to nested emphasis. The original -- "`Markdown.pl` test suite makes it clear that triple `*" -- "**` and\n`___`" +- "`Markdown.pl` test suite makes it clear that triple" +- "`***` and\n`___`" - "delimiters can be used for strong emphasis, and most" - "implementations have also allowed the following patterns:" -- "``` markdown\n***strong emph***" +- "``` markdown" +- "***strong emph***" - "***strong** in emph*" - "***emph* in strong**" - "**in strong *emph***" @@ -4456,54 +4478,56 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The following patterns are less widely supported, but the intent" - is clear and they are useful (especially in contexts like bibliography - "entries):" -- "``` markdown\n*emph *with emph* in it*" +- "``` markdown" +- "*emph *with emph* in it*" - "**strong **with strong** in it**\n```" -- Many implementations have also restricted intraword emphasis to -- "the `*`" +- "Many implementations have also restricted intraword emphasis to\nthe `*`" - "forms, to avoid unwanted emphasis in words containing" - internal underscores. (It is best practice to put these in code - "spans, but users often do not.)" -- "``` markdown\ninternal emphasis: foo*bar*baz" +- "``` markdown" +- "internal emphasis: foo*bar*baz" - "no emphasis: foo_bar_baz\n```" - "The rules given below capture all of these patterns, while allowing" - for efficient parsing strategies that do not backtrack. -- "First, some definitions." -- "A [delimiter run](@)" +- "First, some definitions. A [delimiter run](@)" - " is either\na sequence of one or more `*`" - " characters that is not preceded or\nfollowed by a non-backslash-escaped" - "`*` character, or a sequence\nof one or more `_`" - " characters that is not preceded or followed by\na non-backslash-escaped" - "`_` character." -- "A [left-flanking delimiter run](@) is" -- "a [delimiter run] that is (1) not followed by [" +- "A [left-flanking delimiter run](@) is\na" +- "[delimiter run] that is (1) not followed by [" - "Unicode whitespace]," -- "and either (2a) not followed by a [Unicode" -- "punctuation character], or\n(2b) followed by a [" -- "Unicode punctuation character] and\npreceded by [" -- "Unicode whitespace] or a [Unicode punctuation" -- "character].\nFor purposes of this definition, the beginning and the end of" +- "and either (2a) not followed by a [" +- "Unicode punctuation character], or" +- "(2b) followed by a [Unicode punctuation character" +- "] and\npreceded by [Unicode whitespace] or a [" +- "Unicode punctuation character]." +- "For purposes of this definition, the beginning and the end of" - the line count as Unicode whitespace. -- "A [right-flanking delimiter run](@) is" -- "a [delimiter run] that is (1) not preceded by [" +- "A [right-flanking delimiter run](@) is\na" +- "[delimiter run] that is (1) not preceded by [" - "Unicode whitespace]," -- "and either (2a) not preceded by a [Unicode" -- "punctuation character], or\n(2b) preceded by a [" -- "Unicode punctuation character] and\nfollowed by [" -- "Unicode whitespace] or a [Unicode punctuation" -- "character].\nFor purposes of this definition, the beginning and the end of" +- "and either (2a) not preceded by a [" +- "Unicode punctuation character], or" +- "(2b) preceded by a [Unicode punctuation character" +- "] and\nfollowed by [Unicode whitespace] or a [" +- "Unicode punctuation character]." +- "For purposes of this definition, the beginning and the end of" - the line count as Unicode whitespace. - Here are some examples of delimiter runs. -- " - left-flanking but not right-flanking:\n\n ```" -- " ***abc\n _abc" +- "- left-flanking but not right-flanking:" +- " ```\n ***abc\n _abc" - "**\"abc\"\n _\"abc\"\n ```" -- " - right-flanking but not left-flanking:\n\n ```" -- " abc***\n abc_" +- "- right-flanking but not left-flanking:" +- " ```\n abc***\n abc_" - "\"abc\"**\n \"abc\"_\n ```" -- " - Both left and right-flanking:\n\n ```" -- " abc***def\n \"abc\"_\"def\"" -- "```" -- " - Neither left nor right-flanking:\n\n ```" -- " abc *** def\n a _ b\n ```" +- "- Both left and right-flanking:" +- " ```\n abc***def" +- "\"abc\"_\"def\"\n ```" +- "- Neither left nor right-flanking:" +- " ```\n abc *** def\n a _ b\n ```" - (The idea of distinguishing left-flanking and right-flanking - delimiter runs based on the character before and the character - "after comes from Roopesh Chander's" @@ -4516,47 +4540,53 @@ input_file: tests/inputs/markdown/commonmark_spec.md - and its rules for distinguishing left- and right-flanking runs - are a bit more complex than the ones given here.) - "The following rules define emphasis and strong emphasis:" -- "1. A single `*` character [can open emphasis](@)" -- "iff (if and only if) it is part of a [left-" -- "flanking delimiter run]." -- "2. A single `_` character [can open emphasis] iff" +- "1." +- "A single `*` character [can open emphasis](@)" +- "iff (if and only if) it is part of a [" +- "left-flanking delimiter run]." +- "2." +- "A single `_` character [can open emphasis] iff" - "it is part of a [left-flanking delimiter run]" -- "and either (a) not part of a [right-flanking" -- "delimiter run]\n or (b) part of a [" -- "right-flanking delimiter run]\n preceded by a [" +- "and either (a) not part of a [" +- "right-flanking delimiter run]\n or (b) part of a" +- "[right-flanking delimiter run]\n preceded by a [" - "Unicode punctuation character]." -- "3. A single `*` character [can close emphasis](@)" +- "3." +- "A single `*` character [can close emphasis](@)" - "iff it is part of a [right-flanking delimiter run" - "]." -- "4. A single `_` character [can close emphasis] iff" +- "4." +- "A single `_` character [can close emphasis] iff" - "it is part of a [right-flanking delimiter run]" -- "and either (a) not part of a [left-flanking" -- "delimiter run]\n or (b) part of a [" -- "left-flanking delimiter run]\n followed by a [" +- "and either (a) not part of a [" +- "left-flanking delimiter run]\n or (b) part of a" +- "[left-flanking delimiter run]\n followed by a [" - "Unicode punctuation character]." - "5." - "A double `**` [can open strong emphasis](@)" - "iff it is part of a [left-flanking delimiter run" - "]." -- "6. A double `__` [can open strong emphasis] iff" +- "6." +- "A double `__` [can open strong emphasis] iff" - "it is part of a [left-flanking delimiter run]" -- "and either (a) not part of a [right-flanking" -- "delimiter run]\n or (b) part of a [" -- "right-flanking delimiter run]\n preceded by a [" +- "and either (a) not part of a [" +- "right-flanking delimiter run]\n or (b) part of a" +- "[right-flanking delimiter run]\n preceded by a [" - "Unicode punctuation character]." - "7." - "A double `**` [can close strong emphasis](@)" - "iff it is part of a [right-flanking delimiter run" - "]." -- "8. A double `__` [can close strong emphasis] iff" +- "8." +- "A double `__` [can close strong emphasis] iff" - "it is part of a [right-flanking delimiter run]" -- "and either (a) not part of a [left-flanking" -- "delimiter run]\n or (b) part of a [" -- "left-flanking delimiter run]\n followed by a [" +- "and either (a) not part of a [" +- "left-flanking delimiter run]\n or (b) part of a" +- "[left-flanking delimiter run]\n followed by a [" - "Unicode punctuation character]." - "9." -- "Emphasis begins with a delimiter that [can open emphasis] and" -- "ends\n with a delimiter that [can close emphasis]" +- "Emphasis begins with a delimiter that [can open emphasis]" +- " and ends\n with a delimiter that [can close emphasis]" - ", and that uses the same\n character (`_` or `*`" - ) as the opening delimiter. The - "opening and closing delimiters must belong to separate\n [delimiter runs" @@ -4564,10 +4594,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "open and close emphasis, then the sum of the lengths of the" - delimiter runs containing the opening and closing delimiters - "must not be a multiple of 3 unless both lengths are\n multiples of 3." -- 10. Strong emphasis begins with a delimiter that -- " [can open strong emphasis] and ends with a delimiter that\n [" -- "can close strong emphasis], and that uses the same character\n (`_`" -- "or `*`" +- "10." +- "Strong emphasis begins with a delimiter that\n [can open strong emphasis]" +- " and ends with a delimiter that\n [can close strong emphasis]" +- ", and that uses the same character\n (`_` or `*`" - ) as the opening delimiter. The - "opening and closing delimiters must belong to separate\n [delimiter runs" - "]. If one of the delimiters can both open" @@ -4575,37 +4605,45 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the delimiter runs containing the opening and closing - delimiters must not be a multiple of 3 unless both lengths - are multiples of 3. -- "11. A literal `*` character cannot occur at the beginning or end of" -- "`*`-delimited emphasis or `**`-" -- "delimited strong emphasis, unless it\n is backslash-escaped." -- "12. A literal `_` character cannot occur at the beginning or end of" -- "`_`-delimited emphasis or `__`-" -- "delimited strong emphasis, unless it\n is backslash-escaped." +- "11." +- "A literal `*` character cannot occur at the beginning or end of" +- "`*`-delimited emphasis or `**`" +- "-delimited strong emphasis, unless it\n is backslash-escaped." +- "12." +- "A literal `_` character cannot occur at the beginning or end of" +- "`_`-delimited emphasis or `__`" +- "-delimited strong emphasis, unless it\n is backslash-escaped." - "Where rules 1--12 above are compatible with multiple parsings," - "the following principles resolve ambiguity:" -- 13. The number of nestings should be minimized. -- "Thus, for example,\n an interpretation" -- "`...` is always preferred to" +- "13." +- "The number of nestings should be minimized. Thus, for example," +- "an interpretation `...`" +- is always preferred to - "`...`." - "14." -- "An interpretation `..." -- "` is always\n preferred to" +- An interpretation +- "`...` is always\n preferred to" - "`...`." -- "15. When two potential emphasis or strong emphasis spans overlap," +- "15." +- "When two potential emphasis or strong emphasis spans overlap," - so that the second begins before the first ends and ends after - "the first ends, the first takes precedence. Thus, for example," -- "`*foo _bar* baz_` is parsed as `" -- "foo _bar baz_` rather" -- "than `*foo bar* baz" -- "`." -- 16. When there are two potential emphasis or strong emphasis spans +- "`*foo _bar* baz_` is parsed as" +- "`foo _bar baz_`" +- " rather\n than" +- "`*foo bar* baz`" +- "." +- "16." +- When there are two potential emphasis or strong emphasis spans - "with the same closing delimiter, the shorter one (the one that" - "opens later) takes precedence. Thus, for example," -- "`**foo **bar baz**` is parsed" -- "as `**foo bar baz" -- "`\n rather than" +- "`**foo **bar baz**`" +- is parsed as +- "`**foo bar baz`" +- rather than - "`foo **bar baz`" - "." - "17." @@ -4627,8 +4665,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "This is not emphasis, because the opening `*` is followed by" -- "whitespace, and hence not part of a [left-flanking" -- "delimiter run]:" +- "whitespace, and hence not part of a [" +- "left-flanking delimiter run]:" - "````````````````" - "````````````````" - example @@ -4783,8 +4821,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "This is not emphasis, because the second `*` is" - preceded by punctuation and followed by an alphanumeric -- "(hence it is not part of a [right-flanking delimiter" -- "run]:" +- "(hence it is not part of a [" +- "right-flanking delimiter run]:" - "````````````````" - "````````````````" - example @@ -5234,9 +5272,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    " - "````````````````" - "````````````````" -- When the lengths of the interior closing and opening -- delimiter runs are *both* -- " multiples of 3, though,\nthey can match to create emphasis:" +- "When the lengths of the interior closing and opening\ndelimiter runs are" +- "*both* multiples of 3, though," +- "they can match to create emphasis:" - "````````````````" - "````````````````" - example @@ -5460,8 +5498,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "Note that when delimiters do not match evenly, Rule 11 determines" -- "that the excess literal `*`" -- " characters will appear outside of the\nemphasis, rather than inside it:" +- "that the excess literal `*` characters will appear outside of the" +- "emphasis, rather than inside it:" - "````````````````" - "````````````````" - example @@ -5566,8 +5604,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "Note that when delimiters do not match evenly, Rule 12 determines" -- "that the excess literal `_`" -- " characters will appear outside of the\nemphasis, rather than inside it:" +- "that the excess literal `_` characters will appear outside of the" +- "emphasis, rather than inside it:" - "````````````````" - "````````````````" - example @@ -5800,73 +5838,81 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Links" -- "A link contains [link text] (the visible text), a [link" -- "destination]" +- "A link contains [link text] (the visible text), a [" +- "link destination]" - "(the URI that is the link destination), and optionally a [" - "link title].\nThere are two basic kinds of links in Markdown. In" - "[inline links] the" - "destination and title are given immediately after the link text. In\n[reference links]" - " the destination and title are defined elsewhere in\nthe document." - "A [link text](@) consists of a sequence of zero or more" -- "inline elements enclosed by square brackets (`[` and `]`)" -- ". The\nfollowing rules apply:" -- "- Links may not contain other links, at any level of nesting. If" +- "inline elements enclosed by square brackets (`[` and `]`" +- "). The\nfollowing rules apply:" +- "-" +- "Links may not contain other links, at any level of nesting. If" - multiple otherwise valid link definitions appear nested inside each - "other, the inner-most definition is used." -- "- Brackets are allowed in the [link text] only if (a" -- ) they +- "-" +- "Brackets are allowed in the [link text]" +- only if (a) they - are backslash-escaped or (b) they appear as a matched pair of - "brackets,\n with an open bracket `[`" - ", a sequence of zero or more inlines, and\n a close bracket" - "`]`." -- "- Backtick [code spans], [autolinks], and raw" -- "[HTML tags] bind more tightly" +- "-" +- "Backtick [code spans], [autolinks], and raw [" +- "HTML tags] bind more tightly" - "than the brackets in link text. Thus, for example," -- "`` [foo`]` `` could not be a link text" -- ", since the second `]`\n is part of a code span." -- "- The brackets in link text bind more tightly than markers for" -- "[emphasis and strong emphasis]. Thus, for example," +- "`` [foo`]` ``" +- "could not be a link text, since the second `]`" +- is part of a code span. +- "-" +- "The brackets in link text bind more tightly than markers for\n [emphasis and strong emphasis" +- "]. Thus, for example," - "`*[foo*](url)` is a link." - "A [link destination](@) consists of either" -- "- a sequence of zero or more characters between an opening `<` and a" -- " closing `>` that contains no line endings or unescaped\n `<`" -- "or `>` characters, or" -- "- a nonempty sequence of characters that does not start with `<`" -- ",\n does not include [ASCII control characters][" -- "ASCII control character]\n or [space]" +- "-" +- "a sequence of zero or more characters between an opening `<` and a\n closing" +- "`>` that contains no line endings or unescaped\n `<` or" +- "`>` characters, or" +- "-" +- "a nonempty sequence of characters that does not start with `<`," +- "does not include [ASCII control characters][ASCII control character" +- "]\n or [space]" - "character, and includes parentheses only if (a) they are" - backslash-escaped or (b) they are part of a balanced pair of - unescaped parentheses. - (Implementations may impose limits on parentheses nesting to - "avoid performance issues, but at least three levels of nesting\n should be supported.)" - "A [link title](@) consists of either" -- "- a sequence of zero or more characters between straight double-quote" -- "characters (`\"`), including a `\"` character only if it" -- "is\n backslash-escaped, or" -- "- a sequence of zero or more characters between straight single-quote" -- "characters (`'`), including a `'` character only if it" -- "is\n backslash-escaped, or" -- "- a sequence of zero or more characters between matching parentheses" -- "(`(...)`), including a `(` or" +- "-" +- "a sequence of zero or more characters between straight double-quote\n characters (" +- "`\"`), including a `\"` character only if it is" +- "backslash-escaped, or" +- "-" +- "a sequence of zero or more characters between straight single-quote\n characters (" +- "`'`), including a `'` character only if it is" +- "backslash-escaped, or" +- "-" +- "a sequence of zero or more characters between matching parentheses\n (" +- "`(...)`), including a `(` or" - "`)` character only if it is\n backslash-escaped." -- "Although [link titles] may span multiple lines, they may not contain" -- "a [blank line]." +- "Although [link titles] may span multiple lines, they may not contain\na [" +- "blank line]." - "An [inline link](@) consists of a [link text]" -- "followed immediately\nby a left parenthesis `(`" -- ", an optional [link destination], an optional\n[link title]" -- ", and a right parenthesis `)`" -- "." +- " followed immediately\nby a left parenthesis `(`, an optional [link destination" +- "], an optional\n[link title], and a right parenthesis" +- "`)`." - "These four components may be separated by spaces, tabs, and up to one" - "line\nending.\nIf both [link destination] and [link title]" -- "are present, they *must*" -- " be\nseparated by spaces, tabs, and up to one line ending." -- "The link's text consists of the inlines contained" -- "in the [link text] (excluding the enclosing square brackets)" -- ".\nThe link'" +- "are present, they *must* be" +- "separated by spaces, tabs, and up to one line ending." +- "The link's text consists of the inlines contained\nin the [link text" +- "] (excluding the enclosing square brackets).\nThe link'" - "s URI consists of the link destination, excluding enclosing" -- "`<...>` if present, with backslash-escapes in" -- "effect as described\nabove. The link'" -- "s title consists of the link title, excluding its" +- "`<...>`" +- " if present, with backslash-escapes in effect as described\nabove. The link" +- "'s title consists of the link title, excluding its" - "enclosing delimiters, with backslash-escapes in effect" - "as described\nabove.\n\nHere is a simple inline link:" - "````````````````" @@ -6149,20 +6195,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - link

    - "````````````````" - "````````````````" -- "(Note: `Markdown.pl` did allow double quotes inside a" -- "double-quoted\ntitle, and its test suite included a test demonstrating this." +- "(Note: `Markdown.pl`" +- did allow double quotes inside a double-quoted +- "title, and its test suite included a test demonstrating this." - But it is hard to see a good rationale for the extra complexity this - "brings, since there are already many ways---backslash escaping," - "entity and numeric character references, or using a different" - quote type for the enclosing title---to write titles containing -- "double quotes. `Markdown.pl`" -- "'s handling of titles has a number" +- "double quotes. `Markdown.pl`'" +- s handling of titles has a number - "of other strange features. For example, it allows single-quoted" - "titles in inline links, but not reference links. And, in" - "reference links but not inline links, it allows a title to begin\nwith" - "`\"` and end with `)`." -- "`Markdown.pl`" -- 1.0.1 even allows +- "`Markdown.pl` 1.0.1 even allows" - "titles with no closing quotation mark, though 1.0.2b8" - "does not.\nIt seems preferable to adopt a simple, rational rule that works" - the same way in inline links and link reference definitions.) @@ -6332,14 +6378,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "There are three kinds of [reference link](@)s:" -- "[full](#full-reference-link), [collapsed](" -- "#collapsed-reference-link),\nand" +- "[full](#full-reference-link)," +- "[collapsed](#collapsed-reference-link),\nand" - "[shortcut](#shortcut-reference-link)." -- "A [full reference link](@)" -- "consists of a [link text] immediately followed by a [link label]\nthat" -- "[matches] a [link reference definition] elsewhere in the document." -- "A [link label](@) begins with a left bracket (`[" -- "`) and ends\nwith the first right bracket (`]`" +- "A [full reference link](@)\nconsists of a [link text]" +- " immediately followed by a [link label]\nthat [matches] a [" +- "link reference definition] elsewhere in the document." +- "A [link label](@) begins with a left bracket (" +- "`[`) and ends\nwith the first right bracket (`]`" - ) that is not backslash-escaped. - "Between these brackets there must be at least one character that is not a space," - "tab, or line ending." @@ -6349,8 +6395,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "One label [matches](@)" - another just in case their normalized forms are equal. To normalize a - "label, strip off the opening and closing brackets,\nperform the" -- "*Unicode case fold*" -- ", strip leading and trailing" +- "*Unicode case fold*, strip leading and trailing" - "spaces, tabs, and line endings, and collapse consecutive internal" - "spaces, tabs, and line endings to a single space." - "If there are multiple\nmatching reference link definitions, the one that comes first in the" @@ -6367,8 +6412,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\"title\">foo

    " - "````````````````" - "````````````````" -- "The rules for the [link text] are the same as with" -- "[inline links]. Thus:" +- "The rules for the [link text] are the same as with\n[" +- "inline links]. Thus:" - "The link text may contain balanced brackets, but not unbalanced ones," - "unless they are escaped:" - "````````````````" @@ -6549,12 +6594,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "]. If whitespace is allowed between the" - "link text and the link label, then in the following we will have" - "a single reference link, not two shortcut reference links, as\nintended:" -- "``` markdown\n[foo]\n[bar]" -- "[foo]: /url1" +- "``` markdown" +- "[foo]\n[bar]\n\n[foo]: /url1" - "[bar]: /url2\n```" - "(Note that [shortcut reference links] were introduced by Gruber" -- "himself in a beta version of `Markdown.pl`, but never" -- "included\nin the official syntax description. Without shortcut reference" +- "himself in a beta version of `Markdown.pl`" +- ", but never included\nin the official syntax description. Without shortcut reference" - "links, it is harmless to allow space between the link text and" - "link label; but once shortcut references are introduced, it is" - "too dangerous to allow this, as it frequently leads to" @@ -6630,8 +6675,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\\

    " - "````````````````" - "````````````````" -- "A [link label] must contain at least one character that is not a space" -- ", tab, or\nline ending:" +- "A [link label]" +- "must contain at least one character that is not a space, tab, or" +- "line ending:" - "````````````````" - "````````````````" - example @@ -6648,14 +6694,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "]: /uri

    " - "````````````````" - "````````````````" -- "A [collapsed reference link](@)" -- "consists of a [link label] that [matches] a\n[link reference definition" -- "] elsewhere in the\ndocument, followed by the string `[]`" -- ".\nThe contents of the link label are parsed as inlines," +- "A [collapsed reference link](@)\nconsists of a [link label]" +- " that [matches] a\n[link reference definition] elsewhere in the" +- "document, followed by the string `[]`." +- "The contents of the link label are parsed as inlines," - "which are used as the link's text. The link'" - "s URI and title are\nprovided by the matching reference link definition. Thus," -- "`[foo][]` is equivalent to `[foo]" -- "[foo]`." +- "`[foo][]` is equivalent to" +- "`[foo][foo]`." - "````````````````" - "````````````````" - example @@ -6696,15 +6742,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\"title\">foo\n[]

    " - "````````````````" - "````````````````" -- "A [shortcut reference link](@)" -- "consists of a [link label] that [matches] a\n[link reference definition" -- "] elsewhere in the\ndocument and is not followed by `[]`" -- or a link label. +- "A [shortcut reference link](@)\nconsists of a [link label" +- "] that [matches] a\n[link reference definition] elsewhere in the" +- "document and is not followed by `[]` or a link label." - "The contents of the link label are parsed as inlines," - "which are used as the link's text. The link'" - "s URI and title\nare provided by the matching link reference definition.\nThus," -- "`[foo]` is equivalent to `[foo][]" -- "`." +- "`[foo]` is equivalent to" +- "`[foo][]`." - "````````````````" - "````````````````" - example @@ -6814,8 +6859,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - foo(not a link)

    - "````````````````" - "````````````````" -- "In the following case `[bar][baz]` is parsed" -- "as a reference,\n`[foo]` as normal text:" +- "In the following case `[bar][baz]`" +- " is parsed as a reference,\n`[foo]`" +- "as normal text:" - "````````````````" - "````````````````" - example @@ -6825,8 +6871,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "url\">bar

    " - "````````````````" - "````````````````" -- "Here, though, `[foo][bar]` is parsed" -- "as a reference, since\n`[bar]` is defined:" +- "Here, though, `[foo][bar]`" +- " is parsed as a reference, since\n`[bar]` is defined:" - "````````````````" - "````````````````" - example @@ -6838,9 +6884,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "url1\">baz

    " - "````````````````" - "````````````````" -- "Here `[foo]` is not parsed as a shortcut reference" -- ", because it\nis followed by a link label (even though" -- "`[bar]` is not defined):" +- "Here `[foo]`" +- "is not parsed as a shortcut reference, because it" +- "is followed by a link label (even though `[bar]`" +- "is not defined):" - "````````````````" - "````````````````" - example @@ -6854,11 +6901,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Images" - "Syntax for images is like the syntax for links, with one" - "difference. Instead of [link text], we have an" -- "[image description](@)" -- ". The rules for this are the\nsame as for [link text]" -- ", except that (a) an\nimage description starts with `![`" -- "rather than `[`" -- ", and\n(b) an image description may contain links." +- "[image description](@). The rules for this are the" +- "same as for [link text], except that (a) an" +- "image description starts with `![` rather than `[`, and" +- (b) an image description may contain links. - An image description has inline elements - "as its contents. When an image is rendered to HTML," - "this is standardly used as the image's `alt` attribute." @@ -6903,8 +6949,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Though this spec is concerned with parsing, not rendering, it is" - "recommended that in rendering to HTML, only the plain string content\nof the" - "[image description] be used. Note that in" -- "the above example, the alt attribute's value is `foo bar" -- "`, not `foo\n[bar](/url)` or" +- "the above example, the alt attribute's value is" +- "`foo bar`, not" +- "`foo\n[bar](/url)` or" - "`foo bar" - "`. Only the plain string" - "content is rendered, without formatting." @@ -7066,8 +7113,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "=\"Foo\" title=\"title\" />

    " - "````````````````" - "````````````````" -- "If you just want a literal `!" -- "` followed by bracketed text, you can\nbackslash-escape the opening" +- "If you just want a literal `!`" +- " followed by bracketed text, you can\nbackslash-escape the opening" - "`[`:" - "````````````````" - "````````````````" @@ -7077,8 +7124,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    ![foo]

    " - "````````````````" - "````````````````" -- "If you want a link after a literal `!" -- "`, backslash-escape the\n`!`:" +- "If you want a link after a literal `!`" +- ", backslash-escape the\n`!`:" - "````````````````" - "````````````````" - example @@ -7089,28 +7136,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Autolinks" -- "[Autolink](@)s are absolute URIs and email addresses" -- "inside\n`<` and `>`" +- "[Autolink](@)" +- "s are absolute URIs and email addresses inside\n`<` and" +- "`>`" - ". They are parsed as links, with the URL or email address" - as the link label. -- "A [URI autolink](@) consists of `<`," -- "followed by an\n[absolute URI] followed by `>`" +- "A [URI autolink](@) consists of `<`" +- ", followed by an\n[absolute URI] followed by `>`" - ". It is parsed as" -- "a link to the URI, with the URI as the link's" -- label. +- "a link to the URI, with the URI as the link'" +- s label. - "An [absolute URI](@)," - "for these purposes, consists of a [scheme] followed by a colon (" -- "`:`" -- ")\nfollowed by zero or more characters other than [ASCII control\ncharacters]" -- "[ASCII control character], [space], `<`," -- "and `>`" -- ".\nIf the URI includes these characters, they must be percent-encoded" +- "`:`)\nfollowed by zero or more characters other than [" +- "ASCII control\ncharacters][ASCII control character], [space" +- "], `<`, and `>`." +- "If the URI includes these characters, they must be percent-encoded" - "(e.g. `%20` for a space)." -- "For purposes of this spec, a [scheme](@) is any" -- "sequence\nof 2--32 characters beginning with an ASCII letter and followed" +- "For purposes of this spec, a [scheme](@)" +- " is any sequence\nof 2--" +- 32 characters beginning with an ASCII letter and followed - "by any combination of ASCII letters, digits, or the symbols plus\n(" -- "\"+\"), period (\".\"), or" -- "hyphen (\"-\")." +- "\"+\"), period (\".\"" +- "), or hyphen (\"-\")." - "Here are some valid autolinks:" - "````````````````" - "````````````````" @@ -7216,18 +7264,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - p> - "````````````````" - "````````````````" -- "An [email autolink](@)" -- "consists of `<`, followed by an [email address],\nfollowed by" -- "`>`" +- "An [email autolink](@)\nconsists of `<`" +- ", followed by an [email address],\nfollowed by `>`" - ". The link's label is the email address,\nand the URL is" - "`mailto:` followed by the email address." - "An [email address](@)," - "for these purposes, is anything that matches\nthe" -- "[non-normative regex from the HTML5" -- "spec](https://" -- html.spec.whatwg.org/multipage/ -- "forms.html#e-mail-state-(type=email))" -- ":" +- "[non-normative regex from the HTML5\nspec" +- "](https://html.spec.whatwg.org" +- "/multipage/forms.html#e-mail-state-(type" +- "=email)):" - "/^[a-zA-Z0-9.!" - "#$%&'*+/=?" - "^_`{|}~-]+@[a-zA" @@ -7314,8 +7360,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## Raw HTML" -- "Text between `<` and `>` that looks like an HTML" -- tag is parsed as a +- "Text between `<` and `>`" +- that looks like an HTML tag is parsed as a - raw HTML tag and will be rendered in HTML without escaping. - "Tag and attribute names are not limited to current HTML tags," - "so custom tags (and even, say, DocBook tags) may be" @@ -7323,11 +7369,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A [tag name](@) consists of an ASCII letter" - "followed by zero or more ASCII letters, digits, or" - "hyphens (`-`)." -- "An [attribute](@) consists of spaces, tabs, and up" -- "to one line ending,\nan [attribute name], and an optional\n[" -- "attribute value specification]." -- "An [attribute name](@)" -- "consists of an ASCII letter, `_`, or `:`" +- "An [attribute](@)" +- " consists of spaces, tabs, and up to one line ending,\nan [" +- "attribute name], and an optional\n[attribute value specification]." +- "An [attribute name](@)\nconsists of an ASCII letter," +- "`_`, or `:`" - ", followed by zero or more ASCII\nletters, digits, `_`" - ", `.`, `:`, or `-`" - ". (Note: This is the XML" @@ -7335,46 +7381,47 @@ input_file: tests/inputs/markdown/commonmark_spec.md - HTML5 is laxer.) - "An [attribute value specification](@)" - "consists of optional spaces, tabs, and up to one line ending,\na" -- "`=` character, optional spaces, tabs, and up to one line" -- "ending,\nand an [attribute value]." -- "An [attribute value](@)" -- "consists of an [unquoted attribute value],\na [" -- "single-quoted attribute value], or a [double-quoted attribute value]" -- "." +- "`=`" +- "character, optional spaces, tabs, and up to one line ending," +- "and an [attribute value]." +- "An [attribute value](@)\nconsists of an [" +- "unquoted attribute value],\na [single-quoted attribute value]" +- ", or a [double-quoted attribute value]." - "An [unquoted attribute value](@)" - is a nonempty string of characters not - "including spaces, tabs, line endings, `\"`, `'`" -- ", `=`, `<`, `>`, or ``" -- "` ``." -- "A [single-quoted attribute value](@)" -- "consists of `'`, zero or more\ncharacters not including `'`" -- ", and a final `'`." -- "A [double-quoted attribute value](@)" -- "consists of `\"`, zero or more\ncharacters not including `\"`" -- ", and a final `\"`." -- "An [open tag](@) consists of a `<` character," -- "a [tag name],\nzero or more [attributes]" +- ", `=`, `<`, `>`, or" +- "`` ` ``." +- "A [single-quoted attribute value](@)\nconsists of `'`" +- ", zero or more\ncharacters not including `'`, and a final" +- "`'`." +- "A [double-quoted attribute value](@)\nconsists of `\"`" +- ", zero or more\ncharacters not including `\"`, and a final" +- "`\"`." +- "An [open tag](@) consists of a `<`" +- " character, a [tag name],\nzero or more [attributes]" - ", optional spaces, tabs, and up to one line ending,\nan optional" - "`/` character, and a `>` character." - "A [closing tag](@) consists of the string ``." -- "An [HTML comment](@) consists of ``, ``, or `<" -- "!--`, a string of\ncharacters not including the string" -- "`-->`, and `-->` (see the" +- "An [HTML comment](@) consists of" +- "``, ``" +- ", or ``, and" +- "`-->` (see the" - "[HTML spec](https://" - html.spec.whatwg.org/multipage/ - "parsing.html#markup-declaration-open-state))." -- "A [processing instruction](@)\nconsists of the string ``" +- "A [processing instruction](@)\nconsists of the string ``" - ", and the string\n`?>`." -- "A [declaration](@) consists of the string ``, and the character `>`." -- "A [CDATA section](@) consists of" -- "the string ``" - ", and the string `]]>`." - "An [HTML tag](@) consists of an [open tag" @@ -7579,9 +7626,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - A line ending (not in a code span or HTML tag) that - is preceded - by two or more spaces and does not occur at the end of a block -- "is parsed as a [hard line break](@)" -- " (rendered\nin HTML as a `
    `" -- "tag):" +- "is parsed as a [hard line break](@) (rendered" +- "in HTML as a `
    ` tag):" - "````````````````" - "````````````````" - example @@ -7589,8 +7635,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - baz

    - "````````````````" - "````````````````" -- "For a more visible alternative, a backslash before the" -- "[line ending] may be used instead of two or more spaces:" +- "For a more visible alternative, a backslash before the\n[line ending]" +- "may be used instead of two or more spaces:" - "````````````````" - "````````````````" - example @@ -7754,7 +7800,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - In this appendix we describe some features of the parsing strategy - used in the CommonMark reference implementations. - "## Overview\n\nParsing has two phases:" -- "1. In the first phase, lines of input are consumed and the block" +- "1." +- "In the first phase, lines of input are consumed and the block" - "structure of the document---its division into paragraphs, block quotes," - "list items, and so on---" - is constructed. Text is assigned to these @@ -7768,17 +7815,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "At each point in processing, the document is represented as a tree of" - "**blocks**. The root of the tree is a `document`" - " block. The `document`\nmay have any number of other blocks as" -- "**children**" -- ". These children" +- "**children**. These children" - "may, in turn, have other blocks as children." - "The last child of a block\nis normally considered **open**" - ", meaning that subsequent lines of input" - can alter its contents. (Blocks that are not open are -- "**closed**" -- ".)" +- "**closed**.)" - "Here, for example, is a possible document tree, with the open blocks" - "marked by arrows:" -- "``` tree\n-> document\n -> block_quote\n paragraph" +- "``` tree" +- "-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - "-> list (type=bullet tight=true bullet_char=-" - ")\n list_item\n paragraph" @@ -7789,32 +7835,35 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Each line that is processed has an effect on this tree. The line is - "analyzed and, depending on its contents, the document may be altered" - "in one or more of the following ways:" -- "1. One or more open blocks may be closed.\n2." -- "One or more new blocks may be created as children of the\n last open block." +- 1. One or more open blocks may be closed. +- 2. One or more new blocks may be created as children of the +- last open block. - 3. Text may be added to the last (deepest) open block remaining - on the tree. - "Once a line has been incorporated into the tree in this way," - "it can be discarded, so input can be read in a stream." - "For each line, we follow this procedure:" -- "1. First we iterate through the open blocks, starting with the" +- "1." +- "First we iterate through the open blocks, starting with the" - "root document, and descending through last children down to the last" - open block. Each block imposes a condition that the line must satisfy - "if the block is to remain open. For example, a block quote requires a" -- "`>`" -- character. A paragraph requires a non-blank line. +- "`>` character. A paragraph requires a non-blank line." - In this phase we may match all or just some of the open - blocks. - "But we cannot close unmatched blocks yet, because we may have a\n[" - "lazy continuation line]." -- "2. Next, after consuming the continuation markers for existing" +- "2." +- "Next, after consuming the continuation markers for existing" - "blocks, we look for new block starts (e.g. `>`" - for a block quote). - "If we encounter a new block start, we close any blocks unmatched" - in step 1 before creating the new block as a child of the last - matched container block. -- "3. Finally, we look at the remainder of the line (after block" -- "markers like `>`, list markers, and indentation have been consumed" -- ").\nThis is text that can be incorporated into the last open" +- "3." +- "Finally, we look at the remainder of the line (after block\nmarkers like" +- "`>`, list markers, and indentation have been consumed)." +- This is text that can be incorporated into the last open - "block (a paragraph, code block, heading, or raw HTML)" - "." - Setext headings are formed when we see a line of a paragraph @@ -7824,8 +7873,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "one or more reference link definitions. Any remainder becomes a\nnormal paragraph." - We can see how this works by considering how the tree above is - "generated by four lines of Markdown:" -- "``` markdown\n> Lorem ipsum dolor" -- sit amet. +- "``` markdown" +- "> Lorem ipsum dolor\nsit amet." - "> - Qui *quodsi iracundia*" - "> - aliquando id\n```" - "At the outset, our document model is just" @@ -7833,39 +7882,42 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The first line of our text," - "``` markdown\n> Lorem ipsum dolor\n```" - "causes a `block_quote` block to be created as a child of our" -- "open `document` block, and a `paragraph`" -- " block as a child of\nthe `block_quote`" -- ". Then the text is added to the last open\nblock, the `paragraph`" -- ":" -- "``` tree\n-> document\n -> block_quote\n -> paragraph" +- "open `document` block, and a `paragraph` block as a child of" +- "the `block_quote`. Then the text is added to the last open" +- "block, the `paragraph`:" +- "``` tree" +- "-> document\n -> block_quote\n -> paragraph" - " \"Lorem ipsum dolor\"\n```\n\nThe next line," - "``` markdown\nsit amet.\n```" -- "is a \"lazy continuation\" of the open `paragraph`, so it gets" -- "added\nto the paragraph's text:" -- "``` tree\n-> document\n -> block_quote\n -> paragraph" +- "is a \"lazy continuation\" of the open `paragraph`" +- ", so it gets added\nto the paragraph's text:" +- "``` tree" +- "-> document\n -> block_quote\n -> paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - "```\n\nThe third line," - "``` markdown" - "> - Qui *quodsi iracundia*" - "```" - "causes the `paragraph` block to be closed, and a new `list`" -- "block\nopened as a child of the `block_quote`. A" +- " block\nopened as a child of the `block_quote`. A" - "`list_item` is also\nadded as a child of the `list`" - ", and a `paragraph` as a child of\nthe `list_item`" - ". The text is then added to the new `paragraph`:" -- "``` tree\n-> document\n -> block_quote\n paragraph" +- "``` tree" +- "-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - "-> list (type=bullet tight=true bullet_char=-" - ")\n -> list_item\n -> paragraph" - "\"Qui *quodsi iracundia*\"" - "```\n\nThe fourth line," - "``` markdown\n> - aliquando id\n```" -- "causes the `list_item` (and its child the `paragraph`)" -- "to be closed,\nand a new `list_item`" +- "causes the `list_item` (and its child the `paragraph`" +- ") to be closed,\nand a new `list_item`" - "opened up as child of the `list`. A `paragraph`" - "is added as a child of the new `list_item`" - ", to contain the text.\nWe thus obtain the final tree:" -- "``` tree\n-> document\n -> block_quote\n paragraph" +- "``` tree" +- "-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - "-> list (type=bullet tight=true bullet_char=-" - ")\n list_item\n paragraph" @@ -7878,16 +7930,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - string contents of paragraphs and headings as inlines. At this - "point we have seen all the link reference definitions, so we can" - resolve reference links as we go. -- "``` tree\ndocument\n block_quote\n paragraph" -- " str \"Lorem ipsum dolor\"\n softbreak" -- "str \"sit amet.\"" +- "``` tree" +- "document\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"" +- " softbreak\n str \"sit amet.\"" - list (type=bullet tight=true bullet_char=-) - " list_item\n paragraph\n str \"Qui \"\n emph" - " str \"quodsi iracundia\"\n list_item\n paragraph" - " str \"aliquando id\"\n```" - "Notice how the [line ending] in the first paragraph has" -- "been parsed as a `softbreak`, and the asterisks" -- "in the first list item\nhave become an `emph`." +- "been parsed as a `softbreak`" +- ", and the asterisks in the first list item\nhave become an" +- "`emph`." - "### An algorithm for parsing nested emphasis and links" - "By far the trickiest part of inline parsing is handling emphasis," - "strong emphasis, links, and images. This is done using the following\nalgorithm." @@ -7895,47 +7948,53 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- a run of `*` or `_` characters, or" - "- a `[` or `![`" - "we insert a text node with these symbols as its literal content, and we" -- "add a pointer to this text node to the [delimiter stack]" -- (@). +- add a pointer to this text node to the +- "[delimiter stack](@)." - "The [delimiter stack] is a doubly linked list. Each" - "element contains a pointer to a text node, plus information about" -- "- the type of delimiter (`[`, `![" -- "`, `*`, `_`)" +- "- the type of delimiter (`[`, `![`" +- ", `*`, `_`)" - "- the number of delimiters," -- "- whether the delimiter is \"active\" (all are active to start" -- "), and" +- "- whether the delimiter is \"active\"" +- "(all are active to start), and" - "- whether the delimiter is a potential opener, a potential closer," - or both (which depends on what sort of characters precede - and follow the delimiters). -- "When we hit a `]` character, we call the *look for link" -- "or image*\nprocedure (see below)." +- "When we hit a `]` character, we call the" +- "*look for link or image*\nprocedure (see below)." - "When we hit the end of the input, we call the *process emphasis*" -- "procedure (see below), with `stack_bottom` = NULL" -- "." +- "procedure (see below), with `stack_bottom`" +- "= NULL." - "#### *look for link or image*" - "Starting at the top of the delimiter stack, we look backwards" - "through the stack for an opening `[` or `![`" - delimiter. - "- If we don't find one, we return a literal text node `" - "]`." -- "- If we do find one, but it's not *active*," -- we remove the inactive +- "-" +- "If we do find one, but it's not *active*" +- ", we remove the inactive" - "delimiter from the stack, and return a literal text node `]`" - "." -- "- If we find one and it's active, then we parse ahead" -- to see if +- "-" +- "If we find one and it'" +- "s active, then we parse ahead to see if" - "we have an inline link/image, reference link/image, collapsed reference" - "link/image, or shortcut reference link/image." -- "+ If we don't, then we remove the opening delimiter from" -- "the\n delimiter stack and return a literal text node `]`." +- + +- "If we don't, then we remove the opening delimiter from the" +- "delimiter stack and return a literal text node `]`." - "+ If we do, then" -- "* We return a link or image node whose children are the inlines" +- "*" +- We return a link or image node whose children are the inlines - after the text node pointed to by the opening delimiter. -- "* We run *process emphasis* on these inlines, with the `[" -- "` opener\n as `stack_bottom`." +- "*" +- "We run *process emphasis* on these inlines, with the `[`" +- " opener\n as `stack_bottom`." - "* We remove the opening delimiter." -- "* If we have a link (and not an image), we also set" -- "all\n `[` delimiters before the opening delimiter to" +- "*" +- "If we have a link (and not an image), we also set all" +- "`[` delimiters before the opening delimiter to" - "*inactive*. (This\n will prevent us from getting links within links.)" - "#### *process emphasis*" - "Parameter `stack_bottom` sets a lower bound to how far we" @@ -7943,45 +8002,55 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ". If it is NULL, we can" - "go all the way to the bottom. Otherwise, we stop before\nvisiting" - "`stack_bottom`." -- "Let `current_position` point to the element on the [delimiter" -- "stack]\njust above `stack_bottom` (or the first element if" -- "`stack_bottom`\nis NULL)." +- "Let `current_position` point to the element on the [" +- "delimiter stack]\njust above `stack_bottom`" +- "(or the first element if `stack_bottom`" +- is NULL). - "We keep track of the `openers_bottom` for each delimiter" -- "type (`*`, `_`), indexed to the length" -- of the closing delimiter run +- "type (`*`, `_`" +- "), indexed to the length of the closing delimiter run" - (modulo 3) and to whether the closing delimiter can also - "be an\nopener. Initialize this to `stack_bottom`." - "Then we repeat the following until we run out of potential\nclosers:" -- "- Move `current_position` forward in the delimiter stack (if" -- "needed)\n until we find the first potential closer with delimiter `*`" -- "or `_`" -- ".\n (This will be the potential closer closest\n to the beginning of the input" -- "-- the first one in parse order.)" -- "- Now, look back in the stack (staying above `stack_bottom`" -- "and\n the `openers_bottom`" -- " for this delimiter type) for the\n first matching potential opener (\"matching" -- "\" means same delimiter).\n\n- If one is found:" -- "+ Figure out whether we have emphasis or strong emphasis:" +- "-" +- "Move `current_position`" +- forward in the delimiter stack (if needed) +- "until we find the first potential closer with delimiter `*` or" +- "`_`.\n (This will be the potential closer closest" +- to the beginning of the input -- +- the first one in parse order.) +- "-" +- "Now, look back in the stack (staying above `stack_bottom` and" +- "the `openers_bottom` for this delimiter type) for the" +- "first matching potential opener (\"matching\" means same delimiter)." +- "- If one is found:" +- + +- "Figure out whether we have emphasis or strong emphasis:" - "if both closer and opener spans have length >= 2, we have" - "strong, otherwise regular." -- "+ Insert an emph or strong emph node accordingly, after" +- + +- "Insert an emph or strong emph node accordingly, after" - the text node corresponding to the opener. -- + Remove any delimiters between the opener and closer from +- + +- Remove any delimiters between the opener and closer from - the delimiter stack. -- + Remove 1 (for regular emph) or 2 (for strong +- + +- Remove 1 (for regular emph) or 2 (for strong - emph) delimiters - from the opening and closing text nodes. If they become empty - "as a result, remove them and remove the corresponding element" - "of the delimiter stack. If the closing node is removed, reset" - "`current_position` to the next element in the stack." - "- If none is found:" -- "+ Set `openers_bottom` to the element before `current_position" -- "`." +- + +- "Set `openers_bottom` to the element before `current_position`" +- "." - (We know that there are no openers for this kind of closer up to - "and\n including this point, so this puts a lower bound on future searches.)" -- "+ If the closer at `current_position` is not a potential opener," +- + +- "If the closer at `current_position` is not a potential opener," - "remove it from the delimiter stack (since we know it can't" - be a closer either). - "+ Advance `current_position` to the next element in the stack." -- "After we're done, we remove all delimiters above `" -- "stack_bottom` from the\ndelimiter stack." +- "After we're done, we remove all delimiters above" +- "`stack_bottom` from the\ndelimiter stack." diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md-2.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md-2.snap index 9e03da0..b5da2d9 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md-2.snap @@ -6,30 +6,35 @@ input_file: tests/inputs/markdown/github_flavored.md - "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------" - "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------" - "# Lists" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback" +- "```" +- "1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback" - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item." - "⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback" - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------" - "# Task lists\n\n```\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [x] this is a complete item\n- [ ] this is an incomplete item\n```\n\n- [x] Finish my changes\n- [ ] Push my commits to GitHub\n- [ ] Open a pull request\n- [x] @mentions, #refs, [links](), **formatting**, and tags supported\n- [x] list syntax required (any unordered or ordered list supported)\n- [ ] this is a complete item\n- [ ] this is an incomplete item\n\n------" - "# Ignoring Markdown formatting\n\nYou can tell GitHub to ignore (or escape) Markdown formatting by using \\ before the Markdown character.\n\n```\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n```\n\nLet's rename \\*our-new-project\\* to \\*our-old-project\\*.\n\n------" - "# Links" -- "```\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org" -- "[link text itself]: http://www.reddit.com\n```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later." +- "```" +- "[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later.\n\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com" +- "```\n\n[I'm an inline-style link](https://www.google.com)\n\n[I'm an inline-style link with title](https://www.google.com \"Google's Homepage\")\n\n[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n[I'm a relative reference to a repository file](../blob/master/LICENSE)\n\n[You can use numbers for reference-style link definitions][1]\n\nOr leave it empty and use the [link text itself].\n\nURLs and URLs in angle brackets will automatically get turned into links.\nhttp://www.example.com or and sometimes\nexample.com (but not on Github, for example).\n\nSome text to show that the reference links can follow later." - "[arbitrary case-insensitive reference text]: https://www.mozilla.org\n[1]: http://slashdot.org\n[link text itself]: http://www.reddit.com\n\n------" - "# Images" -- "```\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:" +- "```" +- "Here's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax\n\n![Alt text][id]\n\nWith a reference later in the document defining the URL location:" - "[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n```\n\nHere's our logo (hover to see the title text):\n\nInline-style:\n![alt text](https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n![alt text][logo]\n\n[logo]: https://github.com/adam-p/markdown-here/raw/master/src/common/images/icon48.png \"Logo Title Text 2\"\n\n![Minion](https://octodex.github.com/images/minion.png)\n![Stormtroopocat](https://octodex.github.com/images/stormtroopocat.jpg \"The Stormtroopocat\")\n\nLike links, Images also have a footnote style syntax" - "![Alt text][id]\n\nWith a reference later in the document defining the URL location:\n\n[id]: https://octodex.github.com/images/dojocat.jpg \"The Dojocat\"\n\n------\n\n# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------" - "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```" - "```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```" - "```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```" -- "```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;" +- "```php" +- "require_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;" - " $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------" - "# Tables" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |" -- "| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns." -- "| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |" -- "| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------" +- "```" +- "Colons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |" +- "| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |" +- "There must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |" +- "| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------" - "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote." - "> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n\n# Inline HTML\n\n```\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n```\n\n
    \n
    Definition list
    \n
    Is something people use sometimes.
    \n\n
    Markdown in HTML
    \n
    Does *not* work **very** well. Use HTML tags.
    \n
    \n\n------" - "# Horizontal Rules\n\n```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\nUnderscores\n\n------" diff --git a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap index ef6c06d..cbdd02a 100644 --- a/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__huggingface_markdown_trim@github_flavored.md.snap @@ -4,9 +4,9 @@ expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- - "# Headers" -- "```\n# h1 Heading 8-)" -- "## h2 Heading\n### h3 Heading" -- "#### h4 Heading" +- "```" +- "# h1 Heading 8-)\n## h2 Heading" +- "### h3 Heading\n#### h4 Heading" - "##### h5 Heading" - "###### h6 Heading" - "Alternatively, for H1 and H2, an underline-ish style" @@ -38,15 +38,16 @@ input_file: tests/inputs/markdown/github_flavored.md - _underscores_. - "Strong emphasis, aka bold, with **asterisks** or" - __underscores__. -- Combined emphasis with **asterisks and _underscores_* -- "*." -- Strikethrough uses two tildes. ~~Scratch this. -- "~~\n\n**This is bold text**" -- "__This is bold text__\n\n*This is italic text*" -- "_This is italic text_\n\n~~Strikethrough~~" -- "------" +- Combined emphasis with +- "**asterisks and _underscores_**." +- Strikethrough uses two tildes. +- ~~Scratch this.~~ +- "**This is bold text**\n\n__This is bold text__" +- "*This is italic text*\n\n_This is italic text_" +- "~~Strikethrough~~\n\n------" - "# Lists" -- "```\n1. First ordered list item\n2. Another item" +- "```" +- "1. First ordered list item\n2. Another item" - "⋅⋅* Unordered sub-list.\n1." - "Actual numbers don't matter, just that it's a number" - "⋅⋅1. Ordered sub-list\n4. And another item." @@ -77,14 +78,14 @@ input_file: tests/inputs/markdown/github_flavored.md - "+ Very easy!\n```" - 1. First ordered list item - "2. Another item\n⋅⋅* Unordered sub-list." -- "1." -- "Actual numbers don't matter, just that it's a number" -- "⋅⋅1. Ordered sub-list\n4. And another item." +- "1. Actual numbers don't matter, just that it'" +- "s a number\n⋅⋅1. Ordered sub-list" +- 4. And another item. - ⋅⋅⋅You can have properly indented paragraphs within list items - "." - "Notice the blank line above, and the leading spaces (at least one," -- "but we'll use three here to also align the raw Markdown)" -- "." +- "but we'" +- ll use three here to also align the raw Markdown). - "⋅⋅⋅To have a line break without a paragraph, you will need" - to use two trailing spaces.⋅⋅ - "⋅⋅⋅Note that this line is separate, but within the same paragraph" @@ -93,10 +94,12 @@ input_file: tests/inputs/markdown/github_flavored.md - where trailing spaces are not required.) - "* Unordered list can use asterisks\n- Or minuses" - + Or pluses -- "1. Make my changes\n 1. Fix bug\n 2." -- "Improve formatting\n - Make the headings bigger" +- 1. Make my changes +- 1. Fix bug +- " 2. Improve formatting\n - Make the headings bigger" - 2. Push my commits to GitHub -- "3. Open a pull request\n * Describe my changes" +- 3. Open a pull request +- "* Describe my changes" - " * Mention all the members of my team\n * Ask for feedback" - "+ Create a list by starting a line with `+`, `-" - "`, or `*`" @@ -107,7 +110,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "- Nulla volutpat aliquam velit" - "+ Very easy!\n\n------" - "# Task lists" -- "```\n- [x] Finish my changes" +- "```" +- "- [x] Finish my changes" - "- [ ] Push my commits to GitHub" - "- [ ] Open a pull request" - "- [x] @mentions, #refs, [links]()" @@ -119,12 +123,14 @@ input_file: tests/inputs/markdown/github_flavored.md - "- [x] Finish my changes" - "- [ ] Push my commits to GitHub" - "- [ ] Open a pull request" -- "- [x] @mentions, #refs, [links]()" -- ", **formatting**, and tags supported" -- "- [x] list syntax required (any unordered or ordered list supported" -- ")\n- [ ] this is a complete item" -- "- [ ] this is an incomplete item\n\n------" +- "-" +- "[x] @mentions, #refs, [links]()," +- "**formatting**, and tags" +- supported +- "-" +- "[x] list syntax required (any unordered or ordered list supported)" +- "- [ ] this is a complete item\n- [ ] this is an incomplete item" +- "------" - "# Ignoring Markdown formatting" - You can tell GitHub to ignore (or escape) Markdown - "formatting by using \\ before the Markdown character." @@ -170,8 +176,7 @@ input_file: tests/inputs/markdown/github_flavored.md - "Or leave it empty and use the [link text itself]." - URLs and URLs in angle brackets will automatically get turned into links - ".\nhttp://www.example.com or" -- "" -- and sometimes +- " and sometimes" - "example.com (but not on Github, for example)." - Some text to show that the reference links can follow later. - "[arbitrary case-insensitive reference text]: https://" @@ -204,32 +209,37 @@ input_file: tests/inputs/markdown/github_flavored.md - octodex.github.com/images/ - "dojocat.jpg \"The Dojocat\"\n```" - "Here's our logo (hover to see the title text):" -- "Inline-style:\n![" -- "alt text](https://github.com/" -- adam-p/markdown-here/raw/master/src -- "/common/images/icon48.png \"Logo Title Text 1" -- "\")\n\nReference-style:\n![alt text][logo]" +- "Inline-style:" +- "![" +- alt text +- "](https://github.com/adam-p" +- /markdown-here/raw/master/src/common/images +- "/icon48.png \"Logo Title Text 1\")" +- "Reference-style:\n![alt text][logo]" - "[logo]: https://github.com/adam" - "-p/markdown-here/raw/master/src/common" - "/images/icon48.png \"Logo Title Text 2\"" - "![" -- "Minion](https://" -- octodex.github.com/images/ -- minion.png) +- Minion +- "](https://octodex.github.com" +- /images/minion.png) - "![" -- "Stormtroopocat](https://" -- octodex.github.com/images/ -- "stormtroopocat.jpg \"The Stormtroopocat" -- "\")\n\nLike links, Images also have a footnote style syntax" +- Stormtroopocat +- "](https://octodex.github.com" +- "/images/stormtroopocat.jpg \"The" +- "Stormtroopocat\")" +- "Like links, Images also have a footnote style syntax" - "![Alt text][id]" - "With a reference later in the document defining the URL location:" - "[id]: https://" - octodex.github.com/images/ - "dojocat.jpg \"The Dojocat\"" - "------" -- "# [Footnotes](https://github.com/" +- "#" +- "[Footnotes](https://github.com/" - markdown-it/markdown-it-footnote) -- "```\nFootnote 1 link[^first]." +- "```" +- "Footnote 1 link[^first]." - "Footnote 2 link[^second]." - "Inline footnote^[Text of inline footnote] definition." - "Duplicated footnote reference[^second]." @@ -247,7 +257,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "Inline `code` has `back-ticks around` it." - "```" - "Inline `code` has `back-ticks around` it." -- "```c#\nusing System.IO.Compression;" +- "```c#" +- using System.IO.Compression; - "#pragma warning disable 414, 3021" - "namespace MyApplication\n{" - "[Obsolete(\"...\")]" @@ -258,7 +269,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "\");" - "return new List(new int[] { 1," - "2, 3 })\n }\n }\n}\n```" -- "```css\n@font-face {" +- "```css" +- "@font-face {" - "font-family: Chunkfive; src: url('" - "Chunkfive.otf');\n}" - "body, .usertext {" @@ -309,7 +321,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "__halt_compiler () ; datahere\ndatahere\ndatahere */" - "datahere\n```\n\n------" - "# Tables" -- "```\nColons can be used to align columns." +- "```" +- Colons can be used to align columns. - "| Tables | Are | Cool |" - "| ------------- |:" - "-------------:| -" @@ -341,7 +354,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "| Name | Character |\n| --- | --- |" - "| Backtick | ` |\n| Pipe | \\| |\n```" - Colons can be used to align columns. -- "| Tables | Are | Cool |" +- "| Tables | Are | Cool" +- "|" - "| ------------- |:" - "-------------:| -" - "----:|" @@ -349,15 +363,16 @@ input_file: tests/inputs/markdown/github_flavored.md - "| col 2 is | centered | $12 |" - "| zebra stripes | are neat | $1 |" - There must be at least 3 dashes separating each header cell. -- "The outer pipes (|) are optional, and you don't need to" -- make the +- "The outer pipes (|) are optional, and you don'" +- t need to make the - raw Markdown line up prettily. - You can also use inline Markdown. - Markdown | Less | Pretty - "--- | --- | ---" - "*Still* | `renders` | **nicely**" - 1 | 2 | 3 -- "| First Header | Second Header |" +- "| First Header | Second Header" +- "|" - "| ------------- | -" - "------------ |" - "| Content Cell | Content Cell |\n| Content Cell | Content Cell |" @@ -366,9 +381,10 @@ input_file: tests/inputs/markdown/github_flavored.md - "| git diff | Show file differences that haven't been staged |" - "| Command | Description |\n| --- | --- |" - "| `git status` | List all *new or modified* files |" -- "| `git diff` | Show file differences that **" -- "haven't been** staged |" -- "| Left-aligned | Center-aligned | Right-aligned |" +- "| `git diff` |" +- "Show file differences that **haven't been** staged |" +- "| Left-aligned | Center-aligned | Right-aligned" +- "|" - "| :--- | :---: | ---:" - "|\n| git status | git status | git status |" - "| git diff | git diff | git diff |" @@ -389,19 +405,23 @@ input_file: tests/inputs/markdown/github_flavored.md - ">> ...by using additional greater-than signs right next to each" - "other...\n> > > ...or with spaces between arrows." - "```" -- "> Blockquotes are very handy in email to emulate reply text." -- "> This line is part of the same quote.\n\nQuote break." -- "> This is a very long line that will still be quoted properly when it wraps" -- ". Oh boy let'" +- ">" +- "Blockquotes are very handy in email to emulate reply text.\n>" +- "This line is part of the same quote.\n\nQuote break." +- ">" +- This is a very long line that will still be quoted properly when it wraps. +- "Oh boy let'" - s keep writing to make sure this is long enough to actually wrap for everyone. -- "Oh, you can *put* **Markdown** into a" -- blockquote. -- "> Blockquotes can also be nested..." -- ">> ...by using additional greater-than signs right next to each" -- "other...\n> > > ...or with spaces between arrows." +- "Oh, you can *put* **Markdown**" +- into a blockquote. +- "> Blockquotes can also be nested...\n>" +- ">" +- "...by using additional greater-than signs right next to each other" +- "...\n> > > ...or with spaces between arrows." - "------" - "# Inline HTML" -- "```\n
    " +- "```" +- "
    " - "
    Definition list
    " - "
    Is something people use sometimes.
    " - "
    Markdown in HTML
    " @@ -416,8 +436,9 @@ input_file: tests/inputs/markdown/github_flavored.md - Use HTML tags.\n
    \n\n------" - "# Horizontal Rules" -- "```\nThree or more...\n\n---\n\nHyphens" -- "***\n\nAsterisks\n\n___\n\nUnderscores\n```" +- "```" +- "Three or more...\n\n---\n\nHyphens\n\n***" +- "Asterisks\n\n___\n\nUnderscores\n```" - "Three or more...\n\n---\n\nHyphens\n\n***" - "Asterisks\n\n___\n\nUnderscores\n\n------" - "# YouTube Videos" @@ -444,7 +465,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "jpg\" alt=\"IMAGE ALT TEXT" - "HERE\" width=\"240\" height=\"180\" border=" - "\"10\">\n" -- "```\n[![" +- "```" +- "[![" - "IMAGE ALT TEXT HERE](http:/" - /img.youtube.com/vi/ - YOUTUBE_VIDEO_ID_HERE/0. @@ -453,10 +475,10 @@ input_file: tests/inputs/markdown/github_flavored.md - v=YOUTUBE_VIDEO_ID_HERE) - "```" - "[![" -- "IMAGE ALT TEXT HERE](https:/" -- /upload.wikimedia.org/wikipedia/ -- commons/thumb/e/ef/YouTube_logo_2015. -- svg/1200px-YouTube_logo_2015. -- "svg.png)](https://" -- www.youtube.com/watch? +- IMAGE ALT TEXT HERE +- "](https://upload.wikimedia.org/" +- wikipedia/commons/thumb/e/ef/ +- YouTube_logo_2015.svg/1200px- +- "YouTube_logo_2015.svg.png)](https" +- "://www.youtube.com/watch?" - v=ciawICBvQoE) diff --git a/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md-2.snap b/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md-2.snap index 0987525..c48a1a8 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md-2.snap @@ -8,35 +8,39 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "# Introduction\n\n" - "## What is Markdown?\n\n" - "Markdown is a plain text format for writing structured documents,\nbased on conventions for indicating formatting in email\nand usenet posts. It was developed by John Gruber (with\nhelp from Aaron Swartz) and released in 2004 in the form of a\n" -- "[syntax description](https://daringfireball.net/projects/markdown/syntax)\nand a Perl script (`Markdown.pl`" -- ") for converting Markdown to\nHTML. In the next decade, dozens of implementations were\ndeveloped in many languages. Some extended the original\nMarkdown syntax with conventions for footnotes, tables, and\n" -- "other document elements. Some allowed Markdown documents to be\nrendered in formats other than HTML. Websites like Reddit,\nStackOverflow, and GitHub had millions of people using Markdown.\nAnd Markdown started to be used beyond the web, to author books,\n" -- "articles, slide shows, letters, and lecture notes.\n\nWhat distinguishes Markdown from many other lightweight markup\nsyntaxes, which are often easier to write, is its readability.\nAs Gruber writes:\n\n" -- "> The overriding design goal for Markdown's formatting syntax is\n> to make it as readable as possible. The idea is that a\n> Markdown-formatted document should be publishable as-is, as\n> plain text, without looking like it's been marked up with tags\n" -- "> or formatting instructions.\n> ()\n\n" +- "[syntax description](https://daringfireball.net/projects/markdown/syntax)\nand a Perl script (`Markdown.pl`) for converting Markdown to\nHTML. In the next decade, dozens of implementations were\ndeveloped in many languages. Some extended the original\n" +- "Markdown syntax with conventions for footnotes, tables, and\nother document elements. Some allowed Markdown documents to be\nrendered in formats other than HTML. Websites like Reddit,\nStackOverflow, and GitHub had millions of people using Markdown.\n" +- "And Markdown started to be used beyond the web, to author books,\narticles, slide shows, letters, and lecture notes.\n\n" +- "What distinguishes Markdown from many other lightweight markup\nsyntaxes, which are often easier to write, is its readability.\nAs Gruber writes:\n\n" +- "> " +- "The overriding design goal for Markdown's formatting syntax is\n> to make it as readable as possible. The idea is that a\n> Markdown-formatted document should be publishable as-is, as\n> plain text, without looking like it's been marked up with tags\n> " +- "or formatting instructions.\n> ()\n\n" - "The point can be illustrated by comparing a sample of\n[AsciiDoc](https://asciidoc.org/) with\nan equivalent sample of Markdown. Here is a sample of\nAsciiDoc from the AsciiDoc manual:\n" -- "\n```\n1. List item one.\n+\nList item one continued with a second paragraph followed by an\nIndented block.\n+\n.................\n$ ls *.sh\n$ mv *.sh ~/tmp\n.................\n+\nList item continued with a third paragraph.\n\n2. " +- "\n```\n" +- "1. List item one.\n+\nList item one continued with a second paragraph followed by an\nIndented block.\n+\n.................\n$ ls *.sh\n$ mv *.sh ~/tmp\n.................\n+\nList item continued with a third paragraph.\n\n2. " - "List item two continued with an open block.\n+\n--\nThis paragraph is part of the preceding list item.\n\na. This list is nested and does not require explicit item\ncontinuation.\n+\nThis paragraph is part of the preceding list item.\n\nb. List item b.\n\n" - "This paragraph belongs to item two of the outer list.\n--\n```\n\nAnd here is the equivalent in Markdown:\n" -- "```\n1. List item one.\n\n List item one continued with a second paragraph followed by an\n Indented block.\n\n $ ls *.sh\n $ mv *.sh ~/tmp\n\n List item continued with a third paragraph.\n\n2. List item two continued with an open block.\n\n" +- "```\n" +- "1. List item one.\n\n List item one continued with a second paragraph followed by an\n Indented block.\n\n $ ls *.sh\n $ mv *.sh ~/tmp\n\n List item continued with a third paragraph.\n\n2. List item two continued with an open block.\n\n" - " This paragraph is part of the preceding list item.\n\n 1. This list is nested and does not require explicit item continuation.\n\n This paragraph is part of the preceding list item.\n\n 2. List item b.\n\n" - " This paragraph belongs to item two of the outer list.\n```\n\n" - "The AsciiDoc version is, arguably, easier to write. You don't need\nto worry about indentation. But the Markdown version is much easier\nto read. The nesting of list items is apparent to the eye in the\nsource, not just in the processed document.\n\n" - "## Why is a spec needed?\n\nJohn Gruber's [canonical description of Markdown's\nsyntax](https://daringfireball.net/projects/markdown/syntax)\ndoes not specify the syntax unambiguously. Here are some examples of\nquestions it does not answer:\n\n" -- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n" -- " they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See " +- "1. " +- "How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but " +- "`Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See " - "[this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)\n\n" -- "2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n" -- " also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken " -- "[in favor of requiring the blank\n lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n" +- "2. " +- "Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations" +- "\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank\n lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n" - "3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```\n\n" -- "4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n" -- " Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```" +- "4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```" +- "\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```" - "\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)\n\n" - "5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n" - "6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```\n\n" -- "7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n" -- " - fum\n ```\n\n" +- "7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n" +- "\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n" - "8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```\n\n" - "9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n" - "10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```\n\n" @@ -65,8 +69,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nA line containing no characters, or a line containing only spaces\n(`U+0020`) or tabs (`U+0009`), is called a [blank line](@).\n\nThe following definitions of character classes will be used in this spec:\n" - "\nA [Unicode whitespace character](@) is a character in the Unicode `Zs` general\ncategory, or a tab (`U+0009`), line feed (`U+000A`), form feed (`U+000C`), or\ncarriage return (`U+000D`).\n" - "\n[Unicode whitespace](@) is a sequence of one or more\n[Unicode whitespace characters].\n\nA [tab](@) is `U+0009`.\n\nA [space](@) is `U+0020`.\n\nAn [ASCII control character](@) is a character between `U+0000–1F` (both\nincluding) or `U+007F`.\n" -- "\nAn [ASCII punctuation character](@)\nis `!`, `\"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`,\n`*`, `+`, `,`, `-`, `.`, `/` (U+0021–2F), \n`:`, `;`, `<`, `=`, `>`, `?`, `@` (U+003A–0040),\n`[`, `\\`, `]`, `^`, `_`, `` ` `` (U+005B–0060), \n" -- "`{`, `|`, `}`, or `~` (U+007B–007E).\n\nA [Unicode punctuation character](@) is a character in the Unicode `P`\n(puncuation) or `S` (symbol) general categories.\n\n" +- "\nAn [ASCII punctuation character](@)\nis `!`, `\"`, `#`, `$`, `%`, `&`, `'`, `(`, `)`,\n`*`, `+`, `,`, `-`, `.`, `/` (U+0021–2F), \n`:`, `;`, `<`, `=`, `>`, `?`, `@` (U+003A–0040),\n`[`, `\\`, `]`, `^`, `_`, `` ` `` (U+005B–0060), \n`{`, `|`, `}`, or `~`" +- " (U+007B–007E).\n\nA [Unicode punctuation character](@) is a character in the Unicode `P`\n(puncuation) or `S` (symbol) general categories.\n\n" - "## Tabs\n\nTabs in lines are not expanded to [spaces]. However,\nin contexts where spaces help to define block structure,\ntabs behave as if they were replaced by spaces with a tab stop\nof 4 characters.\n" - "\nThus, for example, a tab can be used instead of four spaces\nin an indented code block. (Note, however, that internal\ntabs are passed through as literal tabs, not expanded to\nspaces.)\n" - "\n```````````````````````````````` example\n→foo→baz→→bim\n.\n

    foo→baz→→bim\n
    \n````````````````````````````````" @@ -87,9 +91,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n\\!\\\"\\#\\$\\%\\&\\'\\(\\)\\*\\+\\,\\-\\.\\/\\:\\;\\<\\=\\>\\?\\@\\[\\\\\\]\\^\\_\\`\\{\\|\\}\\~\n.\n

    !"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~

    \n````````````````````````````````" - "\n\n\nBackslashes before other characters are treated as literal\nbackslashes:\n\n```````````````````````````````` example\n\\→\\A\\a\\ \\3\\φ\\«\n.\n

    \\→\\A\\a\\ \\3\\φ\\«

    \n````````````````````````````````" - "\n\n\nEscaped characters are treated as regular characters and do\nnot have their usual Markdown meanings:\n" -- "\n```````````````````````````````` example\n\\*not emphasized*\n\\
    not a tag\n\\[not a link](/foo)\n\\`not code`\n1\\. not a list\n\\* not a list\n\\# not a heading\n\\[foo]: /url \"not a reference\"\n\\ö not a character entity\n.\n

    *not emphasized*\n" -- "<br/> not a tag\n[not a link](/foo)\n`not code`\n1. not a list\n* not a list\n# not a heading\n[foo]: /url "not a reference"\n&ouml; not a character entity

    \n````````````````````````````````\n\n\n" -- "If a backslash is itself escaped, the following character is not:\n\n```````````````````````````````` example\n\\\\*emphasis*\n.\n

    \\emphasis

    \n````````````````````````````````\n\n\nA backslash at the end of the line is a [hard line break]:\n" +- "\n```````````````````````````````` example\n" +- "\\*not emphasized*\n\\
    not a tag\n\\[not a link](/foo)\n\\`not code`\n1\\. not a list\n\\* not a list\n\\# not a heading\n\\[foo]: /url \"not a reference\"\n\\ö not a character entity\n.\n

    *not emphasized*\n<br/> not a tag\n[not a link](/foo)\n`not code`\n" +- "1. not a list\n* not a list\n# not a heading\n[foo]: /url "not a reference"\n&ouml; not a character entity

    \n````````````````````````````````\n\n\nIf a backslash is itself escaped, the following character is not:\n" +- "\n```````````````````````````````` example\n\\\\*emphasis*\n.\n

    \\emphasis

    \n````````````````````````````````\n\n\nA backslash at the end of the line is a [hard line break]:\n" - "\n```````````````````````````````` example\nfoo\\\nbar\n.\n

    foo
    \nbar

    \n````````````````````````````````\n\n\nBackslash escapes do not work in code blocks, code spans, autolinks, or\nraw HTML:\n" - "\n```````````````````````````````` example\n`` \\[\\` ``\n.\n

    \\[\\`

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n \\[\\]\n.\n
    \\[\\]\n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n~~~\n\\[\\]\n~~~\n.\n
    \\[\\]\n
    \n````````````````````````````````" @@ -99,15 +104,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n[foo]\n\n[foo]: /bar\\* \"ti\\*tle\"\n.\n

    foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n``` foo\\+bar\nfoo\n```\n.\n
    foo\n
    \n````````````````````````````````\n\n\n" - "## Entity and numeric character references\n\nValid HTML entity references and numeric character references\ncan be used in place of the corresponding Unicode character,\nwith the following exceptions:\n\n" -- "- Entity and character references are not recognized in code\n blocks and code spans.\n\n- Entity and character references cannot stand in place of\n special characters that define structural elements in\n CommonMark. " -- "For example, although `*` can be used\n in place of a literal `*` character, `*` cannot replace\n `*` in emphasis delimiters, bullet list markers, or thematic\n breaks.\n\n" -- "Conforming CommonMark parsers need not store information about\nwhether a particular character was represented in the source\nusing a Unicode character or an entity reference.\n" +- "- Entity and character references are not recognized in code\n blocks and code spans.\n\n" +- "- " +- "Entity and character references cannot stand in place of\n special characters that define structural elements in\n CommonMark. For example, although `*` can be used\n in place of a literal `*` character, `*` cannot replace\n `*`" +- " in emphasis delimiters, bullet list markers, or thematic\n breaks.\n\nConforming CommonMark parsers need not store information about\nwhether a particular character was represented in the source\nusing a Unicode character or an entity reference.\n" - "\n[Entity references](@) consist of `&` + any of the valid\nHTML5 entity names + `;`. The\ndocument \nis used as an authoritative source for the valid entity\nreferences and their corresponding code points.\n" - "\n```````````````````````````````` example\n  & © Æ Ď\n¾ ℋ ⅆ\n∲ ≧̸\n.\n

      & © Æ Ď\n¾ ℋ ⅆ\n∲ ≧̸

    \n````````````````````````````````" -- "\n\n\n[Decimal numeric character\nreferences](@)\nconsist of `&#` + a string of 1--7 arabic digits + `;`. A\nnumeric character reference is parsed as the corresponding\nUnicode character. Invalid Unicode code points will be replaced by\n" -- "the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons,\nthe code point `U+0000` will also be replaced by `U+FFFD`.\n\n```````````````````````````````` example\n# Ӓ Ϡ �\n.\n

    # Ӓ Ϡ �

    \n````````````````````````````````" -- "\n\n\n[Hexadecimal numeric character\nreferences](@) consist of `&#` +\neither `X` or `x` + a string of 1-6 hexadecimal digits + `;`.\nThey too are parsed as the corresponding Unicode character (this\n" -- "time specified with a hexadecimal numeral instead of decimal).\n\n```````````````````````````````` example\n" ആ ಫ\n.\n

    " ആ ಫ

    \n````````````````````````````````\n\n\nHere are some nonentities:\n" +- "\n\n\n[Decimal numeric character\nreferences](@)\nconsist of `&#` + a string of 1--7 arabic digits + `;`. A\nnumeric character reference is parsed as the corresponding\nUnicode character. Invalid Unicode code points will be replaced by\nthe REPLACEMENT CHARACTER (" +- "`U+FFFD`). For security reasons,\nthe code point `U+0000` will also be replaced by `U+FFFD`.\n\n```````````````````````````````` example\n# Ӓ Ϡ �\n.\n

    # Ӓ Ϡ �

    \n````````````````````````````````" +- "\n\n\n[Hexadecimal numeric character\nreferences](@) consist of `&#` +\neither `X` or `x` + a string of 1-6 hexadecimal digits + `;`.\nThey too are parsed as the corresponding Unicode character (this\ntime specified with a hexadecimal numeral instead of decimal)." +- "\n\n```````````````````````````````` example\n" ആ ಫ\n.\n

    " ആ ಫ

    \n````````````````````````````````\n\n\nHere are some nonentities:\n" - "\n```````````````````````````````` example\n  &x; &#; &#x;\n�\n&#abcdef0;\n&ThisIsNotDefined; &hi?;\n.\n

    &nbsp &x; &#; &#x;\n&#87654321;\n&#abcdef0;\n&ThisIsNotDefined; &hi?;

    \n````````````````````````````````" - "\n\n\nAlthough HTML5 does accept some entity references\nwithout a trailing semicolon (such as `©`), these are not\nrecognized here, because it makes the grammar too ambiguous:\n" - "\n```````````````````````````````` example\n©\n.\n

    &copy

    \n````````````````````````````````\n\n\nStrings that are not on the list of HTML5 named entities are not\nrecognized as entity references either:\n" @@ -127,9 +133,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "headings and paragraphs) contain [inline](@) content---text,\nlinks, emphasized text, images, code spans, and so on.\n\n" - "## Precedence\n\nIndicators of block structure always take precedence over indicators\nof inline structure. So, for example, the following is a list with\ntwo items, not a list with one item containing a code span:\n" - "\n```````````````````````````````` example\n- `one\n- two`\n.\n
      \n
    • `one
    • \n
    • two`
    • \n
    \n````````````````````````````````" -- "\n\n\nThis means that parsing can proceed in two steps: first, the block\nstructure of the document can be discerned; second, text lines inside\nparagraphs, headings, and other block constructs can be parsed for inline\nstructure. " -- "The second step requires information about link reference\ndefinitions that will be available only at the end of the first\nstep. Note that the first step requires processing lines in sequence,\nbut the second can be parallelized, since the inline parsing of" -- "\none block element does not affect the inline parsing of any other.\n\n" +- "\n\n\nThis means that parsing can proceed in two steps: first, the block\nstructure of the document can be discerned; second, text lines inside\nparagraphs, headings, and other block constructs can be parsed for inline\n" +- "structure. The second step requires information about link reference\ndefinitions that will be available only at the end of the first\nstep. Note that the first step requires processing lines in sequence,\n" +- "but the second can be parallelized, since the inline parsing of\none block element does not affect the inline parsing of any other.\n\n" - "## Container blocks and leaf blocks\n\nWe can divide blocks into two types:\n[container blocks](#container-blocks),\nwhich can contain other blocks, and [leaf blocks](#leaf-blocks),\nwhich cannot.\n\n" - "# Leaf blocks\n\nThis section describes the different kinds of leaf block that make up a\nMarkdown document.\n\n" - "## Thematic breaks\n\nA line consisting of optionally up to three spaces of indentation, followed by a\nsequence of three or more matching `-`, `_`, or `*` characters, each followed\noptionally by any number of spaces or tabs, forms a\n[thematic break](@).\n" @@ -150,8 +156,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n* Foo\n* * *\n* Bar\n.\n
      \n
    • Foo
    • \n
    \n
    \n
      \n
    • Bar
    • \n
    \n````````````````````````````````\n\n\nIf you want a thematic break in a list item, use a different bullet:\n" - "\n```````````````````````````````` example\n- Foo\n- * * *\n.\n
      \n
    • Foo
    • \n
    • \n
      \n
    • \n
    \n````````````````````````````````\n\n\n" - "## ATX headings\n\n" -- "An [ATX heading](@)\nconsists of a string of characters, parsed as inline content, between an\nopening sequence of 1--6 unescaped `#` characters and an optional\nclosing sequence of any number of unescaped `#` characters.\n" -- "The opening sequence of `#` characters must be followed by spaces or tabs, or\nby the end of line. The optional closing sequence of `#`s must be preceded by\nspaces or tabs and may be followed by spaces or tabs only. The opening\n`#`" +- "An [ATX heading](@)\nconsists of a string of characters, parsed as inline content, between an\nopening sequence of 1--6 unescaped `#` characters and an optional\nclosing sequence of any number of unescaped `#` characters.\nThe opening sequence of `#`" +- " characters must be followed by spaces or tabs, or\nby the end of line. The optional closing sequence of `#`s must be preceded by\nspaces or tabs and may be followed by spaces or tabs only. The opening\n`#`" - " character may be preceded by up to three spaces of indentation. The raw\ncontents of the heading are stripped of leading and trailing space or tabs\nbefore being parsed as inline content. The heading level is equal to the number\nof `#`" - " characters in the opening sequence.\n\nSimple headings:\n" - "\n```````````````````````````````` example\n# foo\n## foo\n### foo\n#### foo\n##### foo\n###### foo\n.\n

    foo

    \n

    foo

    \n

    foo

    \n

    foo

    \n
    foo
    \n
    foo
    \n````````````````````````````````\n\n\nMore than six `#` characters is not a heading:\n" @@ -235,16 +241,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A [code fence](@) is a sequence\nof at least three consecutive backtick characters (`` ` ``) or\ntildes (`~`). (Tildes and backticks cannot be mixed.)\nA [fenced code block](@)\nbegins with a code fence, preceded by up to three spaces of indentation.\n" - "\nThe line with the opening code fence may optionally contain some text\nfollowing the code fence; this is trimmed of leading and trailing\nspaces or tabs and called the [info string](@). If the [info string] comes\n" - "after a backtick fence, it may not contain any backtick\ncharacters. (The reason for this restriction is that otherwise\nsome inline code would be incorrectly interpreted as the\nbeginning of a fenced code block.)\n\n" -- "The content of the code block consists of all subsequent lines, until\na closing [code fence] of the same type as the code block\nbegan with (backticks or tildes), and with at least as many backticks\nor tildes as the opening code fence. " -- "If the leading code fence is\npreceded by N spaces of indentation, then up to N spaces of indentation are\nremoved from each line of the content (if present). (If a content line is not\n" +- "The content of the code block consists of all subsequent lines, until\na closing [code fence] of the same type as the code block\nbegan with (backticks or tildes), and with at least as many backticks\n" +- "or tildes as the opening code fence. If the leading code fence is\npreceded by N spaces of indentation, then up to N spaces of indentation are\nremoved from each line of the content (if present). (If a content line is not\n" - "indented, it is preserved unchanged. If it is indented N spaces or less, all\nof the indentation is removed.)\n\n" - "The closing code fence may be preceded by up to three spaces of indentation, and\nmay be followed only by spaces or tabs, which are ignored. If the end of the\ncontaining block (or document) is reached and no closing code fence\n" - "has been found, the code block contains all of the lines after the\nopening code fence until the end of the containing block (or\ndocument). (An alternative spec would require backtracking in the\n" - "event that a closing code fence is not found. But this makes parsing\nmuch less efficient, and there seems to be no real downside to the\nbehavior described here.)\n\n" - "A fenced code block may interrupt a paragraph, and does not require\na blank line either before or after.\n" -- "\nThe content of a code fence is treated as literal text, not parsed\nas inlines. The first word of the [info string] is typically used to\nspecify the language of the code sample, and rendered in the `class`\nattribute of the `code` tag. " -- "However, this spec does not mandate any\nparticular treatment of the [info string].\n\nHere is a simple example with backticks:\n\n```````````````````````````````` example\n```\n<\n >\n```\n.\n
    <\n >\n
    \n````````````````````````````````" -- "\n\n\nWith tildes:\n\n```````````````````````````````` example\n~~~\n<\n >\n~~~\n.\n
    <\n >\n
    \n````````````````````````````````\n\nFewer than three backticks is not enough:\n" +- "\nThe content of a code fence is treated as literal text, not parsed\nas inlines. The first word of the [info string] is typically used to\nspecify the language of the code sample, and rendered in the `class`\nattribute of the `code`" +- " tag. However, this spec does not mandate any\nparticular treatment of the [info string].\n\nHere is a simple example with backticks:\n" +- "\n```````````````````````````````` example\n```\n<\n >\n```\n.\n
    <\n >\n
    \n````````````````````````````````\n\n\nWith tildes:\n" +- "\n```````````````````````````````` example\n~~~\n<\n >\n~~~\n.\n
    <\n >\n
    \n````````````````````````````````\n\nFewer than three backticks is not enough:\n" - "\n```````````````````````````````` example\n``\nfoo\n``\n.\n

    foo

    \n````````````````````````````````\n\nThe closing code fence must use the same character as the opening\nfence:\n" - "\n```````````````````````````````` example\n```\naaa\n~~~\n```\n.\n
    aaa\n~~~\n
    \n````````````````````````````````\n\n\n```````````````````````````````` example\n~~~\naaa\n```\n~~~\n.\n
    aaa\n```\n
    \n````````````````````````````````" - "\n\n\nThe closing code fence must be at least as long as the opening fence:\n\n```````````````````````````````` example\n````\naaa\n```\n``````\n.\n
    aaa\n```\n
    \n````````````````````````````````" @@ -275,20 +282,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n```\n``` aaa\n```\n.\n
    ``` aaa\n
    \n````````````````````````````````\n\n\n\n" - "## HTML blocks\n\nAn [HTML block](@) is a group of lines that is treated\nas raw HTML (and will not be escaped in HTML output).\n" - "\nThere are seven kinds of [HTML block], which can be defined by their\nstart and end conditions. The block begins with a line that meets a\n[start condition](@) (after up to three optional spaces of indentation).\n" -- "It ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks)" -- " containing the current HTML\nblock, if no line is encountered that meets the [end condition]. If\nthe first line meets both the [start condition] and the [end\ncondition], the block will contain just that line.\n\n" -- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n" -- "`
    `, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n" +- "It ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks) containing the current HTML\nblock, if no line is encountered that meets the [" +- "end condition]. If\nthe first line meets both the [start condition] and the [end\ncondition], the block will contain just that line.\n\n" +- "1. " +- "**Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``, " +- "``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n" - "3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n" - "5. **Start condition:** line begins with the string\n``.\n\n" -- "6. **Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:**" - " line is followed by a [blank line].\n\n" -- "7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n" +- "7. " +- "**Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n" - "**End condition:** line is followed by a [blank line].\n\n" -- "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\n" -- "block** that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state.\n\n" +- "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\nblock**" +- " that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state.\n\n" - "For instance, `
    ` within an HTML block started by `` will not affect\nthe parser state; as the HTML block was started in by start condition 6, it\nwill end at any blank line. This can be surprising:\n"
     - "\n```````````````````````````````` example\n
    \n
    \n**Hello**,\n\n_world_.\n
    \n
    \n.\n
    \n
    \n**Hello**,\n

    world.\n

    \n
    \n````````````````````````````````" - "\n\nIn this case, the HTML block is terminated by the blank line — the `**Hello**`\ntext remains verbatim — and regular parsing resumes, with a paragraph,\nemphasised `world` and inline and block HTML following.\n" @@ -319,12 +329,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n*foo*\n.\n

    foo

    \n````````````````````````````````" - "\n\n\nHTML tags designed to contain literal content\n(`pre`, `script`, `style`, `textarea`), comments, processing instructions,\nand declarations are treated somewhat differently.\nInstead of ending at the first blank line, these blocks\n" - "end at the first line containing a corresponding end tag.\nAs a result, these blocks can contain blank lines:\n\nA pre tag (type 1):\n" -- "\n```````````````````````````````` example\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
    \nokay\n.\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\n"
    -- "main = print $ parseTags tags\n
    \n

    okay

    \n````````````````````````````````\n\n\nA script tag (type 1):\n" -- "\n```````````````````````````````` example\n\nokay\n.\n\n

    okay

    \n````````````````````````````````\n\n\nA textarea tag (type 1):\n" -- "\n```````````````````````````````` example\n\n.\n\n````````````````````````````````\n\nA style tag (type 1):\n" -- "\n```````````````````````````````` example\n\nh1 {color:red;}\n\np {color:blue;}\n\nokay\n.\n\nh1 {color:red;}\n\np {color:blue;}\n\n

    okay

    \n````````````````````````````````" +- "\n```````````````````````````````` example\n" +- "
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
    \nokay\n.\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
    \n

    okay

    \n" +- "````````````````````````````````\n\n\nA script tag (type 1):\n" +- "\n```````````````````````````````` example\n" +- "\nokay\n.\n\n

    okay

    \n````````````````````````````````\n\n\nA textarea tag (type 1):\n\n```````````````````````````````` example\n\n.\n\n````````````````````````````````" +- "\n\nA style tag (type 1):\n\n```````````````````````````````` example\n\nh1 {color:red;}\n\np {color:blue;}\n\nokay\n.\n\nh1 {color:red;}\n\np {color:blue;}\n\n

    okay

    \n````````````````````````````````" - "\n\n\nIf there is no matching end tag, the block will end at the\nend of the document (or the enclosing [block quote][block quotes]\nor [list item][list items]):\n" - "\n```````````````````````````````` example\n\n\nfoo\n.\n\n\nfoo\n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n>
    \n> foo\n\nbar\n.\n
    \n
    \nfoo\n
    \n

    bar

    \n````````````````````````````````" @@ -335,16 +346,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n\nokay\n.\n\n

    okay

    \n````````````````````````````````\n\n\n\nA processing instruction (type 3):\n" - "\n```````````````````````````````` example\n';\n\n?>\nokay\n.\n';\n\n?>\n

    okay

    \n````````````````````````````````\n\n\nA declaration (type 4):\n" - "\n```````````````````````````````` example\n\n.\n\n````````````````````````````````\n\n\nCDATA (type 5):\n" -- "\n```````````````````````````````` example\n\nokay\n.\n\n

    okay

    \n````````````````````````````````\n\n\nThe opening tag can be preceded by up to three spaces of indentation, but not\nfour:\n" +- "\n```````````````````````````````` example\n" +- "\nokay\n.\n\n

    okay

    \n" +- "````````````````````````````````\n\n\nThe opening tag can be preceded by up to three spaces of indentation, but not\nfour:\n" - "\n```````````````````````````````` example\n \n\n \n.\n \n
    <!-- foo -->\n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n
    \n\n
    \n.\n
    \n
    <div>\n
    \n````````````````````````````````\n\n\nAn HTML block of types 1--6 can interrupt a paragraph, and need not be\npreceded by a blank line.\n" - "\n```````````````````````````````` example\nFoo\n
    \nbar\n
    \n.\n

    Foo

    \n
    \nbar\n
    \n````````````````````````````````" - "\n\n\nHowever, a following blank line is needed, except at the end of\na document, and except for blocks of types 1--5, [above][HTML\nblock]:\n" - "\n```````````````````````````````` example\n
    \nbar\n
    \n*foo*\n.\n
    \nbar\n
    \n*foo*\n````````````````````````````````\n\n\nHTML blocks of type 7 cannot interrupt a paragraph:\n" - "\n```````````````````````````````` example\nFoo\n\nbaz\n.\n

    Foo\n\nbaz

    \n````````````````````````````````\n\n\nThis rule differs from John Gruber's original Markdown syntax\nspecification, which says:\n\n" -- "> The only restrictions are that block-level HTML elements —\n> e.g. `
    `, ``, `
    `, `

    `, etc. — must be separated from\n> surrounding content by blank lines, and the start and end tags of the\n" -- "> block should not be indented with spaces or tabs.\n\nIn some ways Gruber's rule is more restrictive than the one given\nhere:\n\n" +- "> " +- "The only restrictions are that block-level HTML elements —\n> e.g. `

    `, `
    `, `
    `, `

    `, etc. — must be separated from\n> surrounding content by blank lines, and the start and end tags of the\n> block should not be indented with spaces or tabs.\n" +- "\nIn some ways Gruber's rule is more restrictive than the one given\nhere:\n\n" - "- It requires that an HTML block be preceded by a blank line.\n- It does not allow the start tag to be indented.\n- It requires a matching end tag, which it also does not allow to\n be indented.\n\n" - "Most Markdown implementations (including some of Gruber's own) do not\nrespect all of these restrictions.\n" - "\nThere is one respect, however, in which Gruber's rule is more liberal\nthan the one given here, since it allows blank lines to occur inside\nan HTML block. There are two reasons for disallowing them here.\n" @@ -362,8 +375,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Link reference definitions\n\n" - "A [link reference definition](@)\nconsists of a [link label], optionally preceded by up to three spaces of\nindentation, followed\nby a colon (`:`), optional spaces or tabs (including up to one\n[line ending]), a [link destination],\n" - "optional spaces or tabs (including up to one\n[line ending]), and an optional [link\ntitle], which if it is present must be separated\nfrom the [link destination] by spaces or tabs.\nNo further character may occur.\n\n" -- "A [link reference definition]\ndoes not correspond to a structural element of a document. Instead, it\ndefines a label which can be used in [reference links]\nand reference-style [images] elsewhere in the document. [Link\n" -- "reference definitions] can come either before or after the links that use\nthem.\n\n```````````````````````````````` example\n[foo]: /url \"title\"\n\n[foo]\n.\n

    foo

    \n````````````````````````````````" +- "A [link reference definition]\ndoes not correspond to a structural element of a document. Instead, it\ndefines a label which can be used in [reference links]\nand reference-style [images] elsewhere in the document. [Link\nreference definitions]" +- " can come either before or after the links that use\nthem.\n\n```````````````````````````````` example\n[foo]: /url \"title\"\n\n[foo]\n.\n

    foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n [foo]: \n /url \n 'the title' \n\n[foo]\n.\n

    foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n[Foo*bar\\]]:my_(url) 'title (with parens)'\n\n[Foo*bar\\]]\n.\n

    Foo*bar]

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n[Foo bar]:\n\n'title'\n\n[Foo bar]\n.\n

    Foo bar

    \n````````````````````````````````\n\n\nThe title may extend over multiple lines:\n" @@ -407,15 +420,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n \n\naaa\n \n\n# aaa\n\n \n.\n

    aaa

    \n

    aaa

    \n````````````````````````````````\n\n\n\n" - "# Container blocks\n\nA [container block](#container-blocks) is a block that has other\nblocks as its contents. There are two basic kinds of container blocks:\n[block quotes] and [list items].\n[Lists] are meta-containers for [list items].\n" - "\nWe define the syntax for container blocks recursively. The general\nform of the definition is:\n\n> If X is a sequence of blocks, then the result of\n> transforming X in such-and-such a way is a container of type Y\n> with these blocks as its content.\n" -- "\nSo, we explain what counts as a block quote or list item by explaining\nhow these can be *generated* from their contents. This should suffice\nto define the syntax, although it does not give a recipe for *parsing*\nthese constructions. " -- "(A recipe is provided below in the section entitled\n[A parsing strategy](#appendix-a-parsing-strategy).)\n\n" +- "\nSo, we explain what counts as a block quote or list item by explaining\nhow these can be *generated* from their contents. This should suffice\nto define the syntax, although it does not give a recipe for *parsing*\n" +- "these constructions. (A recipe is provided below in the section entitled\n[A parsing strategy](#appendix-a-parsing-strategy).)\n\n" - "## Block quotes\n\nA [block quote marker](@),\noptionally preceded by up to three spaces of indentation,\nconsists of (a) the character `>` together with a following space of\nindentation, or (b) a single character `>` not followed by a space of\nindentation.\n" - "\nThe following rules define [block quotes]:\n\n" -- "1. **Basic case.** If a string of lines *Ls* constitute a sequence\n of blocks *Bs*, then the result of prepending a [block quote\n marker] to the beginning of each line in *Ls*\n is a [block quote](#block-quotes) containing *Bs*.\n\n2. **Laziness." -- "** If a string of lines *Ls* constitute a [block\n quote](#block-quotes) with contents *Bs*" -- ", then the result of deleting\n the initial [block quote marker] from one or\n more lines in which the next character other than a space or tab after the\n [block quote marker] is [paragraph continuation\n text] is a block quote with *Bs*" -- " as its content.\n [Paragraph continuation text](@) is text\n that will be parsed as part of the content of a paragraph, but does\n not occur at the beginning of the paragraph.\n\n" -- "3. **Consecutiveness.** A document cannot contain two [block\n quotes] in a row unless there is a [blank line] between them.\n\n" +- "1. **Basic case.** If a string of lines *Ls* constitute a sequence\n of blocks *Bs*, then the result of prepending a [block quote\n marker] to the beginning of each line in *Ls*\n is a [block quote](#block-quotes) containing *Bs*.\n\n" +- "2. " +- "**Laziness.** If a string of lines *Ls* constitute a [block\n quote](#block-quotes) with contents *Bs*, then the result of deleting\n the initial [block quote marker] from one or\n " +- "more lines in which the next character other than a space or tab after the\n [block quote marker] is [paragraph continuation\n text] is a block quote with *Bs* as its content.\n [Paragraph continuation text](@) is text\n " +- "that will be parsed as part of the content of a paragraph, but does\n not occur at the beginning of the paragraph.\n\n3. **Consecutiveness.** A document cannot contain two [block\n quotes] in a row unless there is a [blank line] between them.\n\n" - "Nothing else counts as a [block quote](#block-quotes).\n\nHere is a simple example:\n\n```````````````````````````````` example\n> # Foo\n> bar\n> baz\n.\n
    \n

    Foo

    \n

    bar\nbaz

    \n
    \n````````````````````````````````" - "\n\n\nThe space or tab after the `>` characters can be omitted:\n\n```````````````````````````````` example\n># Foo\n>bar\n> baz\n.\n
    \n

    Foo

    \n

    bar\nbaz

    \n
    \n````````````````````````````````" - "\n\n\nThe `>` characters can be preceded by up to three spaces of indentation:\n\n```````````````````````````````` example\n > # Foo\n > bar\n > baz\n.\n
    \n

    Foo

    \n

    bar\nbaz

    \n
    \n````````````````````````````````" @@ -449,16 +462,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## List items\n\nA [list marker](@) is a\n[bullet list marker] or an [ordered list marker].\n\nA [bullet list marker](@)\nis a `-`, `+`, or `*` character.\n" - "\nAn [ordered list marker](@)\nis a sequence of 1--9 arabic digits (`0-9`), followed by either a\n`.` character or a `)` character. (The reason for the length\nlimit is that with 10 digits we start seeing integer overflows\nin some browsers.)\n" - "\nThe following rules define [list items]:\n\n" -- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n" -- " then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs*" -- " as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n\n " -- "1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n" -- " the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\n" +- "1. " +- "**Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n " +- "then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n " +- "(bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n\n " +- "1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if" +- "\n the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\n" - "For example, let *Ls* be the lines\n" - "\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    \n" - "````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:\n" -- "\n```````````````````````````````` example\n1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n" -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\n" +- "\n```````````````````````````````` example\n" +- "1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n" +- "````````````````````````````````\n\n\n" - "The most important thing to notice is that the position of\nthe text after the list marker determines how much indentation\nis needed in subsequent blocks in the list item. If the list\n" - "marker takes up two spaces of indentation, and there are three spaces between\nthe list marker and the next character other than a space or tab, then blocks\nmust be indented five spaces in order to fall under the list\nitem.\n\n" - "Here are some examples showing how far content must be indented to be\nput under the list item:\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • one
    • \n
    \n

    two

    \n````````````````````````````````" @@ -469,8 +484,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The spaces of indentation after the list marker determine how much relative\nindentation is needed. Which column this indentation reaches will depend on\nhow the list item is embedded in other constructions, as shown by\nthis example:\n\n" - "```````````````````````````````` example\n > > 1. one\n>>\n>> two\n.\n
    \n
    \n
      \n
    1. \n

      one

      \n

      two

      \n
    2. \n
    \n
    \n
    \n````````````````````````````````" - "\n\n\nHere `two` occurs in the same column as the list marker `1.`,\nbut is actually contained in the list item, because there is\nsufficient indentation after the last containing blockquote marker.\n" -- "\nThe converse is also possible. In the following example, the word `two`\noccurs far to the right of the initial text of the list item, `one`, but\nit is not considered part of the list item, because it is not indented\n" -- "far enough past the blockquote marker:\n\n```````````````````````````````` example\n>>- one\n>>\n > > two\n.\n
    \n
    \n
      \n
    • one
    • \n
    \n

    two

    \n
    \n
    \n````````````````````````````````" +- "\nThe converse is also possible. In the following example, the word `two`\noccurs far to the right of the initial text of the list item, `one`, but\nit is not considered part of the list item, because it is not indented\nfar enough past the blockquote marker:" +- "\n\n```````````````````````````````` example\n>>- one\n>>\n > > two\n.\n
    \n
    \n
      \n
    • one
    • \n
    \n

    two

    \n
    \n
    \n````````````````````````````````" - "\n\n\nNote that at least one space or tab is needed between the list marker and\nany following content, so these are not list items:\n\n```````````````````````````````` example\n-one\n\n2.two\n.\n

    -one

    \n

    2.two

    \n````````````````````````````````" - "\n\n\nA list item may contain blocks that are separated by more than\none blank line.\n\n```````````````````````````````` example\n- foo\n\n\n bar\n.\n
      \n
    • \n

      foo

      \n

      bar

      \n
    • \n
    \n````````````````````````````````" - "\n\n\nA list item may contain any kind of block:\n" @@ -481,9 +496,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\nA start number may begin with 0s:\n\n```````````````````````````````` example\n0. ok\n.\n
      \n
    1. ok
    2. \n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n003. ok\n.\n
      \n
    1. ok
    2. \n
    \n````````````````````````````````\n\n\nA start number may not be negative:\n" - "\n```````````````````````````````` example\n-1. not ok\n.\n

    -1. not ok

    \n````````````````````````````````\n\n\n\n" -- "2. **Item starting with indented code.** If a sequence of lines *Ls*\n constitute a sequence of blocks *Bs* starting with an indented code\n block, and *M* is a list marker of width *W* followed by\n" -- " one space of indentation, then the result of prepending *M* and the\n following space to the first line of *Ls*, and indenting subsequent lines\n of *Ls* by *W + 1* spaces, is a list item with *Bs*" -- " as its contents.\n If a line is empty, then it need not be indented. The type of the\n list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a\n " +- "2. **Item starting with indented code.** If a sequence of lines *Ls*\n constitute a sequence of blocks *Bs* starting with an indented code\n block, and *M* is a list marker of width *W* followed by\n " +- "one space of indentation, then the result of prepending *M* and the\n following space to the first line of *Ls*, and indenting subsequent lines\n of *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents.\n " +- "If a line is empty, then it need not be indented. The type of the\n list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a\n " - "start number, based on the ordered list marker.\n\nAn indented code block will have to be preceded by four spaces of indentation\nbeyond the edge of the region where text will be included in the list item.\nIn the following case that is 6 spaces:\n" - "\n```````````````````````````````` example\n- foo\n\n bar\n.\n
      \n
    • \n

      foo

      \n
      bar\n
      \n
    • \n
    \n````````````````````````````````\n\n\nAnd in this case it is 11 spaces:\n" - "\n```````````````````````````````` example\n 10. foo\n\n bar\n.\n
      \n
    1. \n

      foo

      \n
      bar\n
      \n
    2. \n
    \n````````````````````````````````" @@ -492,15 +507,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n1. indented code\n\n paragraph\n\n more code\n.\n
      \n
    1. \n
      indented code\n
      \n

      paragraph

      \n
      more code\n
      \n
    2. \n
    \n````````````````````````````````" - "\n\n\nNote that an additional space of indentation is interpreted as space\ninside the code block:\n" - "\n```````````````````````````````` example\n1. indented code\n\n paragraph\n\n more code\n.\n
      \n
    1. \n
       indented code\n
      \n

      paragraph

      \n
      more code\n
      \n
    2. \n
    \n````````````````````````````````" -- "\n\n\nNote that rules #1 and #2 only apply to two cases: (a) cases\nin which the lines to be included in a list item begin with a\ncharacter other than a space or tab, and (b) cases in which\nthey begin with an indented code\nblock. " -- "In a case like the following, where the first block begins with\nthree spaces of indentation, the rules do not allow us to form a list item by\nindenting the whole thing and prepending a list marker:\n\n" +- "\n\n\nNote that rules #1 and #2 only apply to two cases: (a) cases\nin which the lines to be included in a list item begin with a\ncharacter other than a space or tab, and (b) cases in which\nthey begin with an indented code\n" +- "block. In a case like the following, where the first block begins with\nthree spaces of indentation, the rules do not allow us to form a list item by\nindenting the whole thing and prepending a list marker:\n\n" - "```````````````````````````````` example\n foo\n\nbar\n.\n

    foo

    \n

    bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n- foo\n\n bar\n.\n
      \n
    • foo
    • \n
    \n

    bar

    \n````````````````````````````````" - "\n\n\nThis is not a significant restriction, because when a block is preceded by up to\nthree spaces of indentation, the indentation can always be removed without\na change in interpretation, allowing rule #1 to be applied. So, in\nthe above case:\n" - "\n```````````````````````````````` example\n- foo\n\n bar\n.\n
      \n
    • \n

      foo

      \n

      bar

      \n
    • \n
    \n````````````````````````````````\n\n\n" -- "3. **Item starting with a blank line.** If a sequence of lines *Ls*\n starting with a single [blank line] constitute a (possibly empty)\n sequence of blocks *Bs*, and *M* is a list marker of width *W*,\n" -- " then the result of prepending *M* to the first line of *Ls*, and\n preceding subsequent lines of *Ls* by *W + 1* spaces of indentation, is a\n list item with *Bs*" -- " as its contents.\n If a line is empty, then it need not be indented. The type of the\n list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a\n " -- "start number, based on the ordered list marker.\n\nHere are some list items that start with a blank line but are not empty:\n" +- "3. **Item starting with a blank line.** If a sequence of lines *Ls*\n starting with a single [blank line] constitute a (possibly empty)\n sequence of blocks *Bs*, and *M* is a list marker of width *W*,\n then the result of prepending *M*" +- " to the first line of *Ls*, and\n preceding subsequent lines of *Ls* by *W + 1* spaces of indentation, is a\n list item with *Bs* as its contents.\n If a line is empty, then it need not be indented. The type of the\n " +- "list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a\n start number, based on the ordered list marker.\n\n" +- "Here are some list items that start with a blank line but are not empty:\n" - "\n```````````````````````````````` example\n-\n foo\n-\n ```\n bar\n ```\n-\n baz\n.\n
      \n
    • foo
    • \n
    • \n
      bar\n
      \n
    • \n
    • \n
      baz\n
      \n
    • \n
    \n````````````````````````````````" - "\n\nWhen the list item starts with a blank line, the number of spaces\nfollowing the list marker doesn't change the required indentation:\n\n```````````````````````````````` example\n- \n foo\n.\n
      \n
    • foo
    • \n
    \n````````````````````````````````" - "\n\n\nA list item can begin with at most one blank line.\nIn the following example, `foo` is not part of the list\nitem:\n\n```````````````````````````````` example\n-\n\n foo\n.\n
      \n
    • \n
    \n

    foo

    \n````````````````````````````````" @@ -509,22 +524,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\nHere is an empty ordered list item:\n\n```````````````````````````````` example\n1. foo\n2.\n3. bar\n.\n
      \n
    1. foo
    2. \n
    3. \n
    4. bar
    5. \n
    \n````````````````````````````````\n\n\nA list may start or end with an empty list item:\n" - "\n```````````````````````````````` example\n*\n.\n
      \n
    • \n
    \n````````````````````````````````\n\nHowever, an empty list item cannot interrupt a paragraph:\n" - "\n```````````````````````````````` example\nfoo\n*\n\nfoo\n1.\n.\n

    foo\n*

    \n

    foo\n1.

    \n````````````````````````````````\n\n\n" -- "4. **Indentation.** If a sequence of lines *Ls* constitutes a list item\n according to rule #1, #2, or #3, then the result of preceding each line\n of *Ls* by up to three spaces of indentation (the same for each line) also\n" -- " constitutes a list item with the same contents and attributes. If a line is\n empty, then it need not be indented.\n\nIndented one space:\n" -- "\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n" -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nIndented two spaces:\n" -- "\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n" -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nIndented three spaces:\n" -- "\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n" -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nFour spaces indent gives a code block:\n" +- "4. **Indentation.** If a sequence of lines *Ls* constitutes a list item\n according to rule #1, #2, or #3, then the result of preceding each line\n of *Ls* by up to three spaces of indentation (the same for each line) also\n " +- "constitutes a list item with the same contents and attributes. If a line is\n empty, then it need not be indented.\n\nIndented one space:\n" +- "\n```````````````````````````````` example\n" +- " 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n" +- "````````````````````````````````\n\n\nIndented two spaces:\n" +- "\n```````````````````````````````` example\n" +- " 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n" +- "````````````````````````````````\n\n\nIndented three spaces:\n" +- "\n```````````````````````````````` example\n" +- " 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n" +- "````````````````````````````````\n\n\nFour spaces indent gives a code block:\n" - "\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
    1.  A paragraph\n    with two lines.\n\n        indented code\n\n    > A block quote.\n
    \n" - "````````````````````````````````\n\n\n\n" -- "5. **Laziness.** If a string of lines *Ls* constitute a [list\n item](#list-items) with contents *Bs*, then the result of deleting\n some or all of the indentation from one or more lines in which the\n" -- " next character other than a space or tab after the indentation is\n [paragraph continuation text] is a\n list item with the same contents and attributes. The unindented\n lines are called\n [lazy continuation line](@)s.\n\n" +- "5. **Laziness.** If a string of lines *Ls* constitute a [list\n item](#list-items) with contents *Bs*, then the result of deleting\n some or all of the indentation from one or more lines in which the\n " +- "next character other than a space or tab after the indentation is\n [paragraph continuation text] is a\n list item with the same contents and attributes. The unindented\n lines are called\n [lazy continuation line](@)s.\n\n" - "Here is an example with [lazy continuation lines]:\n" -- "\n```````````````````````````````` example\n 1. A paragraph\nwith two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n" -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nIndentation can be partially deleted:\n" -- "\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n.\n
      \n
    1. A paragraph\nwith two lines.
    2. \n
    \n````````````````````````````````\n\n\nThese examples show how laziness can work in nested structures:\n" +- "\n```````````````````````````````` example\n" +- " 1. A paragraph\nwith two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n" +- "````````````````````````````````\n\n\nIndentation can be partially deleted:\n\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n.\n
      \n
    1. A paragraph\nwith two lines.
    2. \n
    \n````````````````````````````````" +- "\n\n\nThese examples show how laziness can work in nested structures:\n" - "\n```````````````````````````````` example\n> 1. > Blockquote\ncontinued here.\n.\n
    \n
      \n
    1. \n
      \n

      Blockquote\ncontinued here.

      \n
      \n
    2. \n
    \n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n> 1. > Blockquote\n> continued here.\n.\n
    \n
      \n
    1. \n
      \n

      Blockquote\ncontinued here.

      \n
      \n
    2. \n
    \n
    \n````````````````````````````````\n\n\n\n" - "6. **That's all.** Nothing that is not counted as a list item by rules\n #1--5 counts as a [list item](#list-items).\n\n" @@ -537,8 +556,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n1. - 2. foo\n.\n
      \n
    1. \n
        \n
      • \n
          \n
        1. foo
        2. \n
        \n
      • \n
      \n
    2. \n
    \n````````````````````````````````\n\n\nA list item can contain a heading:\n" - "\n```````````````````````````````` example\n- # Foo\n- Bar\n ---\n baz\n.\n
      \n
    • \n

      Foo

      \n
    • \n
    • \n

      Bar

      \nbaz
    • \n
    \n````````````````````````````````\n\n\n" - "### Motivation\n\nJohn Gruber's Markdown spec says the following about list items:\n\n" -- "1. \"List markers typically start at the left margin, but may be indented\n by up to three spaces. List markers must be followed by one or more\n spaces or a tab.\"\n\n2. \"To make lists look nice, you can wrap items with hanging indents....\n" -- " But if you don't want to, you don't have to.\"\n\n3. \"List items may consist of multiple paragraphs. Each subsequent\n paragraph in a list item must be indented by either 4 spaces or one\n tab.\"\n\n" +- "1. \"List markers typically start at the left margin, but may be indented\n by up to three spaces. List markers must be followed by one or more\n spaces or a tab.\"\n\n" +- "2. \"To make lists look nice, you can wrap items with hanging indents....\n But if you don't want to, you don't have to.\"\n\n" +- "3. \"List items may consist of multiple paragraphs. Each subsequent\n paragraph in a list item must be indented by either 4 spaces or one\n tab.\"\n\n" - "4. \"It looks nice if you indent every line of the subsequent paragraphs,\n but here again, Markdown will allow you to be lazy.\"\n\n5. \"To put a blockquote within a list item, the blockquote's `>`\n delimiters need to be indented.\"\n\n" - "6. \"To put a code block within a list item, the code block needs to be\n indented twice — 8 spaces or two tabs.\"\n\n" - "These rules specify that a paragraph under a list item must be indented\nfour spaces (presumably, from the left margin, rather than the start of\nthe list marker, but this is not said), and that code under a list item\n" @@ -558,8 +578,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nWould it help to adopt a two-space rule? The problem is that such\na rule, together with the rule allowing up to three spaces of indentation for\nthe initial list marker, allows text that is indented *less than* the\n" - "original list marker to be included in the list item. For example,\n`Markdown.pl` parses\n\n``` markdown\n - one\n\n two\n```\n\nas a single list item, with `two` a continuation paragraph:\n\n``` html\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n```\n\nand similarly\n" - "\n``` markdown\n> - one\n>\n> two\n```\n\nas\n\n``` html\n
    \n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n
    \n```\n\nThis is extremely unintuitive.\n" -- "\nRather than requiring a fixed indent from the margin, we could require\na fixed indent (say, two spaces, or even one space) from the list marker (which\nmay itself be indented). This proposal would remove the last anomaly\ndiscussed. " -- "Unlike the spec presented above, it would count the following\nas a list item with a subparagraph, even though the paragraph `bar`\nis not indented as far as the first paragraph `foo`:\n\n``` markdown\n 10. foo\n\n bar \n```" +- "\nRather than requiring a fixed indent from the margin, we could require\na fixed indent (say, two spaces, or even one space) from the list marker (which\nmay itself be indented). This proposal would remove the last anomaly\n" +- "discussed. Unlike the spec presented above, it would count the following\nas a list item with a subparagraph, even though the paragraph `bar`\nis not indented as far as the first paragraph `foo`:\n\n``` markdown\n 10. foo\n\n bar \n```" - "\n\nArguably this text does read like a list item with `bar` as a subparagraph,\nwhich may count in favor of the proposal. However, on this proposal indented\ncode would have to be indented six spaces after the list marker. And this\n" - "would break a lot of existing Markdown, which has the pattern:\n\n``` markdown\n1. foo\n\n indented code\n```" - "\n\nwhere the code is indented eight spaces. The spec above, by contrast, will\nparse this text as expected, since the code block's indentation is measured\nfrom the beginning of `foo`.\n" @@ -567,8 +587,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that in such cases, we require one space indentation from the list marker\n(and then the normal four spaces for the indented code). This will match the\nfour-space rule in cases where the list marker plus its initial indentation\n" - "takes four spaces (a common case), but diverge in other cases.\n\n" - "## Lists\n\nA [list](@) is a sequence of one or more\nlist items [of the same type]. The list items\nmay be separated by any number of blank lines.\n" -- "\nTwo list items are [of the same type](@)\nif they begin with a [list marker] of the same type.\nTwo list markers are of the\nsame type if (a) they are bullet list markers using the same character\n" -- "(`-`, `+`, or `*`) or (b) they are ordered list numbers with the same\ndelimiter (either `.` or `)`).\n\n" +- "\nTwo list items are [of the same type](@)\nif they begin with a [list marker] of the same type.\nTwo list markers are of the\nsame type if (a) they are bullet list markers using the same character\n(`-`, `+`, or `*`" +- ") or (b) they are ordered list numbers with the same\ndelimiter (either `.` or `)`).\n\n" - "A list is an [ordered list](@)\nif its constituent list items begin with\n[ordered list markers], and a\n[bullet list](@) if its constituent list\nitems begin with [bullet list markers].\n" - "\nThe [start number](@)\nof an [ordered list] is determined by the list number of\nits initial list item. The numbers of subsequent list items are\ndisregarded.\n" - "\nA list is [loose](@) if any of its constituent\nlist items are separated by blank lines, or if any of its constituent\nlist items directly contain two block-level elements with a blank line\nbetween them. Otherwise a list is [tight](@).\n" @@ -621,9 +641,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n`hi`lo`\n.\n

    hilo`

    \n````````````````````````````````\n\n`hi` is parsed as code, leaving the backtick at the end as a literal\nbacktick.\n\n\n\n" - "## Code spans\n\nA [backtick string](@)\nis a string of one or more backtick characters (`` ` ``) that is neither\npreceded nor followed by a backtick.\n" - "\nA [code span](@) begins with a backtick string and ends with\na backtick string of equal length. The contents of the code span are\nthe characters between these two backtick strings, normalized in the\nfollowing ways:\n\n" -- "- First, [line endings] are converted to [spaces].\n- If the resulting string both begins *and* ends with a [space]\n character, but does not consist entirely of [space]\n characters, a single [space] character is removed from the\n front and back. " -- "This allows you to include code that begins\n or ends with backtick characters, which must be separated by\n whitespace from the opening or closing backtick strings.\n\nThis is a simple code span:\n" -- "\n```````````````````````````````` example\n`foo`\n.\n

    foo

    \n````````````````````````````````\n\n\nHere two backticks are used, because the code contains a backtick.\nThis example also illustrates stripping of a single leading and\ntrailing space:\n" +- "- First, [line endings] are converted to [spaces].\n" +- "- If the resulting string both begins *and* ends with a [space]\n character, but does not consist entirely of [space]\n characters, a single [space] character is removed from the\n front and back. This allows you to include code that begins\n " +- "or ends with backtick characters, which must be separated by\n whitespace from the opening or closing backtick strings.\n\nThis is a simple code span:\n\n```````````````````````````````` example\n`foo`\n.\n

    foo

    \n````````````````````````````````" +- "\n\n\nHere two backticks are used, because the code contains a backtick.\nThis example also illustrates stripping of a single leading and\ntrailing space:\n" - "\n```````````````````````````````` example\n`` foo ` bar ``\n.\n

    foo ` bar

    \n````````````````````````````````\n\n\nThis example shows the motivation for stripping leading and trailing\nspaces:\n" - "\n```````````````````````````````` example\n` `` `\n.\n

    ``

    \n````````````````````````````````\n\nNote that only *one* space is stripped:\n" - "\n```````````````````````````````` example\n` `` `\n.\n

    ``

    \n````````````````````````````````\n\nThe stripping only happens if the space is on both\nsides of the string:\n" @@ -655,40 +676,49 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n``` markdown\ninternal emphasis: foo*bar*baz\nno emphasis: foo_bar_baz\n```\n\nThe rules given below capture all of these patterns, while allowing\nfor efficient parsing strategies that do not backtrack.\n" - "\nFirst, some definitions. A [delimiter run](@) is either\na sequence of one or more `*` characters that is not preceded or\nfollowed by a non-backslash-escaped `*` character, or a sequence\nof one or more `_` characters that is not preceded or followed by\n" - "a non-backslash-escaped `_` character.\n\n" -- "A [left-flanking delimiter run](@) is\na [delimiter run] that is (1) not followed by [Unicode whitespace],\nand either (2a) not followed by a [Unicode punctuation character], or\n(2b) followed by a [Unicode punctuation character] and\n" -- "preceded by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\n" -- "A [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\n" -- "followed by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n" -- " - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n - Both left and right-flanking:\n\n ```\n" -- " abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```\n\n" +- "A [left-flanking delimiter run](@) is\na [delimiter run] that is (1) not followed by [Unicode whitespace],\nand either (2a) not followed by a [Unicode punctuation character], or\n(2b) followed by a [Unicode punctuation character] and\npreceded by [" +- "Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\n" +- "A [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [" +- "Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n" +- " - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n" +- " - Both left and right-flanking:\n\n ```\n abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```\n\n" - "(The idea of distinguishing left-flanking and right-flanking\ndelimiter runs based on the character before and the character\nafter comes from Roopesh Chander's\n" -- "[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags)" -- ".\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\" and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:\n\n" -- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n" -- " and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n" -- "3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n" -- "4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n" -- " followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n" -- "6. A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n" -- " preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n" -- "8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n" -- " followed by a [Unicode punctuation character].\n\n" -- "9. Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. The\n" -- " opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n " -- "must not be a multiple of 3 unless both lengths are\n multiples of 3.\n\n" -- "10. Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The\n" -- " opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing\n " +- "[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\"" +- " and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:\n\n" +- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n" +- "2. " +- "A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [" +- "Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n" +- "4. " +- "A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [" +- "Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n" +- "6. " +- "A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [" +- "Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n" +- "8. " +- "A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [" +- "Unicode punctuation character].\n\n" +- "9. " +- "Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate" +- "\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are\n " +- "multiples of 3.\n\n" +- "10. " +- "Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The\n " +- "opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing\n " - "delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3.\n\n11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n" - "12. A literal `_` character cannot occur at the beginning or end of\n `_`-delimited emphasis or `__`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n" - "Where rules 1--12 above are compatible with multiple parsings,\nthe following principles resolve ambiguity:\n\n" -- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n" -- " preferred to `...`.\n\n" -- "15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n" -- " `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n" -- "16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n" -- " `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n" -- "17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. " -- "Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`.\n\nThese rules can be illustrated through a series of examples.\n\nRule 1:\n" +- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n" +- "14. An interpretation `...` is always\n preferred to `...`.\n\n" +- "15. " +- "When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_`" +- " rather\n than `*foo bar* baz`.\n\n" +- "16. " +- "When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`" +- "\n rather than `foo **bar baz`.\n\n" +- "17. " +- "Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, " +- "`*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`.\n\nThese rules can be illustrated through a series of examples.\n\nRule 1:\n" - "\n```````````````````````````````` example\n*foo bar*\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n" - "\n```````````````````````````````` example\na * foo bar*\n.\n

    a * foo bar*

    \n````````````````````````````````" - "\n\n\nThis is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n" @@ -809,20 +839,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A link contains [link text] (the visible text), a [link destination]\n(the URI that is the link destination), and optionally a [link title].\nThere are two basic kinds of links in Markdown. In [inline links] the\n" - "destination and title are given immediately after the link text. In\n[reference links] the destination and title are defined elsewhere in\nthe document.\n\n" - "A [link text](@) consists of a sequence of zero or more\ninline elements enclosed by square brackets (`[` and `]`). The\nfollowing rules apply:\n\n" -- "- Links may not contain other links, at any level of nesting. If\n multiple otherwise valid link definitions appear nested inside each\n other, the inner-most definition is used.\n\n- Brackets are allowed in the [link text] only if (a) they\n" -- " are backslash-escaped or (b) they appear as a matched pair of brackets,\n with an open bracket `[`, a sequence of zero or more inlines, and\n a close bracket `]`.\n\n" +- "- Links may not contain other links, at any level of nesting. If\n multiple otherwise valid link definitions appear nested inside each\n other, the inner-most definition is used.\n\n" +- "- Brackets are allowed in the [link text] only if (a) they\n are backslash-escaped or (b) they appear as a matched pair of brackets,\n with an open bracket `[`, a sequence of zero or more inlines, and\n a close bracket `]`.\n\n" - "- Backtick [code spans], [autolinks], and raw [HTML tags] bind more tightly\n than the brackets in link text. Thus, for example,\n `` [foo`]` `` could not be a link text, since the second `]`\n is part of a code span.\n\n" - "- The brackets in link text bind more tightly than markers for\n [emphasis and strong emphasis]. Thus, for example, `*[foo*](url)` is a link.\n\nA [link destination](@) consists of either\n\n" -- "- a sequence of zero or more characters between an opening `<` and a\n closing `>` that contains no line endings or unescaped\n `<` or `>` characters, or\n\n- a nonempty sequence of characters that does not start with `<`,\n" -- " does not include [ASCII control characters][ASCII control character]\n or [space] character, and includes parentheses only if (a) they are\n backslash-escaped or (b) they are part of a balanced pair of\n unescaped parentheses.\n " -- "(Implementations may impose limits on parentheses nesting to\n avoid performance issues, but at least three levels of nesting\n should be supported.)\n\nA [link title](@) consists of either\n\n" -- "- a sequence of zero or more characters between straight double-quote\n characters (`\"`), including a `\"` character only if it is\n backslash-escaped, or\n\n- a sequence of zero or more characters between straight single-quote\n" -- " characters (`'`), including a `'` character only if it is\n backslash-escaped, or\n\n- a sequence of zero or more characters between matching parentheses\n (`(...)`), including a `(` or `)` character only if it is\n backslash-escaped.\n\n" +- "- a sequence of zero or more characters between an opening `<` and a\n closing `>` that contains no line endings or unescaped\n `<` or `>` characters, or\n\n" +- "- " +- "a nonempty sequence of characters that does not start with `<`,\n does not include [ASCII control characters][ASCII control character]\n or [space] character, and includes parentheses only if (a) they are\n " +- "backslash-escaped or (b) they are part of a balanced pair of\n unescaped parentheses.\n (Implementations may impose limits on parentheses nesting to\n avoid performance issues, but at least three levels of nesting\n should be supported.)\n\n" +- "A [link title](@) consists of either\n\n" +- "- a sequence of zero or more characters between straight double-quote\n characters (`\"`), including a `\"` character only if it is\n backslash-escaped, or\n\n" +- "- a sequence of zero or more characters between straight single-quote\n characters (`'`), including a `'` character only if it is\n backslash-escaped, or\n\n" +- "- a sequence of zero or more characters between matching parentheses\n (`(...)`), including a `(` or `)` character only if it is\n backslash-escaped.\n\n" - "Although [link titles] may span multiple lines, they may not contain\na [blank line].\n" -- "\nAn [inline link](@) consists of a [link text] followed immediately\nby a left parenthesis `(`, an optional [link destination], an optional\n[link title], and a right parenthesis `)`.\n" -- "These four components may be separated by spaces, tabs, and up to one line\nending.\nIf both [link destination] and [link title] are present, they *must* be\nseparated by spaces, tabs, and up to one line ending.\n\n" -- "The link's text consists of the inlines contained\nin the [link text] (excluding the enclosing square brackets).\nThe link's URI consists of the link destination, excluding enclosing\n`<...>` if present, with backslash-escapes in effect as described\nabove. " -- "The link's title consists of the link title, excluding its\nenclosing delimiters, with backslash-escapes in effect as described\nabove.\n\nHere is a simple inline link:\n" +- "\nAn [inline link](@) consists of a [link text] followed immediately\nby a left parenthesis `(`, an optional [link destination], an optional\n[link title], and a right parenthesis `)`.\nThese four components may be separated by spaces, tabs, and up to one line" +- "\nending.\nIf both [link destination] and [link title] are present, they *must* be\nseparated by spaces, tabs, and up to one line ending.\n\n" +- "The link's text consists of the inlines contained\nin the [link text] (excluding the enclosing square brackets).\nThe link's URI consists of the link destination, excluding enclosing\n`<...>` if present, with backslash-escapes in effect as described\n" +- "above. The link's title consists of the link title, excluding its\nenclosing delimiters, with backslash-escapes in effect as described\nabove.\n\nHere is a simple inline link:\n" - "\n```````````````````````````````` example\n[link](/uri \"title\")\n.\n

    link

    \n````````````````````````````````\n\n\nThe title, the link text and even \nthe destination may be omitted:\n" - "\n```````````````````````````````` example\n[link](/uri)\n.\n

    link

    \n````````````````````````````````\n\n```````````````````````````````` example\n[](./target.md)\n.\n

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n[link]()\n.\n

    link

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n[link](<>)\n.\n

    link

    \n````````````````````````````````" @@ -839,12 +872,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n[link](foo\\(and\\(bar\\))\n.\n

    link

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n[link]()\n.\n

    link

    \n````````````````````````````````\n\n\nParentheses and other symbols can also be escaped, as usual\nin Markdown:\n" - "\n```````````````````````````````` example\n[link](foo\\)\\:)\n.\n

    link

    \n````````````````````````````````\n\n\nA link can contain fragment identifiers and queries:\n" -- "\n```````````````````````````````` example\n[link](#fragment)\n\n[link](https://example.com#fragment)\n\n[link](https://example.com?foo=3#frag)\n.\n

    link

    \n

    link

    \n" -- "

    link

    \n````````````````````````````````\n\n\nNote that a backslash before a non-escapable character is\njust a backslash:\n" -- "\n```````````````````````````````` example\n[link](foo\\bar)\n.\n

    link

    \n````````````````````````````````" -- "\n\n\nURL-escaping should be left alone inside the destination, as all\nURL-escaped characters are also valid URL characters. Entity and\nnumerical character references in the destination will be parsed\ninto the corresponding Unicode code points, as usual. " -- "These may\nbe optionally URL-escaped when written as HTML, but this spec\ndoes not enforce any particular policy for rendering URLs in\nHTML or other formats. Renderers may make different decisions\nabout how to escape or normalize URLs in the output.\n\n" -- "```````````````````````````````` example\n[link](foo%20bä)\n.\n

    link

    \n````````````````````````````````" +- "\n```````````````````````````````` example\n" +- "[link](#fragment)\n\n[link](https://example.com#fragment)\n\n[link](https://example.com?foo=3#frag)\n.\n

    link

    \n

    link

    \n

    link

    \n" +- "````````````````````````````````\n\n\nNote that a backslash before a non-escapable character is\njust a backslash:\n\n```````````````````````````````` example\n[link](foo\\bar)\n.\n

    link

    \n````````````````````````````````" +- "\n\n\nURL-escaping should be left alone inside the destination, as all\nURL-escaped characters are also valid URL characters. Entity and\nnumerical character references in the destination will be parsed\n" +- "into the corresponding Unicode code points, as usual. These may\nbe optionally URL-escaped when written as HTML, but this spec\ndoes not enforce any particular policy for rendering URLs in\nHTML or other formats. Renderers may make different decisions\n" +- "about how to escape or normalize URLs in the output.\n\n```````````````````````````````` example\n[link](foo%20bä)\n.\n

    link

    \n````````````````````````````````" - "\n\n\nNote that, because titles can often be parsed as destinations,\nif you try to omit the destination and keep the title, you'll\nget unexpected results:\n" - "\n```````````````````````````````` example\n[link](\"title\")\n.\n

    link

    \n````````````````````````````````\n\n\nTitles may be in single quotes, double quotes, or parentheses:\n" - "\n```````````````````````````````` example\n[link](/url \"title\")\n[link](/url 'title')\n[link](/url (title))\n.\n

    link\nlink\nlink

    \n" @@ -854,10 +887,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n[link](/url \"title\")\n.\n

    link

    \n````````````````````````````````\n\n\nNested balanced quotes are not allowed without escaping:\n" - "\n```````````````````````````````` example\n[link](/url \"title \"and\" title\")\n.\n

    [link](/url "title "and" title")

    \n````````````````````````````````\n\n\nBut it is easy to work around this by using a different quote type:\n" - "\n```````````````````````````````` example\n[link](/url 'title \"and\" title')\n.\n

    link

    \n````````````````````````````````" -- "\n\n\n(Note: `Markdown.pl` did allow double quotes inside a double-quoted\ntitle, and its test suite included a test demonstrating this.\nBut it is hard to see a good rationale for the extra complexity this\n" -- "brings, since there are already many ways---backslash escaping,\nentity and numeric character references, or using a different\nquote type for the enclosing title---to write titles containing\ndouble quotes. `Markdown.pl`" -- "'s handling of titles has a number\nof other strange features. For example, it allows single-quoted\ntitles in inline links, but not reference links. And, in\nreference links but not inline links, it allows a title to begin\nwith `\"` and end with `)`. " -- "`Markdown.pl` 1.0.1 even allows\ntitles with no closing quotation mark, though 1.0.2b8 does not.\nIt seems preferable to adopt a simple, rational rule that works\nthe same way in inline links and link reference definitions.)\n\n" +- "\n\n\n(Note: `Markdown.pl` did allow double quotes inside a double-quoted\ntitle, and its test suite included a test demonstrating this.\nBut it is hard to see a good rationale for the extra complexity this\nbrings, since there are already many ways---" +- "backslash escaping,\nentity and numeric character references, or using a different\nquote type for the enclosing title---to write titles containing\ndouble quotes. `Markdown.pl`'s handling of titles has a number\n" +- "of other strange features. For example, it allows single-quoted\ntitles in inline links, but not reference links. And, in\nreference links but not inline links, it allows a title to begin\nwith `\"` and end with `)`. `Markdown.pl` 1.0.1 even allows\n" +- "titles with no closing quotation mark, though 1.0.2b8 does not.\nIt seems preferable to adopt a simple, rational rule that works\nthe same way in inline links and link reference definitions.)\n\n" - "Spaces, tabs, and up to one line ending is allowed around the destination and\ntitle:\n\n```````````````````````````````` example\n[link]( /uri\n \"title\" )\n.\n

    link

    \n````````````````````````````````" - "\n\n\nBut it is not allowed between the link text and the\nfollowing parenthesis:\n\n```````````````````````````````` example\n[link] (/uri)\n.\n

    [link] (/uri)

    \n````````````````````````````````" - "\n\n\nThe link text may contain balanced brackets, but not unbalanced ones,\nunless they are escaped:\n\n```````````````````````````````` example\n[link [foo [bar]]](/uri)\n.\n

    link [foo [bar]]

    \n````````````````````````````````" @@ -898,8 +931,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n[Foo\n bar]: /url\n\n[Baz][Foo bar]\n.\n

    Baz

    \n````````````````````````````````\n\n\nNo spaces, tabs, or line endings are allowed between the [link text] and the\n[link label]:\n" - "\n```````````````````````````````` example\n[foo] [bar]\n\n[bar]: /url \"title\"\n.\n

    [foo] bar

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n[foo]\n[bar]\n\n[bar]: /url \"title\"\n.\n

    [foo]\nbar

    \n````````````````````````````````" -- "\n\n\nThis is a departure from John Gruber's original Markdown syntax\ndescription, which explicitly allows whitespace between the link\ntext and the link label. It brings reference links in line with\n" -- "[inline links], which (according to both original Markdown and\nthis spec) cannot have whitespace after the link text. More\nimportantly, it prevents inadvertent capture of consecutive\n[shortcut reference links]. If whitespace is allowed between the\n" +- "\n\n\nThis is a departure from John Gruber's original Markdown syntax\ndescription, which explicitly allows whitespace between the link\ntext and the link label. It brings reference links in line with\n[inline links]" +- ", which (according to both original Markdown and\nthis spec) cannot have whitespace after the link text. More\nimportantly, it prevents inadvertent capture of consecutive\n[shortcut reference links]. If whitespace is allowed between the\n" - "link text and the link label, then in the following we will have\na single reference link, not two shortcut reference links, as\nintended:\n\n``` markdown\n[foo]\n[bar]\n\n[foo]: /url1\n[bar]: /url2\n```" - "\n\n(Note that [shortcut reference links] were introduced by Gruber\nhimself in a beta version of `Markdown.pl`, but never included\nin the official syntax description. Without shortcut reference\nlinks, it is harmless to allow space between the link text and\n" - "link label; but once shortcut references are introduced, it is\ntoo dangerous to allow this, as it frequently leads to\nunintended results.)\n\nWhen there are multiple matching [link reference definitions],\nthe first is used:\n" @@ -912,8 +945,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n[foo][ref\\[]\n\n[ref\\[]: /uri\n.\n

    foo

    \n````````````````````````````````\n\n\nNote that in this example `]` is not backslash-escaped:\n" - "\n```````````````````````````````` example\n[bar\\\\]: /uri\n\n[bar\\\\]\n.\n

    bar\\

    \n````````````````````````````````\n\n\nA [link label] must contain at least one character that is not a space, tab, or\nline ending:\n" - "\n```````````````````````````````` example\n[]\n\n[]: /uri\n.\n

    []

    \n

    []: /uri

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n[\n ]\n\n[\n ]: /uri\n.\n

    [\n]

    \n

    [\n]: /uri

    \n````````````````````````````````" -- "\n\n\nA [collapsed reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument, followed by the string `[]`.\nThe contents of the link label are parsed as inlines,\nwhich are used as the link's text. " -- "The link's URI and title are\nprovided by the matching reference link definition. Thus,\n`[foo][]` is equivalent to `[foo][foo]`.\n\n" +- "\n\n\nA [collapsed reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument, followed by the string `[]`.\nThe contents of the link label are parsed as inlines,\nwhich are used as the link'" +- "s text. The link's URI and title are\nprovided by the matching reference link definition. Thus,\n`[foo][]` is equivalent to `[foo][foo]`.\n\n" - "```````````````````````````````` example\n[foo][]\n\n[foo]: /url \"title\"\n.\n

    foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n[*foo* bar][]\n\n[*foo* bar]: /url \"title\"\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThe link labels are case-insensitive:\n" - "\n```````````````````````````````` example\n[Foo][]\n\n[foo]: /url \"title\"\n.\n

    Foo

    \n````````````````````````````````" @@ -944,8 +977,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\n![foo *bar*]\n\n[foo *bar*]: train.jpg \"train & tracks\"\n.\n

    \"foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n![foo ![bar](/url)](/url2)\n.\n

    \"foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n![foo [bar](/url)](/url2)\n.\n

    \"foo

    \n````````````````````````````````" -- "\n\n\nThough this spec is concerned with parsing, not rendering, it is\nrecommended that in rendering to HTML, only the plain string content\nof the [image description] be used. Note that in\nthe above example, the alt attribute's value is `foo bar`, not `foo\n" -- "[bar](/url)` or `foo bar`. Only the plain string\ncontent is rendered, without formatting.\n\n" +- "\n\n\nThough this spec is concerned with parsing, not rendering, it is\nrecommended that in rendering to HTML, only the plain string content\nof the [image description] be used. Note that in\nthe above example, the alt attribute's value is `foo bar`, not " +- "`foo\n[bar](/url)` or `foo bar`. Only the plain string\ncontent is rendered, without formatting.\n\n" - "```````````````````````````````` example\n![foo *bar*][]\n\n[foo *bar*]: train.jpg \"train & tracks\"\n.\n

    \"foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n![foo *bar*][foobar]\n\n[FOOBAR]: train.jpg \"train & tracks\"\n.\n

    \"foo

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n![foo](train.jpg)\n.\n

    \"foo\"

    \n````````````````````````````````" @@ -1036,8 +1069,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n```````````````````````````````` example\nfoo \n.\n

    foo

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n### foo\\\n.\n

    foo\\

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n### foo \n.\n

    foo

    \n````````````````````````````````\n\n\n" - "## Soft line breaks\n\n" -- "A regular line ending (not in a code span or HTML tag) that is not\npreceded by two or more spaces or a backslash is parsed as a\n[softbreak](@). (A soft line break may be rendered in HTML either as a\n[line ending] or as a space. " -- "The result will be the same in\nbrowsers. In the examples here, a [line ending] will be used.)\n\n```````````````````````````````` example\nfoo\nbaz\n.\n

    foo\nbaz

    \n````````````````````````````````" +- "A regular line ending (not in a code span or HTML tag) that is not\npreceded by two or more spaces or a backslash is parsed as a\n[softbreak](@). (A soft line break may be rendered in HTML either as a\n[line ending]" +- " or as a space. The result will be the same in\nbrowsers. In the examples here, a [line ending] will be used.)\n\n```````````````````````````````` example\nfoo\nbaz\n.\n

    foo\nbaz

    \n````````````````````````````````" - "\n\n\nSpaces at the end of the line and beginning of the next line are\nremoved:\n\n```````````````````````````````` example\nfoo \n baz\n.\n

    foo\nbaz

    \n````````````````````````````````" - "\n\n\nA conforming parser may render a soft line break in HTML either as a\nline ending or as a space.\n\nA renderer may also provide an option to render soft line breaks\nas hard line breaks.\n\n" - "## Textual content\n\nAny characters not given an interpretation by the above rules will\nbe parsed as plain textual content.\n\n```````````````````````````````` example\nhello $.;'there\n.\n

    hello $.;'there

    \n````````````````````````````````" @@ -1045,21 +1078,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\nMultiple spaces\n.\n

    Multiple spaces

    \n````````````````````````````````\n\n\n\n\n" - "# Appendix: A parsing strategy\n\nIn this appendix we describe some features of the parsing strategy\nused in the CommonMark reference implementations.\n\n" - "## Overview\n\nParsing has two phases:\n\n" -- "1. In the first phase, lines of input are consumed and the block\nstructure of the document---its division into paragraphs, block quotes,\nlist items, and so on---is constructed. Text is assigned to these\nblocks but not parsed. " -- "Link reference definitions are parsed and a\nmap of links is constructed.\n\n" +- "1. " +- "In the first phase, lines of input are consumed and the block\nstructure of the document---its division into paragraphs, block quotes,\nlist items, and so on---is constructed. Text is assigned to these\n" +- "blocks but not parsed. Link reference definitions are parsed and a\nmap of links is constructed.\n\n" - "2. In the second phase, the raw text contents of paragraphs and headings\nare parsed into sequences of Markdown inline elements (strings,\ncode spans, links, emphasis, and so on), using the map of link\nreferences constructed in phase 1.\n\n" - "At each point in processing, the document is represented as a tree of\n**blocks**. The root of the tree is a `document` block. The `document`\nmay have any number of other blocks as **children**. These children\n" -- "may, in turn, have other blocks as children. The last child of a block\nis normally considered **open**, meaning that subsequent lines of input\ncan alter its contents. (Blocks that are not open are **closed**" -- ".)\nHere, for example, is a possible document tree, with the open blocks\nmarked by arrows:\n\n" -- "``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item\n" -- " -> paragraph\n \"aliquando id\"\n```\n\n" +- "may, in turn, have other blocks as children. The last child of a block\nis normally considered **open**, meaning that subsequent lines of input\ncan alter its contents. (Blocks that are not open are **closed**.)\n" +- "Here, for example, is a possible document tree, with the open blocks\nmarked by arrows:\n\n" +- "``` tree\n" +- "-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item\n -> paragraph\n" +- " \"aliquando id\"\n```\n\n" - "## Phase 1: block structure\n\nEach line that is processed has an effect on this tree. The line is\nanalyzed and, depending on its contents, the document may be altered\nin one or more of the following ways:\n\n" - "1. One or more open blocks may be closed.\n2. One or more new blocks may be created as children of the\n last open block.\n3. Text may be added to the last (deepest) open block remaining\n on the tree.\n\n" - "Once a line has been incorporated into the tree in this way,\nit can be discarded, so input can be read in a stream.\n\nFor each line, we follow this procedure:\n\n" -- "1. First we iterate through the open blocks, starting with the\nroot document, and descending through last children down to the last\nopen block. Each block imposes a condition that the line must satisfy\nif the block is to remain open. " -- "For example, a block quote requires a\n`>` character. A paragraph requires a non-blank line.\nIn this phase we may match all or just some of the open\nblocks. But we cannot close unmatched blocks yet, because we may have a\n[lazy continuation line].\n\n" -- "2. Next, after consuming the continuation markers for existing\nblocks, we look for new block starts (e.g. `>` for a block quote).\nIf we encounter a new block start, we close any blocks unmatched\n" -- "in step 1 before creating the new block as a child of the last\nmatched container block.\n\n" +- "1. " +- "First we iterate through the open blocks, starting with the\nroot document, and descending through last children down to the last\nopen block. Each block imposes a condition that the line must satisfy\n" +- "if the block is to remain open. For example, a block quote requires a\n`>` character. A paragraph requires a non-blank line.\nIn this phase we may match all or just some of the open\nblocks. But we cannot close unmatched blocks yet, because we may have a\n[" +- "lazy continuation line].\n\n" +- "2. " +- "Next, after consuming the continuation markers for existing\nblocks, we look for new block starts (e.g. `>` for a block quote).\nIf we encounter a new block start, we close any blocks unmatched\nin step 1 before creating the new block as a child of the last\n" +- "matched container block.\n\n" - "3. Finally, we look at the remainder of the line (after block\nmarkers like `>`, list markers, and indentation have been consumed).\nThis is text that can be incorporated into the last open\nblock (a paragraph, code block, heading, or raw HTML).\n\n" - "Setext headings are formed when we see a line of a paragraph\nthat is a [setext heading underline].\n" - "\nReference link definitions are detected when a paragraph is closed;\nthe accumulated text lines are parsed to see if they begin with\none or more reference link definitions. Any remainder becomes a\nnormal paragraph.\n" @@ -1068,39 +1106,42 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\ncauses a `block_quote` block to be created as a child of our\nopen `document` block, and a `paragraph` block as a child of\nthe `block_quote`. Then the text is added to the last open\nblock, the `paragraph`:\n" - "\n``` tree\n-> document\n -> block_quote\n -> paragraph\n \"Lorem ipsum dolor\"\n```\n\nThe next line,\n\n``` markdown\nsit amet.\n```\n\nis a \"lazy continuation\" of the open `paragraph`, so it gets added\nto the paragraph's text:\n" - "\n``` tree\n-> document\n -> block_quote\n -> paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n```\n\nThe third line,\n\n``` markdown\n> - Qui *quodsi iracundia*\n```" -- "\n\ncauses the `paragraph` block to be closed, and a new `list` block\nopened as a child of the `block_quote`. A `list_item` is also\nadded as a child of the `list`, and a `paragraph` as a child of\nthe `list_item`. " -- "The text is then added to the new `paragraph`:\n\n" -- "``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n -> list_item\n -> paragraph\n \"Qui *quodsi iracundia*\"\n```\n\nThe fourth line,\n" -- "\n``` markdown\n> - aliquando id\n```" +- "\n\ncauses the `paragraph` block to be closed, and a new `list` block\nopened as a child of the `block_quote`. A `list_item` is also\nadded as a child of the `list`, and a `paragraph` as a child of\nthe `list_item`. The text is then added to the new " +- "`paragraph`:\n\n``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n -> list_item\n -> paragraph\n \"Qui *quodsi iracundia*\"\n```" +- "\n\nThe fourth line,\n\n``` markdown\n> - aliquando id\n```" - "\n\ncauses the `list_item` (and its child the `paragraph`) to be closed,\nand a new `list_item` opened up as child of the `list`. A `paragraph`\nis added as a child of the new `list_item`, to contain the text.\nWe thus obtain the final tree:\n" -- "\n``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item\n" -- " -> paragraph\n \"aliquando id\"\n```\n\n" +- "\n``` tree\n" +- "-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item\n -> paragraph\n" +- " \"aliquando id\"\n```\n\n" - "## Phase 2: inline structure\n\nOnce all of the input has been parsed, all open blocks are closed.\n" - "\nWe then \"walk the tree,\" visiting every node, and parse raw\nstring contents of paragraphs and headings as inlines. At this\npoint we have seen all the link reference definitions, so we can\nresolve reference links as we go.\n" -- "\n``` tree\ndocument\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"\n softbreak\n str \"sit amet.\"\n list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n str \"Qui \"\n emph\n" +- "\n``` tree\n" +- "document\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"\n softbreak\n str \"sit amet.\"\n list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n str \"Qui \"\n emph\n" - " str \"quodsi iracundia\"\n list_item\n paragraph\n str \"aliquando id\"\n```\n\nNotice how the [line ending] in the first paragraph has\nbeen parsed as a `softbreak`, and the asterisks in the first list item\nhave become an `emph`.\n\n" - "### An algorithm for parsing nested emphasis and links\n\nBy far the trickiest part of inline parsing is handling emphasis,\nstrong emphasis, links, and images. This is done using the following\nalgorithm.\n\nWhen we're parsing inlines and we hit either\n\n" - "- a run of `*` or `_` characters, or\n- a `[` or `![`\n\n" - "we insert a text node with these symbols as its literal content, and we\nadd a pointer to this text node to the [delimiter stack](@).\n\nThe [delimiter stack] is a doubly linked list. Each\nelement contains a pointer to a text node, plus information about\n\n" -- "- the type of delimiter (`[`, `![`, `*`, `_`)\n- the number of delimiters,\n- whether the delimiter is \"active\" (all are active to start), and\n- whether the delimiter is a potential opener, a potential closer,\n" -- " or both (which depends on what sort of characters precede\n and follow the delimiters).\n\nWhen we hit a `]` character, we call the *look for link or image*\nprocedure (see below).\n" -- "\nWhen we hit the end of the input, we call the *process emphasis*\nprocedure (see below), with `stack_bottom` = NULL.\n\n" +- "- the type of delimiter (`[`, `![`, `*`, `_`)\n- the number of delimiters,\n- whether the delimiter is \"active\" (all are active to start), and\n" +- "- whether the delimiter is a potential opener, a potential closer,\n or both (which depends on what sort of characters precede\n and follow the delimiters).\n\n" +- "When we hit a `]` character, we call the *look for link or image*\nprocedure (see below).\n\nWhen we hit the end of the input, we call the *process emphasis*\nprocedure (see below), with `stack_bottom` = NULL.\n\n" - "#### *look for link or image*\n\nStarting at the top of the delimiter stack, we look backwards\nthrough the stack for an opening `[` or `![` delimiter.\n\n" -- "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`.\n\n- If we find one and it's active, then we parse ahead to see if\n" -- " we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image.\n\n " -- "+ If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`.\n\n + If we do, then\n\n * We return a link or image node whose children are the inlines\n" -- " after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n" -- " * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)\n\n" +- "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`.\n\n" +- "- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image.\n\n " +- "+ If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`.\n\n" +- " + If we do, then\n\n " +- "* We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n" +- " * We remove the opening delimiter.\n\n * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)\n\n" - "#### *process emphasis*\n\nParameter `stack_bottom` sets a lower bound to how far we\ndescend in the [delimiter stack]. If it is NULL, we can\ngo all the way to the bottom. Otherwise, we stop before\nvisiting `stack_bottom`.\n" - "\nLet `current_position` point to the element on the [delimiter stack]\njust above `stack_bottom` (or the first element if `stack_bottom`\nis NULL).\n" - "\nWe keep track of the `openers_bottom` for each delimiter\ntype (`*`, `_`), indexed to the length of the closing delimiter run\n(modulo 3) and to whether the closing delimiter can also be an\nopener. Initialize this to `stack_bottom`.\n" - "\nThen we repeat the following until we run out of potential\nclosers:\n\n" - "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n" - "- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n- If one is found:\n\n " -- "+ Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n" -- " the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n" -- " + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. " -- "If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:\n\n " +- "+ Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n" +- " + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n" +- " + " +- "Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n " +- "of the delimiter stack. If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:\n\n " - "+ Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n" - " + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\n" - "After we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack.\n" diff --git a/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md.snap index a55e245..1472906 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown@commonmark_spec.md.snap @@ -19,8 +19,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - org/licenses/by- - "sa/4.0/)'\n...\n\n" - "# Introduction\n\n" -- "## What is " -- "Markdown?\n\n" +- "## " +- What is Markdown +- "?\n\n" - "Markdown is a " - "plain text " - "format for " @@ -50,10 +51,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - markdown/syntax) - "\n" - "and a Perl " -- "script (`" -- "Markdown.pl`) " -- "for converting " -- "Markdown to\n" +- script ( +- "`Markdown.pl`" +- ) for converting +- " Markdown to\n" - "HTML. " - "In the next " - "decade, dozens " @@ -105,10 +106,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - As Gruber writes - ":\n\n" -- "> The overriding" -- " design goal for" -- " Markdown's " -- "formatting " +- "> " +- "The overriding " +- "design goal for " +- "Markdown'" +- "s formatting " - "syntax is\n> " - "to make it as " - "readable as " @@ -122,12 +124,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "as-is, as\n> " - "plain text, " - "without looking " -- "like it's been " -- "marked up with " -- "tags\n> " +- "like it'" +- s been marked up +- " with tags\n> " - "or formatting " - "instructions.\n> " -- "()\n\n" @@ -146,7 +149,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "AsciiDoc from " - "the AsciiDoc " - "manual:\n\n" -- "```\n1. " +- "```\n" +- "1. " - "List item one.\n" - "+\n" - "List item one " @@ -190,7 +194,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "And here is the " - "equivalent in " - "Markdown:\n" -- "```\n1. " +- "```\n" +- "1. " - "List item one.\n\n" - " List item " - "one continued " @@ -250,13 +255,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " in the " - "processed " - "document.\n\n" -- "## Why is a spec" -- " needed?\n\n" -- "John Gruber's [" +- "## " +- "Why is a spec " +- "needed?\n\n" +- "John Gruber's " +- "[" - "canonical " - "description of " - "Markdown's\n" -- "syntax](https://" +- syntax +- "](https://" - daringfireball.n - et/projects/ - markdown/syntax) @@ -287,25 +295,27 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " think that\n " - "they, too, must " - be indented four -- " spaces, but `" -- "Markdown.pl` " -- "does\n " +- " spaces, but " +- "`Markdown.pl`" +- " does\n " - not require that - ". " - This is hardly a -- " \"corner case,\" " -- "and divergences\n" -- " between " +- " \"corner case,\"" +- " and divergences" +- "\n " +- "between " - "implementations " - "on this issue " - "often lead to " - "surprises for\n" -- " users in " -- "real documents. " -- "(See [this " -- "comment by John\n" -- " Gruber](" -- "https://" +- " " +- "users in real " +- "documents. (See " +- "[" +- "this comment by " +- "John\n Gruber" +- "](https://" - web.archive.org/ - web/ - 20170611172104/ @@ -335,14 +345,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "parsing (note " - "that some " - "implementations\n" -- " put the " -- "heading inside " -- "the blockquote, " +- " " +- "put the heading " +- "inside the " +- "blockquote, " - "while others do " - "not).\n " - (John Gruber has -- " also spoken [in" -- " favor of " +- " also spoken " +- "[" +- "in favor of " - "requiring the " - "blank\n lines" - "](https://" @@ -360,9 +372,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - needed before an - " indented code " - "block?\n (" -- "`Markdown.pl` " -- "requires it, but" -- " this is not " +- "`Markdown.pl`" +- " requires it, " +- "but this is not " - mentioned in the - "\n " - "documentation, " @@ -379,8 +391,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "exact rule for " - determining when - " list items get\n" -- " wrapped in `" -- "

    `" +- " wrapped in " +- "`

    `" - " tags? " - "Can a list be " - "partially \"loose" @@ -433,8 +445,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " with a thematic" - " break in its " - "second item,\n" -- " or two lists" -- " separated by a " +- " " +- "or two lists " +- "separated by a " - "thematic break?\n" - "\n" - " ``` markdown" @@ -453,9 +466,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "syntax " - "description " - "suggests two,\n" -- " but the perl" -- " scripts and " -- "many other " +- " " +- "but the perl " +- scripts and many +- " other " - "implementations " - "produce one.)\n\n" - " ``` markdown" @@ -477,8 +491,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "take precedence " - "?\n\n " - "``` markdown\n" -- " [a backtick " -- "(`)](/url) and [" +- " " +- "[a backtick (`)]" +- "(/url) and [" - another backtick - " (`)](/url).\n" - " ```\n\n" @@ -494,8 +509,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "following be " - "parsed?\n\n " - "``` markdown\n" -- " *foo *bar* " -- "baz*\n ```\n\n" +- " " +- "*foo *bar* baz*\n" +- " ```\n\n" - "10. " - "What are the " - precedence rules @@ -508,10 +524,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "following be " - "parsed?\n\n " - "``` markdown\n" -- " - `a long " -- "code span can " -- contain a hyphen -- " like this\n " +- " " +- "- `a long code " +- span can contain +- " a hyphen like " +- "this\n " - " - and it can " - "screw things up`" - "\n ```\n\n" @@ -519,8 +536,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Can list items " - "include section " - headings? ( -- "`Markdown.pl` " -- "does not\n " +- "`Markdown.pl`" +- " does not\n " - "allow this, but " - "does allow " - "blockquotes to " @@ -560,19 +577,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " ``` markdown" - "\n " - "[foo]: /url1\n" -- " [foo]: /url2" -- "\n\n [foo][]\n" +- " " +- "[foo]: /url2\n\n" +- " [foo][]\n" - " ```\n\n" - "In the absence " - "of a spec, early" - " implementers " -- "consulted `" -- "Markdown.pl`\n" +- "consulted " +- "`Markdown.pl`\n" - to resolve these - " ambiguities. " -- "But `Markdown.pl" -- "` was quite " -- "buggy, and\n" +- "But " +- "`Markdown.pl`" +- " was quite buggy" +- ", and\n" - "gave manifestly " - "bad results in " - "many cases, so " @@ -605,13 +624,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "worse, because " - "nothing in " - "Markdown counts\n" -- "as a \"syntax " -- "error,\" the " -- divergence often -- " isn't " -- discovered right -- " away.\n\n" -- "## About this " +- "as a \"" +- "syntax error,\"" +- " the divergence " +- "often isn'" +- "t discovered " +- "right away.\n\n" +- "## " +- "About this " - "document\n\n" - "This document " - "attempts to " @@ -629,9 +649,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "conformance " - "tests. An\n" - "accompanying " -- "script `" -- "spec_tests.py` " -- "can be used to " +- "script " +- "`spec_tests.py`" +- " can be used to " - "run the tests\n" - "against any " - Markdown program @@ -685,9 +705,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - counts as a link - "\n" - "destination, but" -- " it doesn't " -- mandate that non -- "-ASCII " +- " it doesn'" +- "t mandate that " +- "non-ASCII " - "characters in\n" - "the URL be " - percent-encoded. @@ -719,20 +739,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "URLs.\n\n" - This document is - " generated from " -- "a text file, `" -- "spec.txt`, " -- "written\n" +- "a text file, " +- "`spec.txt`" +- ", written\n" - in Markdown with - " a small " - "extension for " - the side-by-side - " tests.\n" -- "The script `" -- tools/ -- "makespec.py` can" -- " be used to " -- "convert `" -- "spec.txt` into\n" +- "The script " +- "`tools/" +- "makespec.py`" +- " can be used to " +- "convert " +- "`spec.txt` into\n" - "HTML or " - CommonMark ( - "which can then " @@ -740,18 +760,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "into other " - "formats).\n\n" - "In the examples," -- " the `→` " -- "character is " +- " the `→`" +- " character is " - "used to " - "represent tabs.\n" - "\n" - "# Preliminaries" - "\n\n" -- "## Characters " -- "and lines\n\n" +- "## " +- "Characters and " +- "lines\n\n" - "Any sequence of " -- "[characters] is " -- "a valid " +- "[characters]" +- " is a valid " - "CommonMark\n" - "document.\n\n" - "A [character](@)" @@ -775,31 +796,32 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "encoding; it " - "thinks of lines " - "as composed\nof [" -- "characters] " -- "rather than " +- "characters]" +- " rather than " - "bytes. " - "A conforming " - "parser may be " - "limited\n" - "to a certain " - "encoding.\n\n" -- "A [line](@) is a" -- " sequence of " -- "zero or more [" -- "characters]\n" +- "A [line](@)" +- " is a sequence " +- "of zero or more " +- "[characters]\n" - "other than line " -- "feed (`U+000A`) " -- "or carriage " +- "feed (`U+000A`" +- ") or carriage " - "return (`U+000D`" - "),\n" - "followed by a [" -- "line ending] or " -- "by the end of " -- "file.\n\n" -- "A [line ending](" -- "@) is a line " -- "feed (`U+000A`)," -- " a carriage " +- "line ending]" +- " or by the end " +- "of file.\n\n" +- "A " +- "[line ending](@)" +- " is a line feed " +- "(`U+000A`" +- "), a carriage " - "return\n(`U+000D`" - ") not followed " - "by a line feed, " @@ -812,10 +834,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "characters, or a" - " line containing" - " only spaces\n(" -- "`U+0020`) or " -- "tabs (`U+0009`)," -- " is called a [" -- "blank line](@)." +- "`U+0020`" +- ) or tabs ( +- "`U+0009`" +- "), is called a " +- "[blank line](@)." - "\n\n" - "The following " - "definitions of " @@ -823,65 +846,72 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "classes will be " - "used in this " - "spec:\n\n" -- "A [Unicode " +- "A " +- "[Unicode " - "whitespace " -- "character](@) is" -- " a character in " -- "the Unicode `Zs`" -- " general\n" +- "character](@)" +- " is a character " +- "in the Unicode " +- "`Zs` general\n" - "category, or a " -- "tab (`U+0009`), " -- "line feed (`U+" -- "000A`), form " -- "feed (`U+000C`)," -- " or\n" +- "tab (`U+0009`" +- "), line feed (" +- "`U+000A`" +- "), form feed (" +- "`U+000C`), or\n" - "carriage return " - "(`U+000D`).\n\n" - "[Unicode " -- "whitespace](@) " -- is a sequence of -- " one or more\n[" +- "whitespace](@)" +- " is a sequence " +- "of one or more\n[" - "Unicode " - "whitespace " - "characters].\n\n" -- "A [tab](@) is `U" -- "+0009`.\n\n" +- "A [tab](@) is " +- "`U+0009`.\n\n" - "A [space](@) is " - "`U+0020`.\n\n" -- "An [ASCII " -- "control " -- "character](@) is" -- " a character " -- "between `U+0000–" -- "1F` (both\n" -- "including) or `U" -- "+007F`.\n\n" -- "An [ASCII " +- "An " +- "[ASCII control " +- "character](@)" +- " is a character " +- "between " +- "`U+0000–1F`" +- " (both\n" +- "including) or " +- "`U+007F`.\n\n" +- "An " +- "[ASCII " - "punctuation " - "character](@)\n" - "is `!`, `\"`, `#`" -- ", `$`, `%`, `&`," -- " `'`, `(`, `)`,\n" -- "`*`, `+`, `,`, `" -- "-`, `.`, `/`" +- ", `$`, `%`, `&`" +- ", `'`, `(`, `)`," +- "\n`*`, `+`, `,`, " +- "`-`, `.`, `/`" - " (U+0021–2F), \n" -- "`:`, `;`, `<`, `" -- "=`, `>`, `?`, " -- "`@` (U+003A–0040" -- "),\n`[`, `\\`, `]`" -- ", `^`, `_`, `` `" -- " `` (U+005B–0060" -- "), \n`{`, `|`, " -- "`}`, or `~` (U+" -- "007B–007E).\n\n" -- "A [Unicode " +- "`:`, `;`, `<`, " +- "`=`, `>`, `?`, " +- "`@`" +- " (U+003A–0040),\n" +- "`[`, `\\`, `]`, " +- "`^`, `_`, " +- "`` ` ``" +- " (U+005B–0060)," +- " \n`{`, `|`, `}`" +- ", or `~`" +- " (U+007B–007E)." +- "\n\nA " +- "[Unicode " - "punctuation " -- "character](@) is" -- " a character in " -- "the Unicode `P`" -- "\n" +- "character](@)" +- " is a character " +- "in the Unicode " +- "`P`\n" - "(puncuation) or " -- "`S` (symbol) " +- "`S`" +- " (symbol) " - "general " - "categories.\n\n" - "## Tabs\n\n" @@ -982,8 +1012,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n" - "````````````````" - "````````````````" -- "\n\n" -- "Normally the `>`" +- "\n\nNormally the " +- "`>`" - " that begins a " - "block quote may " - "be followed\n" @@ -993,9 +1023,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "part of the\n" - "content. " - In the following -- " case `>` is " -- "followed by a " -- "tab,\n" +- " case `>`" +- " is followed by " +- "a tab,\n" - which is treated - " as if it were " - "expanded into " @@ -1076,19 +1106,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "## Insecure " +- "## " +- "Insecure " - "characters\n\n" - "For security " - "reasons, the " - "Unicode " -- "character `U+" -- "0000` must be " +- "character " +- "`U+0000`" +- " must be " - "replaced\n" - "with the " - "REPLACEMENT " -- "CHARACTER (`U+" -- "FFFD`).\n\n\n" -- "## Backslash " +- CHARACTER ( +- "`U+FFFD`).\n\n\n" +- "## " +- "Backslash " - "escapes\n\n" - "Any ASCII " - "punctuation " @@ -1191,8 +1224,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "A backslash at " - "the end of the " -- "line is a [hard " -- "line break]:\n\n" +- "line is a [" +- "hard line break]" +- ":\n\n" - "````````````````" - "````````````````" - " example\n" @@ -1267,8 +1301,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "and link titles," - "\n" - "link references," -- " and [info " -- "strings] in [" +- " and [" +- "info strings]" +- " in [" - "fenced code " - "blocks]:\n\n" - "````````````````" @@ -1306,7 +1341,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "## Entity and " +- "## " +- "Entity and " - "numeric " - "character " - "references\n\n" @@ -1324,14 +1360,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "with the " - "following " - "exceptions:\n\n" -- "- Entity and " +- "- " +- "Entity and " - "character " - "references are " - "not recognized " - "in code\n " - "blocks and code " - "spans.\n\n" -- "- Entity and " +- "- " +- "Entity and " - "character " - "references " - "cannot stand in " @@ -1346,10 +1384,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "although `*`" - " can be used\n " - "in place of a " -- "literal `*` " -- "character, `*" -- ";` cannot " -- "replace\n `*`" +- "literal `*`" +- " character, " +- "`*`" +- " cannot replace" +- "\n `*`" - " in emphasis " - "delimiters, " - "bullet list " @@ -1372,13 +1411,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - entity reference - ".\n\n" - "[Entity " -- "references](@) " -- "consist of `&` +" -- " any of the " +- "references](@)" +- " consist of `&`" +- " + any of the " - "valid\n" - "HTML5 entity " -- "names + `;`" -- ". The\ndocument " +- "names + `;`. The" +- "\ndocument " - " " -- "foo` as its " -- "literal text, " -- "you can\n" +- "heading with " +- "`> foo`" +- " as its literal " +- "text, you can\n" - "use backslash " - "escapes:\n\n" - "````````````````" @@ -2903,8 +2956,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "**Compatibility " -- "note:** Most " -- "existing " +- "note:**" +- " Most existing " - "Markdown " - "implementations\n" - do not allow the @@ -2921,21 +2974,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "One can find " - "four different " - "interpretations:" -- "\n\n" -- "1. paragraph \"" +- "\n\n1. paragraph \"" - "Foo\", heading \"" - "bar\", paragraph " - "\"baz\"\n" - "2. paragraph \"" -- "Foo bar\", " -- "thematic break, " -- "paragraph \"baz\"\n" +- "Foo bar\"" +- ", thematic break" +- ", paragraph \"baz" +- "\"\n" - "3. paragraph \"" - "Foo bar --- baz\"" -- "\n" -- "4. heading \"Foo " -- "bar\", paragraph " -- "\"baz\"\n\n" +- "\n4. heading \"" +- "Foo bar\"" +- ", paragraph \"baz" +- "\"\n\n" - "We find " - interpretation 4 - " most natural, " @@ -2986,8 +3039,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that cannot " - "count as a [" - "setext heading\n" -- "underline], such" -- " as\n\n" +- "underline]" +- ", such as\n\n" - "````````````````" - "````````````````" - " example\n" @@ -3012,19 +3065,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "## Indented code" -- " blocks\n\n" -- "An [indented " -- "code block](@) " -- "is composed of " +- "## " +- "Indented code " +- "blocks\n\n" +- "An " +- "[indented code " +- "block](@)" +- " is composed of " - "one or more\n[" - "indented chunks]" - " separated by " - "blank lines.\nAn " - "[indented chunk]" -- "(@) is a " -- sequence of non- -- "blank lines,\n" +- (@) +- " is a sequence " +- "of non-blank " +- "lines,\n" - each preceded by - " four or more " - "spaces of " @@ -3036,8 +3092,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " of the lines, " - "including " - "trailing\n[" -- "line endings], " -- "minus four " +- "line endings]" +- ", minus four " - "spaces of " - "indentation.\n" - An indented code @@ -3084,8 +3140,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "indicating that " - material belongs - " to a [list\nitem" -- "][list items], " -- "the list item " +- "][list items]" +- ", the list item " - "interpretation " - takes precedence - ":\n\n" @@ -3269,8 +3325,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Trailing spaces " - "or tabs are " - "included in the " -- "code block's " -- "content:\n\n" +- "code block'" +- "s content:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -3280,17 +3336,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n\n" -- "## Fenced code " +- "## " +- "Fenced code " - "blocks\n\n" -- "A [code fence](@" -- ") is a sequence\n" +- "A " +- "[code fence](@)" +- " is a sequence\n" - "of at least " - "three " - "consecutive " - "backtick " -- "characters (`` `" -- " ``) or\ntildes (" -- "`~`" +- characters ( +- "`` ` ``) or\n" +- "tildes (`~`" - "). " - "(Tildes and " - backticks cannot @@ -3315,10 +3373,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "leading and " - "trailing\n" - "spaces or tabs " -- "and called the [" -- "info string](@)." -- " If the [info " -- "string] comes\n" +- "and called the " +- "[info string](@)" +- ". If the [" +- "info string]" +- " comes\n" - after a backtick - " fence, it may " - "not contain any " @@ -3341,10 +3400,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "consists of all " - subsequent lines - ", until\n" -- "a closing [code " -- "fence] of the " -- same type as the -- " code block\n" +- "a closing [" +- "code fence]" +- " of the same " +- type as the code +- " block\n" - began with ( - "backticks or " - "tildes), and " @@ -3440,16 +3500,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "not parsed\n" - "as inlines. " - "The first word " -- "of the [info " -- "string] is " -- "typically used " -- "to\n" +- "of the [" +- "info string]" +- " is typically " +- "used to\n" - "specify the " - "language of the " - "code sample, and" - " rendered in the" -- " `class`" -- "\n" +- " `class`\n" - attribute of the - " `code`" - " tag. " @@ -3557,11 +3616,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "end of the " - "document\n" - "(or the " -- "enclosing [block" -- " quote][block " -- "quotes] or [list" -- " item][list " -- "items]):\n\n" +- "enclosing [" +- "block quote][" +- "block quotes]" +- " or [list item][" +- "list items]):\n\n" - "````````````````" - "````````````````" - " example\n" @@ -3789,15 +3848,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    baz

    \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "An [info string]" +- "\n\n\nAn [" +- "info string]" - " can be provided" - " after the " - "opening code " - "fence.\n" - "Although this " -- "spec doesn't " -- "mandate any " +- "spec doesn'" +- "t mandate any " - "particular " - "treatment of\n" - "the info string," @@ -3811,12 +3870,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "normally " - "indicated by " - "adding a class " -- "to the `code` " -- "element " +- "to the `code`" +- " element " - "consisting\nof " -- "`language-` " -- "followed by the " -- "language name.\n\n" +- "`language-`" +- " followed by the" +- " language name.\n" +- "\n" - "````````````````" - "````````````````" - " example\n" @@ -3857,9 +3917,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "code>
    \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "[Info strings] " -- "for backtick " +- "\n\n\n[Info strings" +- "]" +- " for backtick " - "code blocks " - "cannot contain " - "backticks:\n\n" @@ -3872,9 +3932,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "code>\nfoo

    \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "[Info strings] " -- "for tilde code " +- "\n\n\n[Info strings" +- "]" +- " for tilde code " - "blocks can " - "contain " - "backticks and " @@ -3893,8 +3953,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "Closing code " - "fences cannot " -- "have [info " -- "strings]:\n\n" +- "have [" +- "info strings]:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -3907,197 +3967,213 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n\n" - "## HTML blocks\n\n" -- "An [HTML block](" -- "@) is a group of" -- " lines that is " +- "An " +- "[HTML block](@)" +- " is a group of " +- "lines that is " - "treated\n" - as raw HTML (and - " will not be " - "escaped in HTML " - "output).\n\n" - "There are seven " -- "kinds of [HTML " -- "block], which " -- "can be defined " -- "by their\n" +- "kinds of [" +- "HTML block]" +- ", which can be " +- defined by their +- "\n" - "start and end " - "conditions. " - The block begins - " with a line " - "that meets a\n" - "[start condition" -- "](@) (after up " -- "to three " -- "optional spaces " -- of indentation). -- "\n" +- "](@)" +- " (after up to " +- "three optional " +- "spaces of " +- "indentation).\n" - It ends with the - " first " - "subsequent line " - "that meets a " - "matching\n" - "[end condition](" -- "@), or the last " +- "@)" +- ", or the last " - "line of the " - "document, or the" - " last line of\n" -- "the [container " -- "block](#" -- container-blocks -- ) containing the -- " current HTML\n" +- "the " +- "[container block" +- "](#container-" +- blocks) +- " containing the " +- "current HTML\n" - "block, if no " - "line is " - encountered that -- " meets the [end " -- "condition]. If\n" +- " meets the [" +- "end condition]" +- ". If\n" - "the first line " - "meets both the [" - "start condition]" - " and the [end\n" -- "condition], the " -- "block will " -- "contain just " +- "condition]" +- ", the block will" +- " contain just " - "that line.\n\n" - "1. " - "**Start " -- "condition:** " -- line begins with -- " the string `<" -- "pre`,\n``, or " -- "the end of the " -- "line.\\\n" +- "string `>`" +- ", or the end of " +- "the line.\\\n" - "**End condition:" -- "** line " -- "contains an end " -- "tag\n``, " -- "``, ``, or `` (case" -- "-insensitive; it" -- "\n" +- "**" +- " line contains " +- "an end tag\n" +- "``, " +- "``, " +- "``, or " +- "``" +- " (case-" +- "insensitive; it\n" - "need not match " - "the start tag).\n" - "\n2. " - "**Start " -- "condition:** " -- line begins with -- " the string ``.\n\n" +- "**" +- " line contains " +- "the string `-->`" +- ".\n\n" - "3. " - "**Start " -- "condition:** " -- line begins with -- " the string ``" -- ".\n\n" -- "4. " +- "**" +- " line contains " +- "the string `?>`." +- "\n\n4. " - "**Start " -- "condition:** " -- line begins with -- " the string ``.\n\n" +- "**" +- " line contains " +- "the character " +- "`>`.\n\n" - "5. " - "**Start " -- "condition:** " -- line begins with -- " the string\n" +- "condition:**" +- " line begins " +- "with the string\n" - "`" -- "`.\n\n" +- "**" +- " line contains " +- "the string `]]>`" +- ".\n\n" - "6. " - "**Start " -- "condition:** " -- line begins with -- " the string `<` " -- "or ``" -- ", or\nthe string " -- "`/>`.\\\n" +- "string `>`, or\n" +- "the string `/>`." +- "\\\n" - "**End condition:" -- "** line is " +- "**" +- " line is " - "followed by a [" - "blank line].\n\n" - "7. " - "**Start " -- "condition:** " -- line begins with -- " a complete [" -- "open tag]\n" -- "(with any [tag " -- "name] other than" -- " `pre`, `script`" -- ",\n`style`, or " -- "`textarea`) or a" -- " complete [" -- "closing tag],\n" +- "condition:**" +- " line begins " +- "with a complete " +- "[open tag]\n" +- "(with any [" +- "tag name]" +- " other than " +- "`pre`, `script`," +- "\n`style`, or " +- "`textarea`" +- ") or a complete " +- "[closing tag],\n" - followed by zero - " or more spaces " - "and tabs, " @@ -4105,7 +4181,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - end of the line. - "\\\n" - "**End condition:" -- "** line is " +- "**" +- " line is " - "followed by a [" - "blank line].\n\n" - "HTML blocks " @@ -4113,17 +4190,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "they are closed " - "by their " - "appropriate\n[" -- "end condition], " -- or the last line -- " of the document" -- " or other [" -- "container\nblock" +- "end condition]" +- ", or the last " +- "line of the " +- "document or " +- "other " +- "[container\nblock" - "](#container-" - blocks) - ". " - "This means any " -- HTML **within an -- " HTML\nblock**" +- "HTML " +- "**within an HTML" +- "\nblock**" - " that might " - "otherwise be " - "recognised as a " @@ -4134,14 +4213,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "passed through " - "as-is, without " - "changing\n" -- "the parser's " -- "state.\n\n" -- "For instance, `<" -- "pre>` within an " -- "HTML block " -- "started by `<" -- "table>` will not" -- " affect\n" +- "the parser'" +- "s state.\n\n" +- "For instance, " +- "`
    `"
    +- " within an HTML "
    +- block started by
    +- " `
    `" +- " will not affect" +- "\n" - the parser state - "; as the HTML " - "block was " @@ -4175,21 +4255,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the HTML block " - is terminated by - " the blank line " -- "— the `**Hello**" -- "`\n" +- "— the " +- "`**Hello**`\n" - "text remains " - "verbatim — and " - "regular parsing " - "resumes, with a " - "paragraph,\n" -- "emphasised `" -- "world` and " -- inline and block -- " HTML following." -- "\n\n" +- "emphasised " +- "`world`" +- " and inline and " +- "block HTML " +- "following.\n\n" - "All types of [" -- "HTML blocks] " -- "except type 7 " +- "HTML blocks]" +- " except type 7 " - "may interrupt\n" - "a paragraph. " - Blocks of type 7 @@ -4345,9 +4425,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "The initial tag " -- "doesn't even " -- "need to be a " -- "valid\n" +- "doesn'" +- "t even need to " +- "be a valid\n" - "tag, as long as " - "it starts like " - "one:\n\n" @@ -4423,11 +4503,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "int x = 33;\n```\n" - "````````````````" - "````````````````" -- "\n\n\n" -- "To start an [" -- "HTML block] with" -- " a tag that is *" -- "not* in the\n" +- "\n\n\nTo start an [" +- "HTML block]" +- " with a tag that" +- " is *not* in the" +- "\n" - list of block- - level tags in (6 - "), you must put " @@ -4486,20 +4566,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - either block- - level or inline- - "level tags.\nThe " -- "`` tag is a" -- " nice example. " +- "``" +- " tag is a nice " +- "example. " - "We can surround " - "content with\n" -- "`` tags in " -- "three different " -- "ways. " -- "In this case, we" -- " get a raw\n" +- "``" +- " tags in three " +- different ways. +- " In this case, " +- "we get a raw\n" - "HTML block, " -- "because the `<" -- "del>` tag is on " -- a line by itself -- ":\n\n" +- "because the " +- "``" +- " tag is on a " +- "line by itself:\n" +- "\n" - "````````````````" - "````````````````" - " example\n" @@ -4513,9 +4595,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " get a raw HTML " - "block that just " - "includes\nthe " -- "`` tag (" -- "because it ends " -- "with the " +- "``" +- " tag (because it" +- " ends with the " - "following blank\n" - "line). " - "So the contents " @@ -4532,12 +4614,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "Finally, in this" -- " case, the `` tags are " +- " case, the " +- "``" +- " tags are " - "interpreted\nas [" -- "raw HTML] *" -- "inside* the " -- "CommonMark " +- "raw HTML] " +- "*inside*" +- " the CommonMark " - "paragraph. " - "(Because\n" - "the tag is not " @@ -4560,9 +4643,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "designed to " - "contain literal " - "content\n(`pre`, " -- "`script`, `style" -- "`, `textarea`), " -- "comments, " +- "`script`, " +- "`style`, " +- "`textarea`" +- "), comments, " - "processing " - "instructions,\n" - and declarations @@ -4868,10 +4952,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - An HTML block of -- " types 1--6 can " -- "interrupt a " -- "paragraph, and " -- "need not be\n" +- " types 1--" +- "6 can interrupt " +- "a paragraph, and" +- " need not be\n" - "preceded by a " - "blank line.\n\n" - "````````````````" @@ -4922,22 +5006,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "This rule " - "differs from " -- "John Gruber's " -- "original " +- "John Gruber'" +- "s original " - "Markdown syntax\n" - "specification, " - "which says:\n\n" -- "> The only " +- "> " +- "The only " - restrictions are - " that block-" - "level HTML " - "elements —\n> " -- "e.g. `
    `, `<" -- "table>`, `
    `"
    -- ", `

    `, etc. — " -- "must be " -- "separated from\n" -- "> surrounding " +- "e.g. `

    `, " +- "`
    `, " +- "`
    `, `

    `" +- ", etc. — must be" +- " separated from\n" +- "> " +- "surrounding " - content by blank - " lines, and the " - "start and end " @@ -4947,21 +5033,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "with spaces or " - "tabs.\n\n" - "In some ways " -- "Gruber's rule is" -- " more " +- "Gruber'" +- "s rule is more " - restrictive than - " the one given\n" - "here:\n\n" -- "- It requires " -- "that an HTML " -- "block be " -- "preceded by a " -- "blank line.\n" -- "- It does not " +- "- " +- It requires that +- " an HTML block " +- be preceded by a +- " blank line.\n" +- "- " +- "It does not " - "allow the start " - "tag to be " - "indented.\n" -- "- It requires a " +- "- " +- "It requires a " - matching end tag - ", which it also " - "does not allow " @@ -4970,16 +5058,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Most Markdown " - "implementations " - "(including some " -- "of Gruber's own)" -- " do not\n" +- "of Gruber'" +- "s own) do not\n" - "respect all of " - "these " - "restrictions.\n\n" - "There is one " - "respect, however" - ", in which " -- "Gruber's rule is" -- " more liberal\n" +- "Gruber'" +- "s rule is more " +- "liberal\n" - "than the one " - "given here, " - "since it allows " @@ -5076,8 +5165,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "documents with " - 100% reliability - ". However,\n" -- "*in most cases* " -- "this will work " +- "*in most cases*" +- " this will work " - "fine, because " - "the blank lines " - "in\n" @@ -5134,54 +5223,59 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "can be\n" - "deleted. " - The exception is -- " inside `

    ` "
    -- "tags, but as "
    +- " inside `
    `"
    +- " tags, but as "
     - "described\n[above"
    -- "][HTML blocks], "
    -- "raw HTML blocks "
    -- "starting with `<"
    -- "pre>`\n*can*"
    +- "][HTML blocks]"
    +- ", raw HTML "
    +- "blocks starting "
    +- "with `
    `\n"
    +- "*can*"
     - " contain blank "
     - "lines.\n\n"
    -- "## Link "
    -- "reference "
    +- "## "
    +- "Link reference "
     - "definitions\n\n"
    -- "A [link "
    -- "reference "
    +- "A "
    +- "[link reference "
     - "definition](@)\n"
     - "consists of a ["
    -- "link label], "
    -- "optionally "
    +- "link label]"
    +- ", optionally "
     - "preceded by up "
     - "to three spaces "
     - "of\n"
     - "indentation, "
     - "followed\n"
    -- "by a colon (`:`)"
    -- ", optional "
    +- "by a colon (`:`"
    +- "), optional "
     - spaces or tabs (
     - "including up to "
     - "one\n[line ending"
    -- "]), a [link "
    -- "destination],\n"
    +- "]), a ["
    +- link destination
    +- "],\n"
     - "optional spaces "
     - or tabs (
     - "including up to "
     - "one\n[line ending"
    -- "]), and an "
    +- "]"
    +- "), and an "
     - "optional [link\n"
    -- "title], which if"
    -- " it is present "
    -- "must be "
    -- "separated\n"
    -- "from the [link "
    -- "destination] by "
    -- "spaces or tabs.\n"
    +- "title]"
    +- ", which if it is"
    +- " present must be"
    +- " separated\n"
    +- "from the ["
    +- link destination
    +- "]"
    +- " by spaces or "
    +- "tabs.\n"
     - "No further "
     - "character may "
     - "occur.\n\n"
    -- "A [link "
    -- "reference "
    +- "A ["
    +- "link reference "
     - "definition]\n"
     - "does not "
     - "correspond to a "
    @@ -5195,15 +5289,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "reference links]"
     - "\n"
     - and reference-
    -- "style [images] "
    -- elsewhere in the
    -- " document.  ["
    +- "style [images]"
    +- " elsewhere in "
    +- "the document.  ["
     - "Link\n"
     - "reference "
    -- "definitions] can"
    -- " come either "
    -- "before or after "
    -- "the links that "
    +- "definitions]"
    +- " can come either"
    +- " before or after"
    +- " the links that "
     - "use\nthem.\n\n"
     - "````````````````"
     - "````````````````"
    @@ -5320,8 +5414,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "

    [foo]

    \n" - "````````````````" - "````````````````" -- "\n\n" -- " However, an " +- "\n\n " +- "However, an " - "empty link " - "destination may " - "be specified " @@ -5410,11 +5504,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "As noted in the " - "section on [" -- "Links], matching" -- " of labels is\n" +- "Links]" +- ", matching of " +- "labels is\n" - case-insensitive -- " (see [matches])" -- ".\n\n" +- " (see [matches]" +- ").\n\n" - "````````````````" - "````````````````" - " example\n" @@ -5545,12 +5640,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    [foo]

    \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "A [link " -- "reference " -- "definition] " -- cannot interrupt -- " a paragraph.\n\n" +- "\n\n\nA [" +- "link reference " +- "definition]" +- " cannot " +- "interrupt a " +- "paragraph.\n\n" - "````````````````" - "````````````````" - " example\n" @@ -5605,9 +5700,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo

    \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "Several [link " -- "reference " +- "\n\n\nSeveral [" +- "link reference " - "definitions]\n" - "can occur one " - "after another, " @@ -5635,10 +5729,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````" -- "\n\n\n" -- "[Link reference " -- "definitions] can" -- " occur\n" +- "\n\n\n[" +- "Link reference " +- "definitions]" +- " can occur\n" - "inside block " - "containers, like" - " lists and block" @@ -5669,17 +5763,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "interpreted as " - "other\n" - "kinds of blocks " -- "forms a [" -- "paragraph](@).\n" +- "forms a " +- "[paragraph](@).\n" - "The contents of " - "the paragraph " - "are the result " - "of parsing the\n" -- "paragraph's raw " -- "content as " -- "inlines. " -- "The paragraph's " -- "raw content\n" +- "paragraph'" +- s raw content as +- " inlines. " +- "The paragraph'" +- "s raw content\n" - "is formed by " - "concatenating " - "the lines and " @@ -5794,8 +5888,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that ends with " - "two or more " - "spaces will not " -- "end with a [hard" -- " line\nbreak]:\n\n" +- "end with a [" +- "hard line\nbreak]" +- ":\n\n" - "````````````````" - "````````````````" - " example\n" @@ -5807,8 +5902,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "## Blank lines\n\n" -- "[Blank lines] " -- between block- +- "[Blank lines]" +- " between block-" - "level elements " - "are ignored,\n" - "except for the " @@ -5832,13 +5927,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n\n" -- "# Container " -- "blocks\n\n" -- "A [container " -- "block](#" -- container-blocks -- ") is a block " -- "that has other\n" +- "# " +- Container blocks +- "\n\nA " +- "[container block" +- "](#container-" +- blocks) +- " is a block that" +- " has other\n" - "blocks as its " - "contents. " - "There are two " @@ -5857,7 +5953,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The general\n" - "form of the " - "definition is:\n\n" -- "> If X is a " +- "> " +- "If X is a " - "sequence of " - "blocks, then the" - " result of\n> " @@ -5875,8 +5972,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "list item by " - "explaining\n" - how these can be -- " *generated* " -- "from their " +- " *generated*" +- " from their " - "contents. " - "This should " - "suffice\n" @@ -5884,8 +5981,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "syntax, although" - " it does not " - "give a recipe " -- for *parsing* -- "\n" +- "for *parsing*\n" - "these " - "constructions. " - "(A recipe is " @@ -5898,73 +5994,80 @@ input_file: tests/inputs/markdown/commonmark_spec.md - parsing-strategy - ").)\n\n" - "## Block quotes" -- "\n\n" -- "A [block quote " +- "\n\nA " +- "[block quote " - "marker](@),\n" - "optionally " - "preceded by up " - "to three spaces " - "of indentation,\n" - "consists of (a) " -- "the character `>" -- "` together with " -- "a following " -- "space of\n" +- "the character " +- "`>`" +- " together with a" +- " following space" +- " of\n" - "indentation, or " - "(b) a single " -- "character `>` " -- "not followed by " -- "a space of\n" +- "character `>`" +- " not followed by" +- " a space of\n" - "indentation.\n\n" - "The following " - "rules define [" - "block quotes]:\n\n" - "1. " -- "**Basic case." -- "** If a string " -- "of lines *Ls* " -- "constitute a " +- "**Basic case.**" +- " If a string of" +- " lines *Ls*" +- " constitute a " - "sequence\n " -- "of blocks *Bs*, " -- "then the result " -- "of prepending a " -- "[block quote" -- "\n marker]" +- of blocks *Bs* +- ", then the " +- "result of " +- "prepending a [" +- "block quote\n " +- "marker]" - " to the " - "beginning of " -- each line in *Ls -- "*\n is a " +- "each line in " +- "*Ls*\n is a " - "[block quote](#" -- "block-quotes) " -- containing *Bs*. -- "\n\n2. " -- "**Laziness." -- "** If a string " -- "of lines *Ls* " -- "constitute a [" -- "block\n quote" +- block-quotes) +- " containing *Bs*" +- ".\n\n" +- "2. " +- "**Laziness.**" +- " If a string of" +- " lines *Ls*" +- " constitute a " +- "[block\n quote" - "](#block-quotes)" -- " with contents *" -- "Bs*, then the " +- " with contents " +- "*Bs*" +- ", then the " - "result of " - "deleting\n " - "the initial [" - "block quote " -- "marker] from one" -- " or\n " +- "marker]" +- " from one or" +- "\n " - "more lines in " - "which the next " - "character other " - "than a space or " - "tab after the\n" -- " [block quote" -- " marker] is [" +- " [" +- "block quote " +- "marker] is [" - "paragraph " - "continuation\n" -- " text] is a " -- block quote with -- " *Bs* as its " -- "content.\n " +- " text]" +- " is a block " +- quote with *Bs* +- " as its content." +- "\n " - "[Paragraph " - "continuation " - "text](@) is text" @@ -5980,16 +6083,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "3. " - "**" - Consecutiveness. -- "** A document " +- "**" +- " A document " - "cannot contain " - "two [block\n " -- "quotes] in a row" -- " unless there is" -- " a [blank line] " -- "between them.\n\n" +- "quotes]" +- " in a row unless" +- " there is a [" +- "blank line]" +- " between them.\n\n" - "Nothing else " -- "counts as a [" -- "block quote](#" +- "counts as a " +- "[block quote](#" - "block-quotes).\n\n" - Here is a simple - " example:\n\n" @@ -6006,8 +6111,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - The space or tab -- " after the `>` " -- "characters can " +- " after the `>`" +- " characters can " - "be omitted:\n\n" - "````````````````" - "````````````````" @@ -6020,9 +6125,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````" -- "\n\n\n" -- "The `>` " -- "characters can " +- "\n\n\nThe `>`" +- " characters can " - "be preceded by " - "up to three " - "spaces of " @@ -6103,7 +6207,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "block quote " - "markers].\n" - "For example, the" -- " `> ` cannot be " +- " `> `" +- " cannot be " - "omitted in the " - "second line of\n\n" - "``` markdown\n" @@ -6123,8 +6228,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "Similarly, if we" -- " omit the `> ` " -- "in the second " +- " omit the `> `" +- " in the second " - "line of\n\n" - "``` markdown\n" - "> - foo\n> - bar\n" @@ -6149,9 +6254,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "For the same " -- "reason, we can't" -- " omit the `> ` " -- "in front of\n" +- "reason, we can'" +- "t omit the `> `" +- " in front of\n" - subsequent lines - " of an indented " - "or fenced code " @@ -6206,11 +6311,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```markdown\n" - "> foo\n" - "> - bar\n```\n" -- "\n" -- "the `- bar` is " -- indented too far -- " to start a list" -- ", and can't\n" +- "\nthe `- bar`" +- " is indented too" +- " far to start a " +- "list, and can't\n" - "be an indented " - "code block " - because indented @@ -6218,8 +6322,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "cannot\n" - "interrupt " - "paragraphs, so " -- "it is [paragraph" -- " continuation " +- "it is [" +- "paragraph " +- "continuation " - "text].\n\n" - "A block quote " - "can be empty:\n\n" @@ -6276,11 +6381,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "implementations," - " including John " - "Gruber's\n" -- "original `" -- "Markdown.pl`, " -- "will parse this " -- "example as a " -- "single block " +- "original " +- "`Markdown.pl`" +- ", will parse " +- "this example as " +- "a single block " - "quote\n" - "with two " - "paragraphs. " @@ -6401,9 +6506,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the Laziness " - "rule that any " - "number\n" -- "of initial `>`s " -- "may be omitted " -- "on a " +- "of initial `>`" +- s may be omitted +- " on a " - "continuation " - "line of a\n" - "nested block " @@ -6443,12 +6548,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " block in a " - "block quote,\n" - "remember that " -- "the [block quote" -- " marker] " -- "includes\n" -- "both the `>` and" -- " a following " -- "space of " +- "the [" +- "block quote " +- "marker] includes" +- "\nboth the `>`" +- " and a following" +- " space of " - "indentation. " - So *five spaces* - " are needed\n" @@ -6469,22 +6574,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n\n" - "## List items\n\n" -- "A [list marker](" -- "@) is a\n[" +- "A " +- "[list marker](@)" +- " is a\n[" - "bullet list " - "marker] or an [" - "ordered list " - "marker].\n\n" -- "A [bullet list " +- "A " +- "[bullet list " - "marker](@)\nis a " - "`-`, `+`, or `*`" - " character.\n\n" -- "An [ordered list" -- " marker](@)\n" +- "An " +- "[ordered list " +- "marker](@)\n" - is a sequence of -- " 1--9 arabic " -- "digits (`0-9`), " -- "followed by " +- " 1--" +- "9 arabic digits " +- "(`0-9`" +- "), followed by " - "either a\n`.`" - " character or a " - "`)`" @@ -6502,38 +6611,41 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "rules define [" - "list items]:\n\n" - "1. " -- "**Basic case." -- "** If a " -- "sequence of " -- "lines *Ls* " -- "constitute a " +- "**Basic case.**" +- " If a sequence " +- of lines *Ls* +- " constitute a " - "sequence of\n " -- "blocks *Bs* " -- "starting with a " -- "character other " -- "than a space or " -- "tab, and *M*" -- " is\n " +- blocks *Bs* +- " starting with a" +- " character other" +- " than a space or" +- " tab, and *M* is" +- "\n " - a list marker of -- " width *W* " -- "followed by 1 ≤ " -- "*N* ≤ 4 spaces " -- "of indentation,\n" -- " then the " -- "result of " -- "prepending *M* " -- "and the " +- " width *W*" +- " followed by 1 ≤" +- " *N*" +- " ≤ 4 spaces of " +- "indentation,\n" +- " " +- "then the result " +- "of prepending " +- "*M*" +- " and the " - following spaces - " to the first " - "line\n of *Ls*" - ", and indenting " - subsequent lines -- " of *Ls* by *W +" -- " N* spaces, is a" +- " of *Ls* by " +- "*W + N*" +- " spaces, is a" - "\n " -- list item with * -- "Bs* as its " -- "contents. " +- "list item with " +- "*Bs*" +- " as its contents" +- ". " - "The type of the " - "list item\n " - "(bullet or " @@ -6541,11 +6653,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "determined by " - "the type of its " - "list marker.\n" -- " If the list " -- "item is ordered," -- " then it is also" -- " assigned a " -- "start\n " +- " " +- If the list item +- " is ordered, " +- "then it is also " +- assigned a start +- "\n " - "number, based on" - " the ordered " - "list marker.\n\n" @@ -6559,14 +6672,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that is, when it" - " starts on a " - "line that would\n" -- " otherwise" -- " count as [" +- " " +- "otherwise count " +- "as [" - "paragraph " - "continuation " - "text]---then (a)" - "\n " -- "the lines *Ls* " -- "must not begin " +- the lines *Ls* +- " must not begin " - "with a blank " - "line, and (b) if" - "\n " @@ -6577,14 +6691,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "2. " - If any line is a - " [thematic break" -- "][thematic " -- "breaks] then" -- "\n " +- "][" +- "thematic breaks]" +- " then\n " - that line is not - " a list item.\n\n" - "For example, let" -- " *Ls* be the " -- "lines\n\n" +- " *Ls*" +- " be the lines\n\n" - "````````````````" - "````````````````" - " example\n" @@ -6607,10 +6721,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````" -- "\n\n\n" -- "And let *M* be " -- "the marker `1.`," -- " and *N*" +- "\n\n\nAnd let *M*" +- " be the marker " +- "`1.`, and *N*" - " = 2. " - "Then rule #1 " - "says\n" @@ -6781,13 +6894,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````" - "````````````````" -- "\n\n\n" -- "Here `two` " -- "occurs in the " +- "\n\n\nHere `two`" +- " occurs in the " - "same column as " - "the list marker " -- "`1.`" -- ",\n" +- "`1.`,\n" - "but is actually " - contained in the - " list item, " @@ -6803,14 +6914,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "also possible. " - In the following - " example, the " -- "word `two`" -- "\n" +- "word `two`\n" - "occurs far to " - the right of the - " initial text of" - " the list item, " -- "`one`" -- ", but\n" +- "`one`, but\n" - "it is not " - "considered part " - of the list item @@ -6988,36 +7097,39 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n " - "constitute a " - "sequence of " -- "blocks *Bs* " -- starting with an -- " indented code\n" -- " block, and *" -- "M* is a list " +- blocks *Bs* +- " starting with " +- an indented code +- "\n block, and " +- "*M*" +- " is a list " - "marker of width " -- "*W*" -- " followed by" +- "*W* followed by" - "\n " - "one space of " - "indentation, " - "then the result " -- of prepending *M -- "* and the\n " +- "of prepending " +- "*M* and the\n " - "following space " - "to the first " -- "line of *Ls*, " -- "and indenting " +- line of *Ls* +- ", and indenting " - subsequent lines - "\n of *Ls* by " -- "*W + 1* spaces, " -- "is a list item " -- with *Bs* as its -- " contents.\n " +- "*W + 1*" +- " spaces, is a " +- "list item with " +- "*Bs*" +- " as its contents" +- ".\n " - "If a line is " - "empty, then it " - "need not be " - "indented. " - "The type of the\n" -- " list item (" +- " " +- list item ( - "bullet or " - "ordered) is " - "determined by " @@ -7074,17 +7186,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n" - "````````````````" - "````````````````" -- "\n\n\n" -- "If the *first* " -- "block in the " +- "\n\n\nIf the " +- "*first*" +- " block in the " - "list item is an " - "indented code " - "block,\n" - "then by rule #2," - " the contents " - must be preceded -- " by *one* space " -- "of indentation\n" +- " by *one*" +- " space of " +- "indentation\n" - "after the list " - "marker:\n\n" - "````````````````" @@ -7241,36 +7354,41 @@ input_file: tests/inputs/markdown/commonmark_spec.md - of lines *Ls* - "\n " - "starting with a " -- "single [blank " -- "line] constitute" -- " a (possibly " -- "empty)\n " +- "single [" +- "blank line]" +- " constitute a (" +- "possibly empty)\n" +- " " - "sequence of " -- "blocks *Bs*, and" -- " *M* is a list " +- blocks *Bs* +- ", and *M*" +- " is a list " - "marker of width " -- "*W*" -- ",\n " +- "*W*,\n " - "then the result " -- of prepending *M -- "* to the first " -- "line of *Ls*, " -- "and\n " +- "of prepending " +- "*M*" +- " to the first " +- line of *Ls* +- ", and\n " - "preceding " - subsequent lines -- " of *Ls* by *W +" -- " 1* spaces of " +- " of *Ls* by " +- "*W + 1*" +- " spaces of " - "indentation, is " - "a\n " -- list item with * -- "Bs* as its " -- "contents.\n " +- "list item with " +- "*Bs*" +- " as its contents" +- ".\n " - "If a line is " - "empty, then it " - "need not be " - "indented. " - "The type of the\n" -- " list item (" +- " " +- list item ( - "bullet or " - "ordered) is " - "determined by " @@ -7315,8 +7433,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "spaces\n" - "following the " - "list marker " -- "doesn't change " -- "the required " +- "doesn'" +- "t change the " +- "required " - "indentation:\n\n" - "````````````````" - "````````````````" @@ -7333,8 +7452,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "most one blank " - "line.\n" - In the following -- " example, `foo` " -- "is not part of " +- " example, `foo`" +- " is not part of " - "the list\nitem:\n\n" - "````````````````" - "````````````````" @@ -7420,8 +7539,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n4. " - "**Indentation.**" - " If a sequence " -- "of lines *Ls* " -- "constitutes a " +- of lines *Ls* +- " constitutes a " - "list item\n " - "according to " - "rule #1, #2, or " @@ -7440,8 +7559,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "contents and " - "attributes. " - "If a line is\n" -- " empty, then " -- "it need not be " +- " " +- "empty, then it " +- "need not be " - "indented.\n\n" - "Indented one " - "space:\n\n" @@ -7555,12 +7675,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n\n5. " - "**Laziness.**" - " If a string of" -- " lines *Ls* " -- "constitute a [" -- "list\n item" -- "](#list-items) " -- with contents * -- "Bs*, then the " +- " lines *Ls*" +- " constitute a " +- "[list\n item" +- "](#list-items)" +- " with contents " +- "*Bs*" +- ", then the " - "result of " - "deleting\n " - "some or all of " @@ -7573,7 +7694,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "space or tab " - "after the " - "indentation is\n" -- " [paragraph " +- " [" +- "paragraph " - "continuation " - "text] is a\n " - "list item with " @@ -7581,8 +7703,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "contents and " - "attributes. " - "The unindented\n" -- " lines are " -- "called\n " +- " " +- lines are called +- "\n " - "[lazy " - "continuation " - "line](@)s.\n\n" @@ -7680,9 +7803,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " not counted as " - "a list item by " - "rules\n #1--" -- "5 counts as a [" -- "list item](#list" -- "-items).\n\n" +- "5 counts as a " +- "[list item](#" +- "list-items).\n\n" - "The rules for " - "sublists follow " - from the general @@ -7815,13 +7938,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "### Motivation\n\n" -- "John Gruber's " -- "Markdown spec " +- "John Gruber'" +- "s Markdown spec " - "says the " - "following about " - "list items:\n\n" - "1. " -- "\"List markers " +- "\"" +- "List markers " - "typically start " - "at the left " - "margin, but may " @@ -7831,31 +7955,36 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "List markers " - must be followed - " by one or more\n" -- " spaces or a " -- "tab.\"\n\n" +- " " +- spaces or a tab. +- "\"\n\n" - "2. " -- "\"To make lists " +- "\"" +- "To make lists " - "look nice, you " - "can wrap items " - "with hanging " - "indents....\n " -- "But if you don't" -- " want to, you " +- "But if you don'" +- "t want to, you " - "don't have to.\"" - "\n\n3. " -- "\"List items may " +- "\"" +- "List items may " - "consist of " - "multiple " - "paragraphs. " - "Each subsequent\n" -- " paragraph in " -- a list item must -- " be indented by " +- " " +- "paragraph in a " +- "list item must " +- "be indented by " - "either 4 spaces " - "or one\n tab.\"" - "\n\n4. " -- "\"It looks nice " -- "if you indent " +- "\"" +- It looks nice if +- " you indent " - "every line of " - "the subsequent " - "paragraphs,\n " @@ -7864,7 +7993,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "allow you to be " - "lazy.\"\n\n" - "5. " -- "\"To put a " +- "\"" +- "To put a " - "blockquote " - "within a list " - "item, the " @@ -7873,7 +8003,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "delimiters need " - "to be indented.\"" - "\n\n6. " -- "\"To put a code " +- "\"" +- "To put a code " - "block within a " - "list item, the " - code block needs @@ -7934,16 +8065,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "and principled, " - "and if the " - "reference\n" -- "implementation `" -- "Markdown.pl` had" -- " followed it, it" -- " probably would " -- "have\n" +- "implementation " +- "`Markdown.pl`" +- " had followed it" +- ", it probably " +- "would have\n" - "become the " - "standard. " -- "However, `" -- "Markdown.pl` " -- "allowed " +- "However, " +- "`Markdown.pl`" +- " allowed " - "paragraphs and\n" - "sublists to " - "start with only " @@ -7980,7 +8111,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "for example, " - "stuck with " -- "Gruber's syntax " +- "Gruber'" +- "s syntax " - "description and " - "the four-space\n" - "rule, while " @@ -7989,8 +8121,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "marked, PHP " - "Markdown, and " - "others\nfollowed " -- "`Markdown.pl`'s " -- "behavior more " +- "`Markdown.pl`'" +- "s behavior more " - "closely.)\n\n" - "Unfortunately, " - "given the " @@ -8015,9 +8147,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " four-space rule" - " or\n" - "the more " -- "forgiving `" -- "Markdown.pl` " -- "behavior, " +- "forgiving " +- "`Markdown.pl`" +- " behavior, " - "provided they " - "are laid out\n" - in a way that is @@ -8078,7 +8210,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "with an " - "intervening " - "paragraph,\n\n" -- "``` html\n
      \n" +- "``` html\n" +- "
        \n" - "
      • foo
      • \n" - "
      \n" - "

      bar

      \n
        \n" @@ -8089,8 +8222,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "demands, rather " - "than a single " - "list,\n\n" -- "``` html\n
          \n" -- "
        • \n

          foo

          \n" +- "``` html\n" +- "
            \n
          • \n" +- "

            foo

            \n" - "

            bar

            \n
              \n" - "
            • baz
            • \n" - "
            \n
          • \n" @@ -8119,24 +8253,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the initial list - " marker, allows " - "text that is " -- "indented *less " -- "than* the\n" +- "indented " +- "*less than* the\n" - "original list " - "marker to be " - "included in the " - "list item. " - "For example,\n" -- "`Markdown.pl` " -- "parses\n\n" +- "`Markdown.pl`" +- " parses\n\n" - "``` markdown\n" - " - one\n\n two\n" - "```\n\n" - as a single list -- " item, with `two" -- "` a continuation" -- " paragraph:\n\n" -- "``` html\n
              \n" -- "
            • \n

              one

              \n" +- " item, with " +- "`two`" +- " a continuation " +- "paragraph:\n\n" +- "``` html\n" +- "
                \n
              • \n" +- "

                one

                \n" - "

                two

                \n" - "
              • \n
              \n```\n" - "\nand similarly\n" @@ -8180,8 +8316,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "with a " - "subparagraph, " - "even though the " -- "paragraph `bar`" -- "\n" +- "paragraph `bar`\n" - "is not indented " - "as far as the " - "first paragraph " @@ -8192,8 +8327,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Arguably this " - "text does read " - like a list item -- " with `bar` as a" -- " subparagraph,\n" +- " with `bar`" +- " as a " +- "subparagraph,\n" - "which may count " - "in favor of the " - "proposal. " @@ -8222,18 +8358,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "parse this text " - "as expected, " - "since the code " -- "block's " -- "indentation is " -- "measured\n" +- "block'" +- s indentation is +- " measured\n" - "from the " -- "beginning of `" -- "foo`.\n\n" +- "beginning of " +- "`foo`.\n\n" - "The one case " - "that needs " - "special " - "treatment is a " -- list item that * -- "starts*\n" +- "list item that " +- "*starts*\n" - "with indented " - "code. " - "How much " @@ -8242,8 +8378,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " case, since\n" - "we don't have a " - "\"first paragraph" -- "\" to measure " -- "from? " +- "\"" +- " to measure from" +- "? " - "Rule #2 simply " - "stipulates\n" - "that in such " @@ -8270,24 +8407,27 @@ input_file: tests/inputs/markdown/commonmark_spec.md - diverge in other - " cases.\n\n" - "## Lists\n\n" -- "A [list](@) is a" -- " sequence of one" -- " or more\n" -- "list items [of " -- "the same type]." -- " The list items" -- "\n" +- "A [list](@)" +- " is a sequence " +- "of one or more\n" +- "list items [" +- of the same type +- "]" +- ". " +- "The list items\n" - may be separated - " by any number " - "of blank lines.\n" - "\n" - "Two list items " -- "are [of the same" -- " type](@)\n" +- "are " +- "[of the same " +- "type](@)\n" - "if they begin " -- "with a [list " -- "marker] of the " -- "same type.\n" +- "with a [" +- "list marker]" +- " of the same " +- "type.\n" - Two list markers - " are of the\n" - same type if (a) @@ -8295,17 +8435,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " list markers " - "using the same " - "character\n(`-`, " -- "`+`, or `*`) or " -- "(b) they are " -- "ordered list " -- numbers with the -- " same\n" +- "`+`, or `*`" +- ") or (b) they " +- are ordered list +- " numbers with " +- "the same\n" - delimiter ( -- "either `.` or `)" -- "`).\n\n" -- "A list is an [" -- "ordered list](@)" -- "\n" +- "either `.` or " +- "`)`).\n\n" +- "A list is an " +- "[ordered list](@" +- ")\n" - "if its " - constituent list - " items begin " @@ -8317,24 +8457,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - constituent list - "\n" - items begin with -- " [bullet list " +- " [" +- "bullet list " - "markers].\n\n" -- "The [start " -- "number](@)\n" -- "of an [ordered " -- "list] is " -- "determined by " -- "the list number " -- "of\n" +- "The " +- "[start number](@" +- ")\nof an [" +- "ordered list]" +- " is determined " +- "by the list " +- "number of\n" - its initial list - " item. " - "The numbers of " - "subsequent list " - "items are\n" - "disregarded.\n\n" -- "A list is [loose" -- "](@) if any of " -- "its constituent\n" +- "A list is " +- "[loose](@)" +- " if any of its " +- "constituent\n" - "list items are " - "separated by " - "blank lines, or " @@ -8347,8 +8489,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " blank line\n" - "between them. " - Otherwise a list -- " is [tight](@)" -- ".\n" +- " is [tight](@).\n" - "(The difference " - "in HTML output " - "is that " @@ -8413,9 +8554,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
            \n" - "````````````````" - "````````````````" -- "\n\n" -- "`Markdown.pl` " -- "does not allow " +- "\n\n`Markdown.pl`" +- " does not allow " - "this, through " - "fear of " - "triggering a " @@ -8429,10 +8569,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "house is\n14. " - "The number of " - "doors is 6.\n```\n" -- "\n" -- "Oddly, though, `" -- "Markdown.pl` *" -- "does* allow a " +- "\nOddly, though, " +- "`Markdown.pl` " +- "*does*" +- " allow a " - "blockquote to\n" - "interrupt a " - "paragraph, even " @@ -8460,7 +8600,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```\n\n" - "Second, we are " - "attracted to a\n\n" -- "> [principle of " +- "> " +- "[principle of " - "uniformity](@):" - "\n> " - "if a chunk of " @@ -8476,10 +8617,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "item or " - "blockquote).\n\n" - "(Indeed, the " -- "spec for [list " -- "items] and [" -- "block quotes] " -- "presupposes\n" +- "spec for [" +- "list items] and " +- "[block quotes]" +- " presupposes\n" - this principle.) - " This principle " - "implies that if\n" @@ -8501,10 +8642,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "though the " - "paragraph\n" - "may be rendered " -- "without `

            ` " -- "tags, since the " -- "list is \"tight\")" -- ",\nthen\n\n" +- "without `

            `" +- " tags, since the" +- " list is \"tight\"" +- "),\nthen\n\n" - "``` markdown\n" - "I need to buy\n" - "- new shoes\n" @@ -8526,12 +8667,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "inside list " - "items, the [" - "principle of\n" -- "uniformity] " -- "requires us to " +- "uniformity]" +- " requires us to " - "allow this " - "outside list " - "items as\n" -- "well. ([" +- well. ( +- "[" - reStructuredText - "](https://" - docutils.sourcef @@ -8555,8 +8697,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "hard-wrapped " - "numerals, we " - allow only lists -- " starting with `" -- "1` to\n" +- " starting with " +- "`1` to\n" - "interrupt " - "paragraphs. " - "Thus,\n\n" @@ -8740,9 +8882,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " than\n" - "three spaces of " - "indentation. " -- "Here `- e` is " -- "treated as a " -- "paragraph " +- "Here `- e`" +- " is treated as a" +- " paragraph " - "continuation\n" - "line, because it" - " is indented " @@ -8761,8 +8903,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

          \n" - "````````````````" - "````````````````" -- "\n\n" -- "And here, `3. c`" +- "\n\nAnd here, " +- "`3. c`" - " is treated as " - in indented code - " block,\n" @@ -9051,21 +9193,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "code>lo`

          \n" - "````````````````" - "````````````````" -- "\n\n" -- "`hi` is parsed " -- "as code, leaving" -- " the backtick at" -- " the end as a " +- "\n\n`hi`" +- " is parsed as " +- "code, leaving " +- "the backtick at " +- "the end as a " - "literal\n" - "backtick.\n\n\n\n" - "## Code spans\n\n" -- "A [backtick " -- "string](@)\n" +- "A " +- "[backtick string" +- "](@)\n" - "is a string of " - "one or more " - "backtick " -- "characters (`` `" -- " ``) that is " +- characters ( +- "`` ` ``" +- ") that is " - "neither\n" - "preceded nor " - "followed by a " @@ -9087,23 +9231,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "normalized in " - "the\n" - "following ways:" -- "\n\n" -- "- First, [line " -- "endings] are " -- "converted to [" -- "spaces].\n" -- "- If the " -- resulting string -- " both begins *" -- and* ends with a -- " [space]\n " +- "\n\n- First, [" +- "line endings]" +- " are converted " +- "to [spaces].\n" +- "- " +- If the resulting +- " string both " +- begins *and* +- " ends with a [" +- "space]\n " - "character, but " - does not consist - " entirely of [" - "space]\n " - "characters, a " -- "single [space] " -- "character is " +- "single [space]" +- " character is " - removed from the - "\n " - front and back. @@ -9170,8 +9314,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n" -- Note that only * -- "one* space is " +- "Note that only " +- "*one*" +- " space is " - "stripped:\n\n" - "````````````````" - "````````````````" @@ -9196,11 +9341,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "code>

          \n" - "````````````````" - "````````````````" -- "\n\n" -- "Only [spaces], " -- "and not [unicode" -- " whitespace] in " -- "general, are\n" +- "\n\nOnly [spaces]" +- ", and not [" +- "unicode " +- "whitespace]" +- " in general, are" +- "\n" - stripped in this - " way:\n\n" - "````````````````" @@ -9227,10 +9373,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "/p>\n" - "````````````````" - "````````````````" -- "\n\n\n" -- "[Line endings] " -- are treated like -- " spaces:\n\n" +- "\n\n\n[Line endings" +- "]" +- " are treated " +- "like spaces:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -9271,16 +9417,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "collapse " - "consecutive " - "spaces\n" -- "when rendering `" -- "` elements" -- ", so it is " -- recommended that -- "\n" +- "when rendering " +- "``" +- " elements, so it" +- " is recommended " +- "that\n" - "the following " - "CSS be used:\n\n" -- " code{white-" -- "space: pre-wrap;" -- "}\n\n\n" +- " " +- "code{white-space" +- ": pre-wrap;}\n\n\n" - "Note that " - "backslash " - "escapes do not " @@ -9303,16 +9449,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "never needed, " - "because one can " - "always choose a\n" -- "string of *n* " -- "backtick " +- string of *n* +- " backtick " - "characters as " - "delimiters, " - "where the code " - "does\n" - "not contain any " - "strings of " -- "exactly *n* " -- "backtick " +- exactly *n* +- " backtick " - "characters.\n\n" - "````````````````" - "````````````````" @@ -9348,9 +9494,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "not parsed as " - "emphasized text," - " since the " -- "second `*` is " -- "part of a code\n" -- "span:\n\n" +- "second `*`" +- " is part of a " +- "code\nspan:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -9477,36 +9623,38 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "## Emphasis and " +- "## " +- "Emphasis and " - "strong emphasis\n" -- "\n" -- "John Gruber's " -- "original [" -- "Markdown syntax\n" -- "description](" -- "https://" +- "\nJohn Gruber'" +- "s original " +- "[Markdown syntax" +- "\ndescription" +- "](https://" - daringfireball.n - et/projects/ - "markdown/syntax#" - "em) says:\n\n" -- "> Markdown " -- treats asterisks -- " (`*`) and " +- "> " +- "Markdown treats " +- "asterisks (`*`" +- ") and " - "underscores (`_`" - ") as indicators " - "of\n> " - "emphasis. " - "Text wrapped " - "with one `*` or " -- "`_` will be " -- "wrapped with an " -- "HTML\n> ``" +- "`_`" +- " will be wrapped" +- " with an HTML\n> " +- "``" - " tag; double `*`" -- "'s or `_`'s will" -- " be wrapped with" -- " an HTML `<" -- "strong>`\n> tag." -- "\n\n" +- "'s or `_`'" +- "s will be " +- "wrapped with an " +- "HTML ``" +- "\n> tag.\n\n" - "This is enough " - "for most users, " - "but these rules " @@ -9516,11 +9664,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "it comes to " - nested emphasis. - " The original\n" -- "`Markdown.pl` " -- test suite makes -- " it clear that " -- "triple `***` and" -- "\n`___`" +- "`Markdown.pl`" +- " test suite " +- "makes it clear " +- "that triple " +- "`***` and\n`___`" - " delimiters can " - "be used for " - "strong emphasis," @@ -9564,8 +9712,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "restricted " - "intraword " - "emphasis to\nthe " -- "`*` forms, to " -- "avoid unwanted " +- "`*`" +- " forms, to avoid" +- " unwanted " - "emphasis in " - words containing - "\n" @@ -9598,44 +9747,48 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[delimiter run](" - "@) is either\n" - "a sequence of " -- "one or more `*` " -- "characters that " -- "is not preceded " -- "or\n" +- "one or more `*`" +- " characters that" +- " is not preceded" +- " or\n" - "followed by a " - non-backslash- -- "escaped `*` " -- "character, or a " -- "sequence\n" -- "of one or more `" -- "_` characters " -- "that is not " -- "preceded or " -- "followed by\n" +- "escaped `*`" +- " character, or a" +- " sequence\n" +- "of one or more " +- "`_`" +- " characters that" +- " is not preceded" +- " or followed by\n" - a non-backslash- -- "escaped `_` " -- "character.\n\n" -- "A [left-flanking" -- " delimiter run](" -- "@) is\na [" -- "delimiter run] " -- "that is (1) not " -- "followed by [" +- "escaped `_`" +- " character.\n\n" +- "A " +- "[left-flanking " +- "delimiter run](@" +- ") is\na [" +- "delimiter run]" +- " that is (1) not" +- " followed by [" - "Unicode " - "whitespace],\n" - "and either (2a) " - "not followed by " -- "a [Unicode " +- "a [" +- "Unicode " - "punctuation " - "character], or\n" - (2b) followed by -- " a [Unicode " +- " a [" +- "Unicode " - "punctuation " - "character] and\n" - "preceded by [" - "Unicode " -- "whitespace] or a" -- " [Unicode " +- "whitespace]" +- " or a [" +- "Unicode " - "punctuation " - "character].\n" - "For purposes of " @@ -9645,28 +9798,31 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the line count " - "as Unicode " - "whitespace.\n\n" -- "A [right-" -- "flanking " +- "A " +- "[right-flanking " - "delimiter run](@" - ") is\na [" -- "delimiter run] " -- "that is (1) not " -- "preceded by [" +- "delimiter run]" +- " that is (1) not" +- " preceded by [" - "Unicode " - "whitespace],\n" - "and either (2a) " - "not preceded by " -- "a [Unicode " +- "a [" +- "Unicode " - "punctuation " - "character], or\n" - (2b) preceded by -- " a [Unicode " +- " a [" +- "Unicode " - "punctuation " - "character] and\n" - "followed by [" - "Unicode " -- "whitespace] or a" -- " [Unicode " +- "whitespace]" +- " or a [" +- "Unicode " - "punctuation " - "character].\n" - "For purposes of " @@ -9679,38 +9835,36 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Here are some " - "examples of " - "delimiter runs.\n" -- "\n" -- " - left-" -- flanking but not -- " right-flanking:" -- "\n\n ```\n" -- " ***abc\n " -- " _abc\n " +- "\n - " +- "left-flanking " +- but not right- +- "flanking:\n\n " +- "```\n ***abc\n" +- " _abc\n " - "**\"abc\"\n " - " _\"abc\"\n" - " ```\n\n" - " - right-" - flanking but not - " left-flanking:\n" -- "\n ```\n" -- " abc***\n " +- "\n ```\n " +- " abc***\n " - " abc_\n " - "\"abc\"**\n " - "\"abc\"_\n ```\n\n" - " - Both left " - and right- - "flanking:\n\n " -- "```\n" -- " abc***def\n" -- " \"abc\"_\"def\"\n" +- "```\n " +- " abc***def\n " +- "\"abc\"_\"def\"\n" - " ```\n\n" - " - Neither left" - " nor right-" - "flanking:\n\n " -- "```\n" -- " abc *** def\n" -- " a _ b\n" -- " ```\n\n" +- "```\n " +- "abc *** def\n " +- "a _ b\n ```\n\n" - "(The idea of " - "distinguishing " - "left-flanking " @@ -9722,8 +9876,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " and the " - "character\n" - after comes from -- " Roopesh " -- "Chander's\n" +- " Roopesh Chander" +- "'s\n" - "[vfmd](https://" - web.archive.org/ - web/ @@ -9755,22 +9909,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "emphasis and " - "strong emphasis:" - "\n\n1. " -- "A single `*` " -- "character [can " -- "open emphasis](@" -- ")\n " +- "A single `*`" +- " character " +- "[can open " +- "emphasis](@)\n" +- " " - iff (if and only - " if) it is part " -- "of a [left-" -- "flanking " +- "of a [" +- "left-flanking " - "delimiter run]." - "\n\n2. " -- "A single `_` " -- "character [can " -- "open emphasis] " -- "iff\n " +- "A single `_`" +- " character [" +- "can open " +- "emphasis] iff" +- "\n " - "it is part of a " -- "[left-flanking " +- "[" +- "left-flanking " - "delimiter run]" - "\n " - "and either (a) " @@ -9779,29 +9936,33 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "delimiter run]" - "\n " - or (b) part of a -- " [right-flanking" -- " delimiter run]" +- " [" +- "right-flanking " +- "delimiter run]" - "\n " - "preceded by a [" - "Unicode " - "punctuation " - "character].\n\n" - "3. " -- "A single `*` " -- "character [can " -- "close emphasis](" -- "@)\n " +- "A single `*`" +- " character " +- "[can close " +- "emphasis](@)\n" +- " " - "iff it is part " -- "of a [right-" -- "flanking " +- "of a [" +- "right-flanking " - "delimiter run]." - "\n\n4. " -- "A single `_` " -- "character [can " -- "close emphasis] " -- "iff\n " +- "A single `_`" +- " character [" +- "can close " +- "emphasis] iff" +- "\n " - "it is part of a " -- "[right-flanking " +- "[" +- "right-flanking " - "delimiter run]" - "\n " - "and either (a) " @@ -9810,7 +9971,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "delimiter run]" - "\n " - or (b) part of a -- " [left-flanking " +- " [" +- "left-flanking " - "delimiter run]" - "\n " - "followed by a [" @@ -9818,12 +9980,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "punctuation " - "character].\n\n" - "5. " -- "A double `**` [" -- "can open strong " -- "emphasis](@)\n" -- " iff it is " -- "part of a [left-" -- "flanking " +- "A double `**` " +- "[can open strong" +- " emphasis](@)\n" +- " " +- "iff it is part " +- "of a [" +- "left-flanking " - "delimiter run]." - "\n\n6. " - "A double `__` [" @@ -9831,7 +9994,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "emphasis] iff" - "\n " - "it is part of a " -- "[left-flanking " +- "[" +- "left-flanking " - "delimiter run]" - "\n " - "and either (a) " @@ -9840,20 +10004,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "delimiter run]" - "\n " - or (b) part of a -- " [right-flanking" -- " delimiter run]" +- " [" +- "right-flanking " +- "delimiter run]" - "\n " - "preceded by a [" - "Unicode " - "punctuation " - "character].\n\n" - "7. " -- "A double `**` [" -- can close strong -- " emphasis](@)\n" -- " iff it is " -- "part of a [right" -- "-flanking " +- "A double `**` " +- "[can close " +- "strong emphasis]" +- "(@)\n " +- "iff it is part " +- "of a [" +- "right-flanking " - "delimiter run]." - "\n\n8. " - "A double `__` [" @@ -9861,7 +10027,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " emphasis] iff" - "\n " - "it is part of a " -- "[right-flanking " +- "[" +- "right-flanking " - "delimiter run]" - "\n " - "and either (a) " @@ -9870,7 +10037,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "delimiter run]" - "\n " - or (b) part of a -- " [left-flanking " +- " [" +- "left-flanking " - "delimiter run]" - "\n " - "followed by a [" @@ -9880,34 +10048,39 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "9. " - "Emphasis begins " - with a delimiter -- " that [can open " -- "emphasis] and " -- "ends\n " +- " that [" +- "can open " +- "emphasis]" +- " and ends\n " - with a delimiter -- " that [can close" -- " emphasis], and " -- "that uses the " -- "same\n " -- "character (`_` " -- "or `*`) as the " -- "opening " -- "delimiter. The\n" -- " opening and " +- " that [" +- "can close " +- "emphasis]" +- ", and that uses " +- "the same\n " +- "character (`_`" +- " or `*`" +- ) as the opening +- " delimiter. The" +- "\n " +- "opening and " - "closing " - "delimiters must " - "belong to " - "separate\n [" -- "delimiter runs]." -- " If one of the " +- "delimiter runs]" +- ". " +- "If one of the " - "delimiters can " - "both\n " - "open and close " - "emphasis, then " - "the sum of the " - "lengths of the\n" -- " delimiter " -- "runs containing " -- "the opening and " +- " " +- "delimiter runs " +- "containing the " +- "opening and " - "closing " - "delimiters\n " - "must not be a " @@ -9919,25 +10092,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Strong emphasis " - "begins with a " - "delimiter that\n" -- " [can open " -- "strong emphasis]" +- " [" +- "can open strong " +- "emphasis]" - " and ends with a" - " delimiter that\n" -- " [can close " -- "strong emphasis]" +- " [" +- can close strong +- " emphasis]" - ", and that uses " - "the same " - "character\n (" -- "`_` or `*`) as " -- "the opening " -- "delimiter. The\n" -- " opening and " +- "`_` or `*`" +- ) as the opening +- " delimiter. The" +- "\n " +- "opening and " - "closing " - "delimiters must " - "belong to " - "separate\n [" -- "delimiter runs]." -- " If one of the " +- "delimiter runs]" +- ". " +- "If one of the " - "delimiters can " - "both open\n " - and close strong @@ -9956,11 +10133,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - are multiples of - " 3.\n\n" - "11. " -- "A literal `*` " -- character cannot -- " occur at the " -- beginning or end -- " of\n `*`" +- "A literal `*`" +- " character " +- "cannot occur at " +- the beginning or +- " end of\n `*`" - "-delimited " - "emphasis or `**`" - "-delimited " @@ -9969,11 +10146,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - is backslash- - "escaped.\n\n" - "12. " -- "A literal `_` " -- character cannot -- " occur at the " -- beginning or end -- " of\n `_`" +- "A literal `_`" +- " character " +- "cannot occur at " +- the beginning or +- " end of\n `_`" - "-delimited " - "emphasis or `__`" - "-delimited " @@ -9997,29 +10174,31 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Thus, for " - "example,\n " - "an " -- "interpretation `" -- "...` is " -- always preferred -- " to\n " +- "interpretation " +- "`...`" +- " is always " +- "preferred to\n" +- " " - "`...`.\n\n" - "14. " - "An " -- "interpretation `" -- "...<" -- "/strong>` " -- "is always\n " -- "preferred to `<" -- strong>...`.\n\n" -- "15. " +- "interpretation " +- "`..." +- "`" +- " is always\n " +- "preferred to " +- "`..." +- "`." +- "\n\n15. " - "When two " - "potential " - "emphasis or " - "strong emphasis " - "spans overlap,\n" -- " so that the " +- " " +- "so that the " - "second begins " - before the first - " ends and ends " @@ -10030,9 +10209,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Thus, for " - "example,\n " - "`*foo _bar* baz_" -- "` is parsed as `" -- "foo _bar baz_` rather" +- "` is parsed as " +- "`foo _bar baz_` rather" - "\n than " - "`*foo bar* " - "baz`.\n\n" @@ -10053,14 +10232,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Thus, for " - "example,\n " - "`**foo **bar baz" -- "**` is parsed as" -- " `**foo " +- "**`" +- " is parsed as " +- "`**foo " - bar baz - "`\n " -- "rather than `<" -- strong>foo **bar -- " baz`." -- "\n\n17. " +- "rather than " +- "`foo **" +- bar baz +- "`.\n\n" +- "17. " - "Inline code " - "spans, links, " - "images, and HTML" @@ -10071,20 +10252,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "is a choice " - "between an " - "interpretation\n" -- " that " -- "contains one of " -- "these elements " -- "and one that " -- "does not, the\n" -- " former " -- "always wins. " +- " " +- "that contains " +- "one of these " +- elements and one +- " that does not, " +- "the\n " +- "former always " +- "wins. " - "Thus, for " -- "example, `*[foo*" -- "](bar)` is\n " -- "parsed as `*foo*<" -- "/a>` rather than" -- " as\n " +- "example, " +- "`*[foo*](bar)`" +- " is\n " +- "parsed as " +- "`*" +- "foo*`" +- " rather than as" +- "\n " - "`[foo](" - "bar)`.\n\n" - "These rules can " @@ -10105,12 +10289,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "opening `*` is " -- "followed by\n" +- "opening `*`" +- " is followed by\n" - "whitespace, and " - "hence not part " -- "of a [left-" -- "flanking " +- "of a [" +- "left-flanking " - "delimiter run]:" - "\n\n" - "````````````````" @@ -10125,8 +10309,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "opening `*` is " -- "preceded\n" +- "opening `*`" +- " is preceded\n" - "by an " - alphanumeric and - " followed by " @@ -10178,9 +10362,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "Intraword " -- "emphasis with `*" -- "` is permitted:" -- "\n\n" +- "emphasis with " +- "`*`" +- " is permitted:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -10212,8 +10396,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "opening `_` is " -- "followed by\n" +- "opening `_`" +- " is followed by\n" - "whitespace:\n\n" - "````````````````" - "````````````````" @@ -10227,8 +10411,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "opening `_` is " -- "preceded\n" +- "opening `_`" +- " is preceded\n" - "by an " - alphanumeric and - " followed by " @@ -10242,9 +10426,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "Emphasis with `_" -- "` is not allowed" -- " inside words:\n\n" +- "Emphasis with " +- "`_`" +- " is not allowed " +- "inside words:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -10271,9 +10456,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "тся_

          \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "Here `_` does " -- "not generate " +- "\n\n\nHere `_`" +- " does not " +- "generate " - "emphasis, " - "because the " - "first delimiter " @@ -10329,8 +10514,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "closing `*` is " -- "preceded by\n" +- "closing `*`" +- " is preceded by\n" - "whitespace:\n\n" - "````````````````" - "````````````````" @@ -10356,8 +10541,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "second `*`" -- " is\n" +- "second `*` is\n" - "preceded by " - "punctuation and " - "followed by an " @@ -10391,8 +10575,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "Intraword " -- "emphasis with `*" -- "` is allowed:\n\n" +- "emphasis with " +- "`*` is allowed:" +- "\n\n" - "````````````````" - "````````````````" - " example\n" @@ -10406,8 +10591,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "closing `_` is " -- "preceded by\n" +- "closing `_`" +- " is preceded by\n" - "whitespace:\n\n" - "````````````````" - "````````````````" @@ -10421,8 +10606,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "emphasis, " - "because the " -- "second `_`" -- " is\n" +- "second `_` is\n" - "preceded by " - "punctuation and " - "followed by an " @@ -10449,8 +10633,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "Intraword " - "emphasis is " -- "disallowed for `" -- "_`:\n\n" +- "disallowed for " +- "`_`:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -10528,8 +10712,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "strong emphasis," - " because the " -- "opening `**` is " -- "preceded\n" +- "opening `**`" +- " is preceded\n" - "by an " - alphanumeric and - " followed by " @@ -10549,9 +10733,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - Intraword strong -- " emphasis with `" -- "**` is permitted" -- ":\n\n" +- " emphasis with " +- "`**`" +- " is permitted:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -10604,8 +10788,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "strong emphasis," - " because the " -- "opening `__` is " -- "preceded\n" +- "opening `__`" +- " is preceded\n" - "by an " - alphanumeric and - " followed by " @@ -10621,8 +10805,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - Intraword strong - " emphasis is " -- "forbidden with `" -- "__`:\n\n" +- "forbidden with " +- "`__`:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -10702,15 +10886,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "(Nor can it be " - "interpreted as " -- "an emphasized `*" -- "foo bar *`, " -- "because of\n" +- "an emphasized " +- "`*foo bar *`" +- ", because of\n" - "Rule 11.)\n\n" - "This is not " - "strong emphasis," - " because the " -- "second `**`" -- " is\n" +- "second `**` is\n" - "preceded by " - "punctuation and " - "followed by an " @@ -10804,8 +10987,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This is not " - "strong emphasis," - " because the " -- "second `__`" -- " is\n" +- "second `__` is\n" - "preceded by " - "punctuation and " - "followed by an " @@ -10837,8 +11019,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - Intraword strong - " emphasis is " -- "forbidden with `" -- "__`:\n\n" +- "forbidden with " +- "`__`:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -11005,8 +11187,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " that\n" - "can both open " - "and close (like " -- "the `*` after `" -- "foo`)\n" +- "the `*` after " +- "`foo`)\n" - "cannot form " - "emphasis if the " - "sum of the " @@ -11022,8 +11204,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " multiples of 3." - "\n\n\n" - "For the same " -- "reason, we don't" -- " get two " +- "reason, we don'" +- "t get two " - "consecutive\n" - "emphasis " - sections in this @@ -11086,9 +11268,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " closing and " - "opening\n" - "delimiter runs " -- "are *both* " -- "multiples of 3, " -- "though,\n" +- are *both* +- " multiples of 3," +- " though,\n" - "they can match " - "to create " - "emphasis:\n\n" @@ -11417,9 +11599,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ", Rule 11 " - "determines\n" - "that the excess " -- "literal `*` " -- "characters will " -- "appear outside " +- "literal `*`" +- " characters will" +- " appear outside " - "of the\n" - "emphasis, rather" - " than inside it:" @@ -11547,9 +11729,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ", Rule 12 " - "determines\n" - "that the excess " -- "literal `_` " -- "characters will " -- "appear outside " +- "literal `_`" +- " characters will" +- " appear outside " - "of the\n" - "emphasis, rather" - " than inside it:" @@ -11863,10 +12045,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n\n" - "## Links\n\n" - "A link contains " -- "[link text] (the" -- " visible text), " -- "a [link " -- "destination]\n" +- "[link text]" +- " (the visible " +- "text), a [" +- link destination +- "]\n" - (the URI that is - " the link " - "destination), " @@ -11876,8 +12059,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "basic kinds of " - "links in " - "Markdown. In [" -- "inline links] " -- "the\n" +- "inline links]" +- " the\n" - "destination and " - "title are given " - "immediately " @@ -11900,7 +12083,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "). The\n" - "following rules " - "apply:\n\n" -- "- Links may not " +- "- " +- "Links may not " - "contain other " - "links, at any " - level of nesting @@ -11913,80 +12097,89 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "other, the inner" - "-most definition" - " is used.\n\n" -- "- Brackets are " +- "- " +- "Brackets are " - "allowed in the [" -- "link text] only " -- "if (a) they\n " +- "link text]" +- " only if (a) " +- "they\n " - are backslash- - "escaped or (b) " - they appear as a - " matched pair of" - " brackets,\n " - "with an open " -- "bracket `[`, a " -- sequence of zero -- " or more inlines" -- ", and\n " +- "bracket `[`" +- ", a sequence of " +- "zero or more " +- "inlines, and\n " - "a close bracket " - "`]`.\n\n" -- "- Backtick [code" -- " spans], [" -- "autolinks], and " -- "raw [HTML tags] " -- "bind more " +- "- " +- "Backtick [" +- "code spans], [" +- "autolinks]" +- ", and raw [" +- "HTML tags]" +- " bind more " - "tightly\n " - "than the " - brackets in link - " text. " - "Thus, for " - "example,\n " -- "`` [foo`]` `` " -- "could not be a " +- "`` [foo`]` ``" +- " could not be a " - "link text, since" - " the second `]`" - "\n " - "is part of a " - "code span.\n\n" -- "- The brackets " -- "in link text " -- "bind more " -- "tightly than " -- "markers for\n [" +- "- " +- "The brackets in " +- "link text bind " +- "more tightly " +- than markers for +- "\n [" - "emphasis and " - "strong emphasis]" - ". " - "Thus, for " -- "example, `*[foo*" -- "](url)` is a " -- "link.\n\n" -- "A [link " -- "destination](@) " -- "consists of " +- "example, " +- "`*[foo*](url)`" +- " is a link.\n\n" +- "A " +- "[link " +- "destination](@)" +- " consists of " - "either\n\n" -- "- a sequence of " +- "- " +- "a sequence of " - "zero or more " - "characters " - "between an " -- "opening `<` and " -- "a\n closing `>`" +- "opening `<`" +- " and a\n " +- "closing `>`" - " that contains " - "no line endings " - "or unescaped\n " -- "`<` or `>` " -- "characters, or\n\n" -- "- a nonempty " +- "`<` or `>`" +- " characters, or" +- "\n\n- " +- "a nonempty " - "sequence of " - "characters that " - "does not start " -- "with `<`" -- ",\n " +- "with `<`,\n " - does not include -- " [ASCII control " +- " [" +- "ASCII control " - "characters][" - "ASCII control " - "character]\n or " -- "[space] " -- "character, and " +- "[space]" +- " character, and " - "includes " - parentheses only - " if (a) they are" @@ -12011,60 +12204,69 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "nesting\n " - "should be " - "supported.)\n\n" -- "A [link title](@" -- ") consists of " +- "A " +- "[link title](@)" +- " consists of " - "either\n\n" -- "- a sequence of " +- "- " +- "a sequence of " - "zero or more " - "characters " - between straight - " double-quote\n " -- "characters (`\"`)" -- ", including a `\"" -- "` character only" -- " if it is\n " +- "characters (`\"`" +- "), including a " +- "`\"`" +- " character only " +- "if it is\n " - backslash- - "escaped, or\n\n" -- "- a sequence of " +- "- " +- "a sequence of " - "zero or more " - "characters " - between straight - " single-quote\n " -- "characters (`'`)" -- ", including a `'" -- "` character only" -- " if it is\n " +- "characters (`'`" +- "), including a " +- "`'`" +- " character only " +- "if it is\n " - backslash- - "escaped, or\n\n" -- "- a sequence of " +- "- " +- "a sequence of " - "zero or more " - "characters " - between matching - " parentheses\n (" -- "`(...)`), " -- "including a `(` " -- "or `)` character" -- " only if it is\n" -- " backslash-" +- "`(...)`" +- "), including a " +- "`(` or `)`" +- " character only " +- "if it is\n " +- backslash- - "escaped.\n\n" -- "Although [link " -- "titles] may span" -- " multiple lines," -- " they may not " +- "Although [" +- "link titles]" +- " may span " +- "multiple lines, " +- "they may not " - "contain\na [" - "blank line].\n\n" -- "An [inline link]" -- "(@) consists of " -- "a [link text] " -- "followed " +- "An " +- "[inline link](@)" +- " consists of a [" +- "link text]" +- " followed " - "immediately\n" - "by a left " -- "parenthesis `(`," -- " an optional [" +- "parenthesis `(`" +- ", an optional [" - link destination - "], an optional\n[" -- "link title], and" -- " a right " +- "link title]" +- ", and a right " - "parenthesis `)`." - "\n" - "These four " @@ -12073,41 +12275,43 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "spaces, tabs, " - "and up to one " - "line\nending.\n" -- "If both [link " -- "destination] and" -- " [link title] " -- "are present, " -- they *must* -- " be\n" +- "If both [" +- link destination +- "] and [" +- "link title]" +- " are present, " +- "they *must* be\n" - "separated by " - "spaces, tabs, " - "and up to one " - "line ending.\n\n" -- "The link's text " -- "consists of the " -- "inlines " +- "The link'" +- "s text consists " +- "of the inlines " - "contained\n" -- "in the [link " -- "text] (excluding" -- " the enclosing " -- square brackets) -- ".\nThe link'" +- "in the [" +- "link text]" +- " (excluding the " +- enclosing square +- " brackets).\n" +- "The link'" - "s URI consists " - "of the link " - "destination, " - "excluding " - "enclosing\n" -- "`<...>` if " -- "present, with " -- backslash- +- "`<...>`" +- " if present, " +- with backslash- - "escapes in " - "effect as " - "described\n" - above. The link -- "'s title " -- "consists of the " -- "link title, " -- "excluding its\n" +- "'" +- s title consists +- " of the link " +- "title, excluding" +- " its\n" - "enclosing " - "delimiters, with" - " backslash-" @@ -12235,8 +12439,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n" - "The destination " -- "can contain `)` " -- "if it is " +- "can contain `)`" +- " if it is " - "enclosed\n" - "in pointy " - "brackets:\n\n" @@ -12539,11 +12743,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "spaces, tabs, " - "and up to one " - "line\nending.\n" -- "Other [Unicode " -- "whitespace] like" -- " non-breaking " -- "space doesn't " -- "work.\n\n" +- "Other [" +- "Unicode " +- "whitespace]" +- " like non-" +- "breaking space " +- "doesn't work.\n\n" - "````````````````" - "````````````````" - " example\n" @@ -12591,12 +12796,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "/p>\n" - "````````````````" - "````````````````" -- "\n\n\n" -- "(Note: `" -- "Markdown.pl` did" -- " allow double " -- "quotes inside a " -- "double-quoted\n" +- "\n\n\n(Note: " +- "`Markdown.pl`" +- " did allow " +- "double quotes " +- inside a double- +- "quoted\n" - "title, and its " - "test suite " - "included a test " @@ -12610,8 +12815,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "brings, since " - "there are " - "already many " -- ways---backslash -- " escaping,\n" +- ways--- +- "backslash " +- "escaping,\n" - "entity and " - "numeric " - "character " @@ -12620,12 +12826,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "different\n" - "quote type for " - "the enclosing " -- title---to write -- " titles " +- title--- +- "to write titles " - "containing\n" - "double quotes. " -- "`Markdown.pl`'s " -- "handling of " +- "`Markdown.pl`'" +- "s handling of " - "titles has a " - "number\n" - of other strange @@ -12642,10 +12848,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "links, it allows" - " a title to " - "begin\nwith `\"`" -- " and end with `)" -- "`. " -- "`Markdown.pl` " -- "1.0.1 even " +- " and end with " +- "`)`. " +- "`Markdown.pl`" +- " 1.0.1 even " - "allows\n" - "titles with no " - "closing " @@ -12847,11 +13053,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "Note that " -- brackets that * -- "aren't* part of " -- "links do not " -- "take\nprecedence:" -- "\n\n" +- "brackets that " +- "*aren't*" +- " part of links " +- "do not take\n" +- "precedence:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -12909,39 +13115,42 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "There are three " -- "kinds of [" -- "reference link](" -- "@)s:\n" +- "kinds of " +- "[reference link]" +- "(@)s:\n" - "[full](#full-" -- "reference-link)," -- " [collapsed](#" +- reference-link) +- ", " +- "[collapsed](#" - collapsed- - "reference-link)," - "\nand " - "[shortcut](#" - shortcut- - reference-link). -- "\n\n" -- "A [full " -- "reference link](" -- "@)\n" +- "\n\nA " +- "[full reference " +- "link](@)\n" - "consists of a [" -- "link text] " -- "immediately " +- "link text]" +- " immediately " - "followed by a [" - "link label]\n" -- "that [matches] a" -- " [link reference" -- " definition] " -- elsewhere in the -- " document.\n\n" -- "A [link label](@" -- ) begins with a -- " left bracket (`" -- "[`) and ends\n" +- "that [matches]" +- " a [" +- "link reference " +- "definition]" +- " elsewhere in " +- "the document.\n\n" +- "A " +- "[link label](@)" +- " begins with a " +- left bracket ( +- "`[`) and ends\n" - "with the first " -- "right bracket (`" -- "]`) that is not " +- right bracket ( +- "`]`" +- ") that is not " - backslash- - "escaped.\n" - "Between these " @@ -12967,9 +13176,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "characters " - "inside the " - "square\nbrackets." -- "\n\n" -- "One label [" -- "matches](@)\n" +- "\n\nOne label " +- "[matches](@)\n" - "another just in " - "case their " - normalized forms @@ -12979,11 +13187,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " the opening and" - " closing " - "brackets,\n" -- perform the * -- "Unicode case " -- "fold*, strip " -- "leading and " -- "trailing\n" +- "perform the " +- "*Unicode case " +- fold* +- ", strip leading " +- "and trailing\n" - "spaces, tabs, " - and line endings - ", and collapse " @@ -13006,10 +13214,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " in such cases " - "to emit a " - "warning.)\n\n" -- "The link's URI " -- "and title are " -- "provided by the " -- "matching [link\n" +- "The link'" +- "s URI and title " +- "are provided by " +- "the matching [" +- "link\n" - "reference " - "definition].\n\n" - Here is a simple @@ -13027,9 +13236,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n\n" - "The rules for " -- "the [link text] " -- "are the same as " -- "with\n[" +- "the [link text]" +- " are the same as" +- " with\n[" - "inline links]" - ". Thus:\n\n" - "The link text " @@ -13132,10 +13341,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - (In the examples - " above, we have " -- "two [shortcut " +- "two [" +- "shortcut " - "reference links]" - "\ninstead of one " -- "[full reference " +- "[" +- "full reference " - "link].)\n\n" - "The following " - cases illustrate @@ -13265,9 +13476,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " or line endings" - " are allowed " - "between the [" -- "link text] and " -- "the\n[link label]" -- ":\n\n" +- "link text]" +- " and the\n[" +- "link label]:\n\n" - "````````````````" - "````````````````" - " example\n" @@ -13296,8 +13507,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "This is a " - "departure from " -- "John Gruber's " -- "original " +- "John Gruber'" +- "s original " - "Markdown syntax\n" - "description, " - which explicitly @@ -13310,10 +13521,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "It brings " - "reference links " - "in line with\n[" -- "inline links], " -- which (according -- " to both " -- "original " +- "inline links]" +- ", which (" +- "according to " +- "both original " - "Markdown and\n" - "this spec) " - "cannot have " @@ -13353,8 +13564,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " by Gruber\n" - "himself in a " - "beta version of " -- "`Markdown.pl`, " -- "but never " +- "`Markdown.pl`" +- ", but never " - "included\n" - "in the official " - "syntax " @@ -13379,8 +13590,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "results.)\n\n" - "When there are " - "multiple " -- "matching [link " -- "reference " +- "matching [" +- "link reference " - "definitions],\n" - "the first is " - "used:\n\n" @@ -13420,9 +13631,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

          \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "[Link labels] " -- "cannot contain " +- "\n\n\n[Link labels]" +- " cannot contain " - "brackets, unless" - " they are\n" - backslash- @@ -13490,10 +13700,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\">bar\\

          \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "A [link label] " -- "must contain at " -- "least one " +- "\n\n\nA [link label" +- "]" +- " must contain at" +- " least one " - "character that " - "is not a space, " - "tab, or\n" @@ -13516,28 +13726,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "]: /uri

          \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "A [collapsed " +- "\n\n\nA " +- "[collapsed " - "reference link](" - "@)\n" - "consists of a [" -- "link label] that" -- " [matches] a\n[" +- "link label]" +- " that [matches]" +- " a\n[" - "link reference " -- "definition] " -- elsewhere in the -- "\n" +- "definition]" +- " elsewhere in " +- "the\n" - "document, " - "followed by the " -- "string `[]`" -- ".\n" +- "string `[]`.\n" - "The contents of " - "the link label " - "are parsed as " - "inlines,\n" - "which are used " -- "as the link's " -- "text. The link'" +- "as the link'" +- "s text. " +- "The link'" - "s URI and title " - "are\n" - "provided by the " @@ -13607,36 +13818,39 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo\n[]

          \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "A [shortcut " +- "\n\n\nA " +- "[shortcut " - "reference link](" - "@)\n" - "consists of a [" -- "link label] that" -- " [matches] a\n[" +- "link label]" +- " that [matches]" +- " a\n[" - "link reference " -- "definition] " -- elsewhere in the -- "\n" +- "definition]" +- " elsewhere in " +- "the\n" - "document and is " - "not followed by " -- "`[]` or a link " -- "label.\n" +- "`[]`" +- " or a link label" +- ".\n" - "The contents of " - "the link label " - "are parsed as " - "inlines,\n" - "which are used " -- "as the link's " -- "text. The link'" +- "as the link'" +- "s text. " +- "The link'" - "s URI and title\n" - "are provided by " - "the matching " - "link reference " - "definition.\n" -- "Thus, `[foo]` is" -- " equivalent to `" -- "[foo][]`.\n\n" +- "Thus, `[foo]`" +- " is equivalent " +- "to `[foo][]`.\n\n" - "````````````````" - "````````````````" - " example\n" @@ -13809,11 +14023,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "\n\n" - In the following -- " case `[bar][baz" -- "]` is parsed as " -- "a reference,\n" -- "`[foo]` as " -- "normal text:\n\n" +- " case " +- "`[bar][baz]`" +- " is parsed as a " +- "reference,\n" +- "`[foo]`" +- " as normal text:" +- "\n\n" - "````````````````" - "````````````````" - " example\n" @@ -13825,9 +14041,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "Here, though, `[" -- "foo][bar]` is " -- "parsed as a " +- "Here, though, " +- "`[foo][bar]`" +- " is parsed as a " - "reference, since" - "\n`[bar]`" - " is defined:\n\n" @@ -13843,17 +14059,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

          \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "Here `[foo]` is " -- "not parsed as a " -- "shortcut " +- "\n\n\nHere `[foo]`" +- " is not parsed " +- "as a shortcut " - "reference, " - "because it\n" - is followed by a - " link label (" -- "even though `[" -- "bar]` is not " -- "defined):\n\n" +- "even though " +- "`[bar]`" +- " is not defined)" +- ":\n\n" - "````````````````" - "````````````````" - " example\n" @@ -13872,17 +14088,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the syntax for " - "links, with one\n" - "difference. " -- "Instead of [link" -- " text], we have " -- "an\n" +- "Instead of [" +- "link text]" +- ", we have an\n" - "[image " -- "description](@)." -- " The rules for " +- "description](@)" +- ". " +- "The rules for " - "this are the\n" - "same as for [" -- "link text], " -- "except that (a) " -- "an\n" +- "link text]" +- ", except that (a" +- ") an\n" - "image " - "description " - "starts with `![`" @@ -13900,8 +14117,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "HTML,\n" - "this is " - "standardly used " -- "as the image's `" -- "alt` attribute." +- "as the image's " +- "`alt` attribute." - "\n\n" - "````````````````" - "````````````````" @@ -13966,14 +14183,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "plain string " - "content\nof the [" - "image " -- "description] be " -- "used. " +- "description]" +- " be used. " - "Note that in\n" - "the above " - "example, the alt" -- " attribute's " -- "value is `foo " -- "bar`, not " +- " attribute'" +- "s value is " +- "`foo bar`, not " - "`foo\n" - "[bar](/url)` or " - "`foo `" @@ -14263,50 +14480,51 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " address\n" - "as the link " - "label.\n\n" -- "A [URI autolink]" -- "(@) consists of " -- "`<`, followed by" -- " an\n[" -- "absolute URI] " -- "followed by `>`." -- " It is parsed " -- "as\n" +- "A " +- "[URI autolink](@" +- ") consists of " +- "`<`" +- ", followed by an" +- "\n[absolute URI]" +- " followed by `>`" +- ". " +- "It is parsed as\n" - "a link to the " - "URI, with the " -- "URI as the " -- "link's label.\n\n" -- "An [absolute URI" -- "](@),\n" +- "URI as the link'" +- "s label.\n\n" +- "An " +- "[absolute URI](@" +- "),\n" - "for these " - "purposes, " - "consists of a [" -- "scheme] followed" -- " by a colon (`:`" -- ")\n" +- "scheme]" +- " followed by a " +- "colon (`:`)\n" - followed by zero - " or more " - characters other -- " than [ASCII " -- "control\n" +- " than [" +- "ASCII control\n" - "characters][" - "ASCII control " - "character], [" -- "space], `<`, and" -- " `>`" -- ".\n" +- "space], `<`" +- ", and `>`.\n" - "If the URI " - "includes these " - "characters, they" - " must be percent" - "-encoded\n(e.g. " -- "`%20` for a " -- "space).\n\n" +- "`%20`" +- " for a space).\n\n" - "For purposes of " -- "this spec, a [" -- "scheme](@) is " -- "any sequence\n" -- "of 2--32 " -- "characters " +- "this spec, a " +- "[scheme](@)" +- " is any sequence" +- "\nof 2--" +- "32 characters " - "beginning with " - "an ASCII letter " - "and followed\n" @@ -14315,9 +14533,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "ASCII letters, " - "digits, or the " - "symbols plus\n(\"+" -- "\"), period (\".\")" -- ", or hyphen (\"-\"" -- ").\n\n" +- "\"), period (\".\"" +- "), or hyphen (\"-" +- "\").\n\n" - "Here are some " - "valid autolinks:" - "\n\n" @@ -14388,8 +14606,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Note that many " - "strings that " - "count as [" -- "absolute URIs] " -- "for\n" +- "absolute URIs]" +- " for\n" - purposes of this - " spec are not " - "valid URIs, " @@ -14479,27 +14697,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "/a>

          \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "An [email " -- "autolink](@)\n" -- "consists of `<`," -- " followed by an " -- "[email address]," -- "\nfollowed by `>`" -- ". The link'" +- "\n\n\nAn " +- "[email autolink]" +- "(@)\nconsists of " +- "`<`" +- ", followed by an" +- " [email address]" +- ",\nfollowed by " +- "`>`. The link'" - "s label is the " - "email address,\n" -- "and the URL is `" -- "mailto:` " -- "followed by the " -- "email address.\n\n" -- "An [email " -- "address](@),\n" +- "and the URL is " +- "`mailto:`" +- " followed by the" +- " email address.\n" +- "\nAn " +- "[email address](" +- "@),\n" - "for these " - "purposes, is " - "anything that " - "matches\nthe " -- "[non-normative " +- "[" +- "non-normative " - "regex from the " - "HTML5\nspec" - "](https://" @@ -14633,9 +14853,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "## Raw HTML\n\n" - "Text between `<`" -- " and `>` that " -- "looks like an " -- "HTML tag is " +- " and `>`" +- " that looks like" +- " an HTML tag is " - "parsed as a\n" - raw HTML tag and - " will be " @@ -14654,16 +14874,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Here is the " - grammar for tags - ":\n\n" -- "A [tag name](@) " -- "consists of an " +- "A [tag name](@)" +- " consists of an " - "ASCII letter\n" - followed by zero - " or more ASCII " - "letters, digits," - " or\nhyphens (`-`" - ").\n\n" -- "An [attribute](@" -- ") consists of " +- "An " +- "[attribute](@)" +- " consists of " - "spaces, tabs, " - "and up to one " - "line ending,\nan " @@ -14672,17 +14893,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "optional\n[" - "attribute value " - "specification]." -- "\n\n" -- "An [attribute " -- "name](@)\n" +- "\n\nAn " +- "[attribute name]" +- "(@)\n" - "consists of an " -- "ASCII letter, `_" -- "`, or `:`, " -- followed by zero -- " or more ASCII\n" +- "ASCII letter, " +- "`_`, or `:`" +- ", followed by " +- "zero or more " +- "ASCII\n" - "letters, digits," -- " `_`, `.`, `:`, " -- "or `-`" +- " `_`, `.`, `:`" +- ", or `-`" - ". " - "(Note: This is " - "the XML\n" @@ -14690,11 +14912,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "restricted to " - "ASCII. " - HTML5 is laxer.) -- "\n\n" -- "An [attribute " -- "value " -- "specification](@" -- ")\n" +- "\n\nAn " +- "[attribute value" +- " specification](" +- "@)\n" - "consists of " - "optional spaces," - " tabs, and up to" @@ -14707,18 +14928,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ",\nand an [" - "attribute value]" - ".\n\n" -- "An [attribute " -- "value](@)\n" +- "An " +- "[attribute value" +- "](@)\n" - "consists of an [" - "unquoted " - "attribute value]" - ",\na [" - "single-quoted " - "attribute value]" -- ", or a [double-" -- quoted attribute -- " value].\n\n" -- "An [unquoted " +- ", or a [" +- "double-quoted " +- "attribute value]" +- ".\n\n" +- "An " +- "[unquoted " - "attribute value]" - "(@)\n" - "is a nonempty " @@ -14726,60 +14950,66 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "characters not\n" - including spaces - ", tabs, line " -- "endings, `\"`, `'" -- "`, `=`, `<`, `>`" -- ", or `` ` ``.\n\n" -- "A [single-quoted" -- " attribute value" -- "](@)\n" -- "consists of `'`," -- " zero or more\n" +- "endings, `\"`, " +- "`'`, `=`, `<`, " +- "`>`, or `` ` ``." +- "\n\nA " +- "[single-quoted " +- "attribute value]" +- "(@)\nconsists of " +- "`'`" +- ", zero or more\n" - "characters not " -- "including `'`, " -- "and a final `'`." -- "\n\n" -- "A [double-quoted" -- " attribute value" -- "](@)\n" -- "consists of `\"`," -- " zero or more\n" +- "including `'`" +- ", and a final " +- "`'`.\n\n" +- "A " +- "[double-quoted " +- "attribute value]" +- "(@)\nconsists of " +- "`\"`" +- ", zero or more\n" - "characters not " -- "including `\"`, " -- "and a final `\"`." -- "\n\n" +- "including `\"`" +- ", and a final " +- "`\"`.\n\n" - "An [open tag](@)" -- " consists of a `" -- "<` character, a " -- "[tag name],\n" +- " consists of a " +- "`<`" +- " character, a [" +- "tag name],\n" - "zero or more [" -- "attributes], " -- "optional spaces," -- " tabs, and up to" -- " one line ending" -- ",\nan optional " -- "`/` character, " -- "and a `>` " -- "character.\n\n" -- "A [closing tag](" -- "@) consists of " -- "the string `` character." +- "\n\nA " +- "[closing tag](@)" +- " consists of the" +- " string ``.\n\n" -- "An [HTML comment" -- "](@) consists of" -- " ``, ``, " +- "``, or " +- "``, " -- "and `-->` (see " -- "the\n" +- "string `-->`" +- ", and `-->`" +- " (see the\n" - "[HTML spec](" - "https://" - html.spec.whatwg @@ -14788,7 +15018,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - markup- - declaration-open - "-state)).\n\n" -- "A [processing " +- "A " +- "[processing " - "instruction](@)\n" - "consists of the " - "string ``" - ", and the string" - "\n`?>`.\n\n" -- "A [declaration](" -- "@) consists of " -- "the string ``, " -- "and the " +- "character `>`" +- ", and the " - "character `>`.\n\n" -- "A [CDATA section" -- "](@) consists of" -- "\nthe string " +- "A " +- "[CDATA section](" +- "@) consists of\n" +- "the string " - "`\n" - "````````````````" - "````````````````" -- "\n\n\n" -- "Closing tags:\n" -- "\n" +- "\n\n\nClosing tags:" +- "\n\n" - "````````````````" - "````````````````" - " example\n" @@ -15039,9 +15273,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ">

          \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "Declarations:\n" -- "\n" +- "\n\n\nDeclarations:" +- "\n\n" - "````````````````" - "````````````````" - " example\n" @@ -15107,8 +15340,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "## Hard line " -- "breaks\n\n" +- "## " +- Hard line breaks +- "\n\n" - A line ending ( - "not in a code " - span or HTML tag @@ -15118,11 +15352,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "spaces and does " - not occur at the - " end of a block\n" -- "is parsed as a [" -- "hard line break]" -- "(@) (rendered\n" -- "in HTML as a `<" -- "br />` tag):\n\n" +- "is parsed as a " +- "[hard line break" +- "](@) (rendered\n" +- "in HTML as a " +- "`
          ` tag):\n\n" - "````````````````" - "````````````````" - " example\n" @@ -15137,10 +15371,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "alternative, a " - backslash before - " the\n[" -- "line ending] may" -- " be used instead" -- " of two or more " -- "spaces:\n\n" +- "line ending]" +- " may be used " +- "instead of two " +- "or more spaces:\n" +- "\n" - "````````````````" - "````````````````" - " example\n" @@ -15233,9 +15468,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "span

          \n" - "````````````````" - "````````````````" -- "\n\n\n" -- "or HTML tags:\n" -- "\n" +- "\n\n\nor HTML tags:" +- "\n\n" - "````````````````" - "````````````````" - " example\n" @@ -15300,8 +15534,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "\n\n\n" -- "## Soft line " -- "breaks\n\n" +- "## " +- Soft line breaks +- "\n\n" - "A regular line " - ending (not in a - " code span or " @@ -15311,20 +15546,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "or more spaces " - "or a backslash " - "is parsed as a\n" -- "[softbreak](@)." -- " (A soft line " +- "[softbreak](@)" +- ". " +- "(A soft line " - "break may be " - rendered in HTML - " either as a\n[" -- "line ending] or " -- "as a space. " +- "line ending]" +- " or as a space. " - "The result will " - "be the same in\n" - "browsers. " - "In the examples " -- "here, a [line " -- "ending] will be " -- "used.)\n\n" +- "here, a [" +- "line ending]" +- " will be used.)" +- "\n\n" - "````````````````" - "````````````````" - " example\n" @@ -15361,8 +15598,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "breaks\n" - "as hard line " - "breaks.\n\n" -- "## Textual " -- "content\n\n" +- "## " +- Textual content +- "\n\n" - "Any characters " - "not given an " - "interpretation " @@ -15404,7 +15642,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "`." +- "It ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks) containing the current HTML\nblock, if no line is encountered that meets the [" +- "end condition]. If\nthe first line meets both the [start condition] and the [end\ncondition], the block will contain just that line." +- "1." +- "**Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``," +- "``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``." - "3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``." - "5. **Start condition:** line begins with the string\n``." -- "6. **Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:**" - "line is followed by a [blank line]." -- "7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\" +- "7." +- "**Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\" - "**End condition:** line is followed by a [blank line]." -- "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML" -- "block** that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state." +- "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\nblock**" +- " that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state." - "For instance, `
    ` within an HTML block started by `
    ` will not affect\nthe parser state; as the HTML block was started in by start condition 6, it\nwill end at any blank line. This can be surprising:" - "```````````````````````````````` example\n
    \n
    \n**Hello**,\n\n_world_.\n
    \n
    \n.\n
    \n
    \n**Hello**,\n

    world.\n

    \n
    \n````````````````````````````````" - "In this case, the HTML block is terminated by the blank line — the `**Hello**`\ntext remains verbatim — and regular parsing resumes, with a paragraph,\nemphasised `world` and inline and block HTML following." @@ -315,12 +325,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n*foo*\n.\n

    foo

    \n````````````````````````````````" - "HTML tags designed to contain literal content\n(`pre`, `script`, `style`, `textarea`), comments, processing instructions,\nand declarations are treated somewhat differently.\nInstead of ending at the first blank line, these blocks" - "end at the first line containing a corresponding end tag.\nAs a result, these blocks can contain blank lines:\n\nA pre tag (type 1):" -- "```````````````````````````````` example\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
    \nokay\n.\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()"
    -- "main = print $ parseTags tags\n
    \n

    okay

    \n````````````````````````````````\n\n\nA script tag (type 1):" -- "```````````````````````````````` example\n\nokay\n.\n\n

    okay

    \n````````````````````````````````\n\n\nA textarea tag (type 1):" -- "```````````````````````````````` example\n\n.\n\n````````````````````````````````\n\nA style tag (type 1):" -- "```````````````````````````````` example\n\nh1 {color:red;}\n\np {color:blue;}\n\nokay\n.\n\nh1 {color:red;}\n\np {color:blue;}\n\n

    okay

    \n````````````````````````````````" +- "```````````````````````````````` example" +- "
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
    \nokay\n.\n
    \nimport Text.HTML.TagSoup\n\nmain :: IO ()\nmain = print $ parseTags tags\n
    \n

    okay

    " +- "````````````````````````````````\n\n\nA script tag (type 1):" +- "```````````````````````````````` example" +- "\nokay\n.\n\n

    okay

    \n````````````````````````````````\n\n\nA textarea tag (type 1):\n\n```````````````````````````````` example\n\n.\n\n````````````````````````````````" +- "A style tag (type 1):\n\n```````````````````````````````` example\n\nh1 {color:red;}\n\np {color:blue;}\n\nokay\n.\n\nh1 {color:red;}\n\np {color:blue;}\n\n

    okay

    \n````````````````````````````````" - "If there is no matching end tag, the block will end at the\nend of the document (or the enclosing [block quote][block quotes]\nor [list item][list items]):" - "```````````````````````````````` example\n\n\nfoo\n.\n\n\nfoo\n````````````````````````````````" - "```````````````````````````````` example\n>
    \n> foo\n\nbar\n.\n
    \n
    \nfoo\n
    \n

    bar

    \n````````````````````````````````" @@ -331,16 +342,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n\nokay\n.\n\n

    okay

    \n````````````````````````````````\n\n\n\nA processing instruction (type 3):" - "```````````````````````````````` example\n';\n\n?>\nokay\n.\n';\n\n?>\n

    okay

    \n````````````````````````````````\n\n\nA declaration (type 4):" - "```````````````````````````````` example\n\n.\n\n````````````````````````````````\n\n\nCDATA (type 5):" -- "```````````````````````````````` example\n\nokay\n.\n\n

    okay

    \n````````````````````````````````\n\n\nThe opening tag can be preceded by up to three spaces of indentation, but not\nfour:" +- "```````````````````````````````` example" +- "\nokay\n.\n\n

    okay

    " +- "````````````````````````````````\n\n\nThe opening tag can be preceded by up to three spaces of indentation, but not\nfour:" - "```````````````````````````````` example\n \n\n \n.\n \n
    <!-- foo -->\n
    \n````````````````````````````````" - "```````````````````````````````` example\n
    \n\n
    \n.\n
    \n
    <div>\n
    \n````````````````````````````````\n\n\nAn HTML block of types 1--6 can interrupt a paragraph, and need not be\npreceded by a blank line." - "```````````````````````````````` example\nFoo\n
    \nbar\n
    \n.\n

    Foo

    \n
    \nbar\n
    \n````````````````````````````````" - "However, a following blank line is needed, except at the end of\na document, and except for blocks of types 1--5, [above][HTML\nblock]:\n\n```````````````````````````````` example\n
    \nbar\n
    \n*foo*\n.\n
    \nbar\n
    \n*foo*\n````````````````````````````````" - "HTML blocks of type 7 cannot interrupt a paragraph:\n\n```````````````````````````````` example\nFoo\n
    \nbaz\n.\n

    Foo\n\nbaz

    \n````````````````````````````````" - "This rule differs from John Gruber's original Markdown syntax\nspecification, which says:" -- "> The only restrictions are that block-level HTML elements —\n> e.g. `
    `, ``, `
    `, `

    `, etc. — must be separated from\n> surrounding content by blank lines, and the start and end tags of the" -- "> block should not be indented with spaces or tabs.\n\nIn some ways Gruber's rule is more restrictive than the one given\nhere:" +- ">" +- "The only restrictions are that block-level HTML elements —\n> e.g. `

    `, `
    `, `
    `, `

    `, etc. — must be separated from\n> surrounding content by blank lines, and the start and end tags of the\n> block should not be indented with spaces or tabs." +- "In some ways Gruber's rule is more restrictive than the one given\nhere:" - "- It requires that an HTML block be preceded by a blank line.\n- It does not allow the start tag to be indented.\n- It requires a matching end tag, which it also does not allow to\n be indented." - "Most Markdown implementations (including some of Gruber's own) do not\nrespect all of these restrictions." - "There is one respect, however, in which Gruber's rule is more liberal\nthan the one given here, since it allows blank lines to occur inside\nan HTML block. There are two reasons for disallowing them here." @@ -358,8 +371,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Link reference definitions" - "A [link reference definition](@)\nconsists of a [link label], optionally preceded by up to three spaces of\nindentation, followed\nby a colon (`:`), optional spaces or tabs (including up to one\n[line ending]), a [link destination]," - "optional spaces or tabs (including up to one\n[line ending]), and an optional [link\ntitle], which if it is present must be separated\nfrom the [link destination] by spaces or tabs.\nNo further character may occur." -- "A [link reference definition]\ndoes not correspond to a structural element of a document. Instead, it\ndefines a label which can be used in [reference links]\nand reference-style [images] elsewhere in the document. [Link" -- "reference definitions] can come either before or after the links that use\nthem.\n\n```````````````````````````````` example\n[foo]: /url \"title\"\n\n[foo]\n.\n

    foo

    \n````````````````````````````````" +- "A [link reference definition]\ndoes not correspond to a structural element of a document. Instead, it\ndefines a label which can be used in [reference links]\nand reference-style [images] elsewhere in the document. [Link\nreference definitions]" +- " can come either before or after the links that use\nthem.\n\n```````````````````````````````` example\n[foo]: /url \"title\"\n\n[foo]\n.\n

    foo

    \n````````````````````````````````" - "```````````````````````````````` example\n [foo]: \n /url \n 'the title' \n\n[foo]\n.\n

    foo

    \n````````````````````````````````" - "```````````````````````````````` example\n[Foo*bar\\]]:my_(url) 'title (with parens)'\n\n[Foo*bar\\]]\n.\n

    Foo*bar]

    \n````````````````````````````````" - "```````````````````````````````` example\n[Foo bar]:\n\n'title'\n\n[Foo bar]\n.\n

    Foo bar

    \n````````````````````````````````\n\n\nThe title may extend over multiple lines:" @@ -403,15 +416,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n \n\naaa\n \n\n# aaa\n\n \n.\n

    aaa

    \n

    aaa

    \n````````````````````````````````" - "# Container blocks\n\nA [container block](#container-blocks) is a block that has other\nblocks as its contents. There are two basic kinds of container blocks:\n[block quotes] and [list items].\n[Lists] are meta-containers for [list items]." - "We define the syntax for container blocks recursively. The general\nform of the definition is:\n\n> If X is a sequence of blocks, then the result of\n> transforming X in such-and-such a way is a container of type Y\n> with these blocks as its content." -- "So, we explain what counts as a block quote or list item by explaining\nhow these can be *generated* from their contents. This should suffice\nto define the syntax, although it does not give a recipe for *parsing*\nthese constructions." -- "(A recipe is provided below in the section entitled\n[A parsing strategy](#appendix-a-parsing-strategy).)" +- "So, we explain what counts as a block quote or list item by explaining\nhow these can be *generated* from their contents. This should suffice\nto define the syntax, although it does not give a recipe for *parsing*" +- "these constructions. (A recipe is provided below in the section entitled\n[A parsing strategy](#appendix-a-parsing-strategy).)" - "## Block quotes\n\nA [block quote marker](@),\noptionally preceded by up to three spaces of indentation,\nconsists of (a) the character `>` together with a following space of\nindentation, or (b) a single character `>` not followed by a space of\nindentation." - "The following rules define [block quotes]:" -- "1. **Basic case.** If a string of lines *Ls* constitute a sequence\n of blocks *Bs*, then the result of prepending a [block quote\n marker] to the beginning of each line in *Ls*\n is a [block quote](#block-quotes) containing *Bs*.\n\n2. **Laziness." -- "** If a string of lines *Ls* constitute a [block\n quote](#block-quotes) with contents *Bs*" -- ", then the result of deleting\n the initial [block quote marker] from one or\n more lines in which the next character other than a space or tab after the\n [block quote marker] is [paragraph continuation\n text] is a block quote with *Bs*" -- " as its content.\n [Paragraph continuation text](@) is text\n that will be parsed as part of the content of a paragraph, but does\n not occur at the beginning of the paragraph." -- "3. **Consecutiveness.** A document cannot contain two [block\n quotes] in a row unless there is a [blank line] between them." +- "1. **Basic case.** If a string of lines *Ls* constitute a sequence\n of blocks *Bs*, then the result of prepending a [block quote\n marker] to the beginning of each line in *Ls*\n is a [block quote](#block-quotes) containing *Bs*." +- "2." +- "**Laziness.** If a string of lines *Ls* constitute a [block\n quote](#block-quotes) with contents *Bs*, then the result of deleting\n the initial [block quote marker] from one or" +- "more lines in which the next character other than a space or tab after the\n [block quote marker] is [paragraph continuation\n text] is a block quote with *Bs* as its content.\n [Paragraph continuation text](@) is text" +- "that will be parsed as part of the content of a paragraph, but does\n not occur at the beginning of the paragraph.\n\n3. **Consecutiveness.** A document cannot contain two [block\n quotes] in a row unless there is a [blank line] between them." - "Nothing else counts as a [block quote](#block-quotes).\n\nHere is a simple example:\n\n```````````````````````````````` example\n> # Foo\n> bar\n> baz\n.\n
    \n

    Foo

    \n

    bar\nbaz

    \n
    \n````````````````````````````````" - "The space or tab after the `>` characters can be omitted:\n\n```````````````````````````````` example\n># Foo\n>bar\n> baz\n.\n
    \n

    Foo

    \n

    bar\nbaz

    \n
    \n````````````````````````````````" - "The `>` characters can be preceded by up to three spaces of indentation:\n\n```````````````````````````````` example\n > # Foo\n > bar\n > baz\n.\n
    \n

    Foo

    \n

    bar\nbaz

    \n
    \n````````````````````````````````" @@ -445,16 +458,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## List items\n\nA [list marker](@) is a\n[bullet list marker] or an [ordered list marker].\n\nA [bullet list marker](@)\nis a `-`, `+`, or `*` character." - "An [ordered list marker](@)\nis a sequence of 1--9 arabic digits (`0-9`), followed by either a\n`.` character or a `)` character. (The reason for the length\nlimit is that with 10 digits we start seeing integer overflows\nin some browsers.)" - "The following rules define [list items]:" -- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation," -- " then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs*" -- " as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:" +- "1." +- "**Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation," +- "then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item" +- "(bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:" - "1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if" -- " the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item." +- "the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item." - "For example, let *Ls* be the lines" - "```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    " - "````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:" -- "```````````````````````````````` example\n1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      " -- "
    2. \n
    \n````````````````````````````````" +- "```````````````````````````````` example" +- "1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    " +- "````````````````````````````````" - "The most important thing to notice is that the position of\nthe text after the list marker determines how much indentation\nis needed in subsequent blocks in the list item. If the list" - "marker takes up two spaces of indentation, and there are three spaces between\nthe list marker and the next character other than a space or tab, then blocks\nmust be indented five spaces in order to fall under the list\nitem." - "Here are some examples showing how far content must be indented to be\nput under the list item:\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • one
    • \n
    \n

    two

    \n````````````````````````````````" @@ -478,8 +493,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n003. ok\n.\n
      \n
    1. ok
    2. \n
    \n````````````````````````````````\n\n\nA start number may not be negative:" - "```````````````````````````````` example\n-1. not ok\n.\n

    -1. not ok

    \n````````````````````````````````" - "2. **Item starting with indented code.** If a sequence of lines *Ls*\n constitute a sequence of blocks *Bs* starting with an indented code\n block, and *M* is a list marker of width *W* followed by" -- " one space of indentation, then the result of prepending *M* and the\n following space to the first line of *Ls*, and indenting subsequent lines\n of *Ls* by *W + 1* spaces, is a list item with *Bs*" -- " as its contents.\n If a line is empty, then it need not be indented. The type of the\n list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a" +- "one space of indentation, then the result of prepending *M* and the\n following space to the first line of *Ls*, and indenting subsequent lines\n of *Ls* by *W + 1* spaces, is a list item with *Bs* as its contents." +- "If a line is empty, then it need not be indented. The type of the\n list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a" - "start number, based on the ordered list marker.\n\nAn indented code block will have to be preceded by four spaces of indentation\nbeyond the edge of the region where text will be included in the list item.\nIn the following case that is 6 spaces:" - "```````````````````````````````` example\n- foo\n\n bar\n.\n
      \n
    • \n

      foo

      \n
      bar\n
      \n
    • \n
    \n````````````````````````````````\n\n\nAnd in this case it is 11 spaces:" - "```````````````````````````````` example\n 10. foo\n\n bar\n.\n
      \n
    1. \n

      foo

      \n
      bar\n
      \n
    2. \n
    \n````````````````````````````````" @@ -488,15 +503,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n1. indented code\n\n paragraph\n\n more code\n.\n
      \n
    1. \n
      indented code\n
      \n

      paragraph

      \n
      more code\n
      \n
    2. \n
    \n````````````````````````````````" - "Note that an additional space of indentation is interpreted as space\ninside the code block:" - "```````````````````````````````` example\n1. indented code\n\n paragraph\n\n more code\n.\n
      \n
    1. \n
       indented code\n
      \n

      paragraph

      \n
      more code\n
      \n
    2. \n
    \n````````````````````````````````" -- "Note that rules #1 and #2 only apply to two cases: (a) cases\nin which the lines to be included in a list item begin with a\ncharacter other than a space or tab, and (b) cases in which\nthey begin with an indented code\nblock." -- "In a case like the following, where the first block begins with\nthree spaces of indentation, the rules do not allow us to form a list item by\nindenting the whole thing and prepending a list marker:" +- "Note that rules #1 and #2 only apply to two cases: (a) cases\nin which the lines to be included in a list item begin with a\ncharacter other than a space or tab, and (b) cases in which\nthey begin with an indented code" +- "block. In a case like the following, where the first block begins with\nthree spaces of indentation, the rules do not allow us to form a list item by\nindenting the whole thing and prepending a list marker:" - "```````````````````````````````` example\n foo\n\nbar\n.\n

    foo

    \n

    bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n- foo\n\n bar\n.\n
      \n
    • foo
    • \n
    \n

    bar

    \n````````````````````````````````" - "This is not a significant restriction, because when a block is preceded by up to\nthree spaces of indentation, the indentation can always be removed without\na change in interpretation, allowing rule #1 to be applied. So, in\nthe above case:" - "```````````````````````````````` example\n- foo\n\n bar\n.\n
      \n
    • \n

      foo

      \n

      bar

      \n
    • \n
    \n````````````````````````````````" -- "3. **Item starting with a blank line.** If a sequence of lines *Ls*\n starting with a single [blank line] constitute a (possibly empty)\n sequence of blocks *Bs*, and *M* is a list marker of width *W*," -- " then the result of prepending *M* to the first line of *Ls*, and\n preceding subsequent lines of *Ls* by *W + 1* spaces of indentation, is a\n list item with *Bs*" -- " as its contents.\n If a line is empty, then it need not be indented. The type of the\n list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a" -- "start number, based on the ordered list marker.\n\nHere are some list items that start with a blank line but are not empty:" +- "3. **Item starting with a blank line.** If a sequence of lines *Ls*\n starting with a single [blank line] constitute a (possibly empty)\n sequence of blocks *Bs*, and *M* is a list marker of width *W*,\n then the result of prepending *M*" +- " to the first line of *Ls*, and\n preceding subsequent lines of *Ls* by *W + 1* spaces of indentation, is a\n list item with *Bs* as its contents.\n If a line is empty, then it need not be indented. The type of the" +- "list item (bullet or ordered) is determined by the type of its list\n marker. If the list item is ordered, then it is also assigned a\n start number, based on the ordered list marker." +- "Here are some list items that start with a blank line but are not empty:" - "```````````````````````````````` example\n-\n foo\n-\n ```\n bar\n ```\n-\n baz\n.\n
      \n
    • foo
    • \n
    • \n
      bar\n
      \n
    • \n
    • \n
      baz\n
      \n
    • \n
    \n````````````````````````````````" - "When the list item starts with a blank line, the number of spaces\nfollowing the list marker doesn't change the required indentation:\n\n```````````````````````````````` example\n- \n foo\n.\n
      \n
    • foo
    • \n
    \n````````````````````````````````" - "A list item can begin with at most one blank line.\nIn the following example, `foo` is not part of the list\nitem:\n\n```````````````````````````````` example\n-\n\n foo\n.\n
      \n
    • \n
    \n

    foo

    \n````````````````````````````````" @@ -506,21 +521,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n*\n.\n
      \n
    • \n
    \n````````````````````````````````\n\nHowever, an empty list item cannot interrupt a paragraph:" - "```````````````````````````````` example\nfoo\n*\n\nfoo\n1.\n.\n

    foo\n*

    \n

    foo\n1.

    \n````````````````````````````````" - "4. **Indentation.** If a sequence of lines *Ls* constitutes a list item\n according to rule #1, #2, or #3, then the result of preceding each line\n of *Ls* by up to three spaces of indentation (the same for each line) also" -- " constitutes a list item with the same contents and attributes. If a line is\n empty, then it need not be indented.\n\nIndented one space:" -- "```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      " -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nIndented two spaces:" -- "```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      " -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nIndented three spaces:" -- "```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      " -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nFour spaces indent gives a code block:" +- "constitutes a list item with the same contents and attributes. If a line is\n empty, then it need not be indented.\n\nIndented one space:" +- "```````````````````````````````` example" +- " 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    " +- "````````````````````````````````\n\n\nIndented two spaces:" +- "```````````````````````````````` example" +- " 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    " +- "````````````````````````````````\n\n\nIndented three spaces:" +- "```````````````````````````````` example" +- " 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    " +- "````````````````````````````````\n\n\nFour spaces indent gives a code block:" - "```````````````````````````````` example\n 1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
    1.  A paragraph\n    with two lines.\n\n        indented code\n\n    > A block quote.\n
    " - "````````````````````````````````" - "5. **Laziness.** If a string of lines *Ls* constitute a [list\n item](#list-items) with contents *Bs*, then the result of deleting\n some or all of the indentation from one or more lines in which the" -- " next character other than a space or tab after the indentation is\n [paragraph continuation text] is a\n list item with the same contents and attributes. The unindented\n lines are called\n [lazy continuation line](@)s." +- "next character other than a space or tab after the indentation is\n [paragraph continuation text] is a\n list item with the same contents and attributes. The unindented\n lines are called\n [lazy continuation line](@)s." - "Here is an example with [lazy continuation lines]:" -- "```````````````````````````````` example\n 1. A paragraph\nwith two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      " -- "
      \n
    2. \n
    \n````````````````````````````````\n\n\nIndentation can be partially deleted:" -- "```````````````````````````````` example\n 1. A paragraph\n with two lines.\n.\n
      \n
    1. A paragraph\nwith two lines.
    2. \n
    \n````````````````````````````````\n\n\nThese examples show how laziness can work in nested structures:" +- "```````````````````````````````` example" +- " 1. A paragraph\nwith two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    " +- "````````````````````````````````\n\n\nIndentation can be partially deleted:\n\n```````````````````````````````` example\n 1. A paragraph\n with two lines.\n.\n
      \n
    1. A paragraph\nwith two lines.
    2. \n
    \n````````````````````````````````" +- "These examples show how laziness can work in nested structures:" - "```````````````````````````````` example\n> 1. > Blockquote\ncontinued here.\n.\n
    \n
      \n
    1. \n
      \n

      Blockquote\ncontinued here.

      \n
      \n
    2. \n
    \n
    \n````````````````````````````````" - "```````````````````````````````` example\n> 1. > Blockquote\n> continued here.\n.\n
    \n
      \n
    1. \n
      \n

      Blockquote\ncontinued here.

      \n
      \n
    2. \n
    \n
    \n````````````````````````````````" - "6. **That's all.** Nothing that is not counted as a list item by rules\n #1--5 counts as a [list item](#list-items)." @@ -533,8 +552,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n1. - 2. foo\n.\n
      \n
    1. \n
        \n
      • \n
          \n
        1. foo
        2. \n
        \n
      • \n
      \n
    2. \n
    \n````````````````````````````````\n\n\nA list item can contain a heading:" - "```````````````````````````````` example\n- # Foo\n- Bar\n ---\n baz\n.\n
      \n
    • \n

      Foo

      \n
    • \n
    • \n

      Bar

      \nbaz
    • \n
    \n````````````````````````````````" - "### Motivation\n\nJohn Gruber's Markdown spec says the following about list items:" -- "1. \"List markers typically start at the left margin, but may be indented\n by up to three spaces. List markers must be followed by one or more\n spaces or a tab.\"\n\n2. \"To make lists look nice, you can wrap items with hanging indents...." -- " But if you don't want to, you don't have to.\"\n\n3. \"List items may consist of multiple paragraphs. Each subsequent\n paragraph in a list item must be indented by either 4 spaces or one\n tab.\"" +- "1. \"List markers typically start at the left margin, but may be indented\n by up to three spaces. List markers must be followed by one or more\n spaces or a tab.\"" +- "2. \"To make lists look nice, you can wrap items with hanging indents....\n But if you don't want to, you don't have to.\"" +- "3. \"List items may consist of multiple paragraphs. Each subsequent\n paragraph in a list item must be indented by either 4 spaces or one\n tab.\"" - "4. \"It looks nice if you indent every line of the subsequent paragraphs,\n but here again, Markdown will allow you to be lazy.\"\n\n5. \"To put a blockquote within a list item, the blockquote's `>`\n delimiters need to be indented.\"" - "6. \"To put a code block within a list item, the code block needs to be\n indented twice — 8 spaces or two tabs.\"" - "These rules specify that a paragraph under a list item must be indented\nfour spaces (presumably, from the left margin, rather than the start of\nthe list marker, but this is not said), and that code under a list item" @@ -554,8 +574,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Would it help to adopt a two-space rule? The problem is that such\na rule, together with the rule allowing up to three spaces of indentation for\nthe initial list marker, allows text that is indented *less than* the" - "original list marker to be included in the list item. For example,\n`Markdown.pl` parses\n\n``` markdown\n - one\n\n two\n```\n\nas a single list item, with `two` a continuation paragraph:\n\n``` html\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n```\n\nand similarly" - "``` markdown\n> - one\n>\n> two\n```\n\nas\n\n``` html\n
    \n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n
    \n```\n\nThis is extremely unintuitive." -- "Rather than requiring a fixed indent from the margin, we could require\na fixed indent (say, two spaces, or even one space) from the list marker (which\nmay itself be indented). This proposal would remove the last anomaly\ndiscussed." -- "Unlike the spec presented above, it would count the following\nas a list item with a subparagraph, even though the paragraph `bar`\nis not indented as far as the first paragraph `foo`:\n\n``` markdown\n 10. foo\n\n bar \n```" +- "Rather than requiring a fixed indent from the margin, we could require\na fixed indent (say, two spaces, or even one space) from the list marker (which\nmay itself be indented). This proposal would remove the last anomaly" +- "discussed. Unlike the spec presented above, it would count the following\nas a list item with a subparagraph, even though the paragraph `bar`\nis not indented as far as the first paragraph `foo`:\n\n``` markdown\n 10. foo\n\n bar \n```" - "Arguably this text does read like a list item with `bar` as a subparagraph,\nwhich may count in favor of the proposal. However, on this proposal indented\ncode would have to be indented six spaces after the list marker. And this" - "would break a lot of existing Markdown, which has the pattern:\n\n``` markdown\n1. foo\n\n indented code\n```" - "where the code is indented eight spaces. The spec above, by contrast, will\nparse this text as expected, since the code block's indentation is measured\nfrom the beginning of `foo`." @@ -563,8 +583,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that in such cases, we require one space indentation from the list marker\n(and then the normal four spaces for the indented code). This will match the\nfour-space rule in cases where the list marker plus its initial indentation" - "takes four spaces (a common case), but diverge in other cases." - "## Lists\n\nA [list](@) is a sequence of one or more\nlist items [of the same type]. The list items\nmay be separated by any number of blank lines." -- "Two list items are [of the same type](@)\nif they begin with a [list marker] of the same type.\nTwo list markers are of the\nsame type if (a) they are bullet list markers using the same character" -- "(`-`, `+`, or `*`) or (b) they are ordered list numbers with the same\ndelimiter (either `.` or `)`)." +- "Two list items are [of the same type](@)\nif they begin with a [list marker] of the same type.\nTwo list markers are of the\nsame type if (a) they are bullet list markers using the same character\n(`-`, `+`, or `*`" +- ") or (b) they are ordered list numbers with the same\ndelimiter (either `.` or `)`)." - "A list is an [ordered list](@)\nif its constituent list items begin with\n[ordered list markers], and a\n[bullet list](@) if its constituent list\nitems begin with [bullet list markers]." - "The [start number](@)\nof an [ordered list] is determined by the list number of\nits initial list item. The numbers of subsequent list items are\ndisregarded." - "A list is [loose](@) if any of its constituent\nlist items are separated by blank lines, or if any of its constituent\nlist items directly contain two block-level elements with a blank line\nbetween them. Otherwise a list is [tight](@)." @@ -617,9 +637,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n`hi`lo`\n.\n

    hilo`

    \n````````````````````````````````\n\n`hi` is parsed as code, leaving the backtick at the end as a literal\nbacktick." - "## Code spans\n\nA [backtick string](@)\nis a string of one or more backtick characters (`` ` ``) that is neither\npreceded nor followed by a backtick." - "A [code span](@) begins with a backtick string and ends with\na backtick string of equal length. The contents of the code span are\nthe characters between these two backtick strings, normalized in the\nfollowing ways:" -- "- First, [line endings] are converted to [spaces].\n- If the resulting string both begins *and* ends with a [space]\n character, but does not consist entirely of [space]\n characters, a single [space] character is removed from the\n front and back." -- "This allows you to include code that begins\n or ends with backtick characters, which must be separated by\n whitespace from the opening or closing backtick strings.\n\nThis is a simple code span:" -- "```````````````````````````````` example\n`foo`\n.\n

    foo

    \n````````````````````````````````\n\n\nHere two backticks are used, because the code contains a backtick.\nThis example also illustrates stripping of a single leading and\ntrailing space:" +- "- First, [line endings] are converted to [spaces]." +- "- If the resulting string both begins *and* ends with a [space]\n character, but does not consist entirely of [space]\n characters, a single [space] character is removed from the\n front and back. This allows you to include code that begins" +- "or ends with backtick characters, which must be separated by\n whitespace from the opening or closing backtick strings.\n\nThis is a simple code span:\n\n```````````````````````````````` example\n`foo`\n.\n

    foo

    \n````````````````````````````````" +- "Here two backticks are used, because the code contains a backtick.\nThis example also illustrates stripping of a single leading and\ntrailing space:" - "```````````````````````````````` example\n`` foo ` bar ``\n.\n

    foo ` bar

    \n````````````````````````````````\n\n\nThis example shows the motivation for stripping leading and trailing\nspaces:" - "```````````````````````````````` example\n` `` `\n.\n

    ``

    \n````````````````````````````````\n\nNote that only *one* space is stripped:" - "```````````````````````````````` example\n` `` `\n.\n

    ``

    \n````````````````````````````````\n\nThe stripping only happens if the space is on both\nsides of the string:" @@ -651,40 +672,49 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` markdown\ninternal emphasis: foo*bar*baz\nno emphasis: foo_bar_baz\n```\n\nThe rules given below capture all of these patterns, while allowing\nfor efficient parsing strategies that do not backtrack." - "First, some definitions. A [delimiter run](@) is either\na sequence of one or more `*` characters that is not preceded or\nfollowed by a non-backslash-escaped `*` character, or a sequence\nof one or more `_` characters that is not preceded or followed by" - "a non-backslash-escaped `_` character." -- "A [left-flanking delimiter run](@) is\na [delimiter run] that is (1) not followed by [Unicode whitespace],\nand either (2a) not followed by a [Unicode punctuation character], or\n(2b) followed by a [Unicode punctuation character] and" -- "preceded by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace." -- "A [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and" -- "followed by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs." -- " - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n - Both left and right-flanking:\n\n ```" -- " abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```" +- "A [left-flanking delimiter run](@) is\na [delimiter run] that is (1) not followed by [Unicode whitespace],\nand either (2a) not followed by a [Unicode punctuation character], or\n(2b) followed by a [Unicode punctuation character] and\npreceded by [" +- "Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace." +- "A [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [" +- "Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs." +- " - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```" +- " - Both left and right-flanking:\n\n ```\n abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```" - "(The idea of distinguishing left-flanking and right-flanking\ndelimiter runs based on the character before and the character\nafter comes from Roopesh Chander's" -- "[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags)" -- ".\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\" and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:" -- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]" -- " and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character]." -- "3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run]." -- "4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]" -- " followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run]." -- "6. A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]" -- " preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run]." -- "8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]" -- "followed by a [Unicode punctuation character]." -- "9. Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. The" -- " opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters" -- "must not be a multiple of 3 unless both lengths are\n multiples of 3." -- "10. Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The" -- " opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing" +- "[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\"" +- " and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:" +- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run]." +- "2." +- "A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [" +- "Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run]." +- "4." +- "A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [" +- "Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run]." +- "6." +- "A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [" +- "Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run]." +- "8." +- "A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [" +- "Unicode punctuation character]." +- "9." +- "Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate" +- "[delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are" +- multiples of 3. +- "10." +- "Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The" +- "opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing" - "delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3.\n\n11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped." - "12. A literal `_` character cannot occur at the beginning or end of\n `_`-delimited emphasis or `__`-delimited strong emphasis, unless it\n is backslash-escaped." - "Where rules 1--12 above are compatible with multiple parsings,\nthe following principles resolve ambiguity:" -- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always" -- "preferred to `...`." -- "15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example," -- " `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`." -- "16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example," -- " `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`." -- "17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins." -- "Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`.\n\nThese rules can be illustrated through a series of examples.\n\nRule 1:" +- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`." +- "14. An interpretation `...` is always\n preferred to `...`." +- "15." +- "When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_`" +- " rather\n than `*foo bar* baz`." +- "16." +- "When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`" +- "rather than `foo **bar baz`." +- "17." +- "Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example," +- "`*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`.\n\nThese rules can be illustrated through a series of examples.\n\nRule 1:" - "```````````````````````````````` example\n*foo bar*\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:" - "```````````````````````````````` example\na * foo bar*\n.\n

    a * foo bar*

    \n````````````````````````````````" - "This is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:" @@ -805,20 +835,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "A link contains [link text] (the visible text), a [link destination]\n(the URI that is the link destination), and optionally a [link title].\nThere are two basic kinds of links in Markdown. In [inline links] the" - "destination and title are given immediately after the link text. In\n[reference links] the destination and title are defined elsewhere in\nthe document." - "A [link text](@) consists of a sequence of zero or more\ninline elements enclosed by square brackets (`[` and `]`). The\nfollowing rules apply:" -- "- Links may not contain other links, at any level of nesting. If\n multiple otherwise valid link definitions appear nested inside each\n other, the inner-most definition is used.\n\n- Brackets are allowed in the [link text] only if (a) they" -- " are backslash-escaped or (b) they appear as a matched pair of brackets,\n with an open bracket `[`, a sequence of zero or more inlines, and\n a close bracket `]`." +- "- Links may not contain other links, at any level of nesting. If\n multiple otherwise valid link definitions appear nested inside each\n other, the inner-most definition is used." +- "- Brackets are allowed in the [link text] only if (a) they\n are backslash-escaped or (b) they appear as a matched pair of brackets,\n with an open bracket `[`, a sequence of zero or more inlines, and\n a close bracket `]`." - "- Backtick [code spans], [autolinks], and raw [HTML tags] bind more tightly\n than the brackets in link text. Thus, for example,\n `` [foo`]` `` could not be a link text, since the second `]`\n is part of a code span." - "- The brackets in link text bind more tightly than markers for\n [emphasis and strong emphasis]. Thus, for example, `*[foo*](url)` is a link.\n\nA [link destination](@) consists of either" -- "- a sequence of zero or more characters between an opening `<` and a\n closing `>` that contains no line endings or unescaped\n `<` or `>` characters, or\n\n- a nonempty sequence of characters that does not start with `<`," -- " does not include [ASCII control characters][ASCII control character]\n or [space] character, and includes parentheses only if (a) they are\n backslash-escaped or (b) they are part of a balanced pair of\n unescaped parentheses." -- "(Implementations may impose limits on parentheses nesting to\n avoid performance issues, but at least three levels of nesting\n should be supported.)\n\nA [link title](@) consists of either" -- "- a sequence of zero or more characters between straight double-quote\n characters (`\"`), including a `\"` character only if it is\n backslash-escaped, or\n\n- a sequence of zero or more characters between straight single-quote" -- " characters (`'`), including a `'` character only if it is\n backslash-escaped, or\n\n- a sequence of zero or more characters between matching parentheses\n (`(...)`), including a `(` or `)` character only if it is\n backslash-escaped." +- "- a sequence of zero or more characters between an opening `<` and a\n closing `>` that contains no line endings or unescaped\n `<` or `>` characters, or" +- "-" +- "a nonempty sequence of characters that does not start with `<`,\n does not include [ASCII control characters][ASCII control character]\n or [space] character, and includes parentheses only if (a) they are" +- "backslash-escaped or (b) they are part of a balanced pair of\n unescaped parentheses.\n (Implementations may impose limits on parentheses nesting to\n avoid performance issues, but at least three levels of nesting\n should be supported.)" +- "A [link title](@) consists of either" +- "- a sequence of zero or more characters between straight double-quote\n characters (`\"`), including a `\"` character only if it is\n backslash-escaped, or" +- "- a sequence of zero or more characters between straight single-quote\n characters (`'`), including a `'` character only if it is\n backslash-escaped, or" +- "- a sequence of zero or more characters between matching parentheses\n (`(...)`), including a `(` or `)` character only if it is\n backslash-escaped." - "Although [link titles] may span multiple lines, they may not contain\na [blank line]." - "An [inline link](@) consists of a [link text] followed immediately\nby a left parenthesis `(`, an optional [link destination], an optional\n[link title], and a right parenthesis `)`.\nThese four components may be separated by spaces, tabs, and up to one line" - "ending.\nIf both [link destination] and [link title] are present, they *must* be\nseparated by spaces, tabs, and up to one line ending." -- "The link's text consists of the inlines contained\nin the [link text] (excluding the enclosing square brackets).\nThe link's URI consists of the link destination, excluding enclosing\n`<...>` if present, with backslash-escapes in effect as described\nabove." -- "The link's title consists of the link title, excluding its\nenclosing delimiters, with backslash-escapes in effect as described\nabove.\n\nHere is a simple inline link:" +- "The link's text consists of the inlines contained\nin the [link text] (excluding the enclosing square brackets).\nThe link's URI consists of the link destination, excluding enclosing\n`<...>` if present, with backslash-escapes in effect as described" +- "above. The link's title consists of the link title, excluding its\nenclosing delimiters, with backslash-escapes in effect as described\nabove.\n\nHere is a simple inline link:" - "```````````````````````````````` example\n[link](/uri \"title\")\n.\n

    link

    \n````````````````````````````````\n\n\nThe title, the link text and even \nthe destination may be omitted:" - "```````````````````````````````` example\n[link](/uri)\n.\n

    link

    \n````````````````````````````````\n\n```````````````````````````````` example\n[](./target.md)\n.\n

    \n````````````````````````````````" - "```````````````````````````````` example\n[link]()\n.\n

    link

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n[link](<>)\n.\n

    link

    \n````````````````````````````````" @@ -835,12 +868,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n[link](foo\\(and\\(bar\\))\n.\n

    link

    \n````````````````````````````````" - "```````````````````````````````` example\n[link]()\n.\n

    link

    \n````````````````````````````````\n\n\nParentheses and other symbols can also be escaped, as usual\nin Markdown:" - "```````````````````````````````` example\n[link](foo\\)\\:)\n.\n

    link

    \n````````````````````````````````\n\n\nA link can contain fragment identifiers and queries:" -- "```````````````````````````````` example\n[link](#fragment)\n\n[link](https://example.com#fragment)\n\n[link](https://example.com?foo=3#frag)\n.\n

    link

    \n

    link

    " -- "

    link

    \n````````````````````````````````\n\n\nNote that a backslash before a non-escapable character is\njust a backslash:" -- "```````````````````````````````` example\n[link](foo\\bar)\n.\n

    link

    \n````````````````````````````````" -- "URL-escaping should be left alone inside the destination, as all\nURL-escaped characters are also valid URL characters. Entity and\nnumerical character references in the destination will be parsed\ninto the corresponding Unicode code points, as usual." -- "These may\nbe optionally URL-escaped when written as HTML, but this spec\ndoes not enforce any particular policy for rendering URLs in\nHTML or other formats. Renderers may make different decisions\nabout how to escape or normalize URLs in the output." -- "```````````````````````````````` example\n[link](foo%20bä)\n.\n

    link

    \n````````````````````````````````" +- "```````````````````````````````` example" +- "[link](#fragment)\n\n[link](https://example.com#fragment)\n\n[link](https://example.com?foo=3#frag)\n.\n

    link

    \n

    link

    \n

    link

    " +- "````````````````````````````````\n\n\nNote that a backslash before a non-escapable character is\njust a backslash:\n\n```````````````````````````````` example\n[link](foo\\bar)\n.\n

    link

    \n````````````````````````````````" +- "URL-escaping should be left alone inside the destination, as all\nURL-escaped characters are also valid URL characters. Entity and\nnumerical character references in the destination will be parsed" +- "into the corresponding Unicode code points, as usual. These may\nbe optionally URL-escaped when written as HTML, but this spec\ndoes not enforce any particular policy for rendering URLs in\nHTML or other formats. Renderers may make different decisions" +- "about how to escape or normalize URLs in the output.\n\n```````````````````````````````` example\n[link](foo%20bä)\n.\n

    link

    \n````````````````````````````````" - "Note that, because titles can often be parsed as destinations,\nif you try to omit the destination and keep the title, you'll\nget unexpected results:" - "```````````````````````````````` example\n[link](\"title\")\n.\n

    link

    \n````````````````````````````````\n\n\nTitles may be in single quotes, double quotes, or parentheses:" - "```````````````````````````````` example\n[link](/url \"title\")\n[link](/url 'title')\n[link](/url (title))\n.\n

    link\nlink\nlink

    " @@ -850,10 +883,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n[link](/url \"title\")\n.\n

    link

    \n````````````````````````````````\n\n\nNested balanced quotes are not allowed without escaping:" - "```````````````````````````````` example\n[link](/url \"title \"and\" title\")\n.\n

    [link](/url "title "and" title")

    \n````````````````````````````````\n\n\nBut it is easy to work around this by using a different quote type:" - "```````````````````````````````` example\n[link](/url 'title \"and\" title')\n.\n

    link

    \n````````````````````````````````" -- "(Note: `Markdown.pl` did allow double quotes inside a double-quoted\ntitle, and its test suite included a test demonstrating this.\nBut it is hard to see a good rationale for the extra complexity this" -- "brings, since there are already many ways---backslash escaping,\nentity and numeric character references, or using a different\nquote type for the enclosing title---to write titles containing\ndouble quotes. `Markdown.pl`" -- "'s handling of titles has a number\nof other strange features. For example, it allows single-quoted\ntitles in inline links, but not reference links. And, in\nreference links but not inline links, it allows a title to begin\nwith `\"` and end with `)`." -- "`Markdown.pl` 1.0.1 even allows\ntitles with no closing quotation mark, though 1.0.2b8 does not.\nIt seems preferable to adopt a simple, rational rule that works\nthe same way in inline links and link reference definitions.)" +- "(Note: `Markdown.pl` did allow double quotes inside a double-quoted\ntitle, and its test suite included a test demonstrating this.\nBut it is hard to see a good rationale for the extra complexity this\nbrings, since there are already many ways---" +- "backslash escaping,\nentity and numeric character references, or using a different\nquote type for the enclosing title---to write titles containing\ndouble quotes. `Markdown.pl`'s handling of titles has a number" +- "of other strange features. For example, it allows single-quoted\ntitles in inline links, but not reference links. And, in\nreference links but not inline links, it allows a title to begin\nwith `\"` and end with `)`. `Markdown.pl` 1.0.1 even allows" +- "titles with no closing quotation mark, though 1.0.2b8 does not.\nIt seems preferable to adopt a simple, rational rule that works\nthe same way in inline links and link reference definitions.)" - "Spaces, tabs, and up to one line ending is allowed around the destination and\ntitle:\n\n```````````````````````````````` example\n[link]( /uri\n \"title\" )\n.\n

    link

    \n````````````````````````````````" - "But it is not allowed between the link text and the\nfollowing parenthesis:\n\n```````````````````````````````` example\n[link] (/uri)\n.\n

    [link] (/uri)

    \n````````````````````````````````" - "The link text may contain balanced brackets, but not unbalanced ones,\nunless they are escaped:\n\n```````````````````````````````` example\n[link [foo [bar]]](/uri)\n.\n

    link [foo [bar]]

    \n````````````````````````````````" @@ -908,15 +941,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n[foo][ref\\[]\n\n[ref\\[]: /uri\n.\n

    foo

    \n````````````````````````````````\n\n\nNote that in this example `]` is not backslash-escaped:" - "```````````````````````````````` example\n[bar\\\\]: /uri\n\n[bar\\\\]\n.\n

    bar\\

    \n````````````````````````````````\n\n\nA [link label] must contain at least one character that is not a space, tab, or\nline ending:" - "```````````````````````````````` example\n[]\n\n[]: /uri\n.\n

    []

    \n

    []: /uri

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n[\n ]\n\n[\n ]: /uri\n.\n

    [\n]

    \n

    [\n]: /uri

    \n````````````````````````````````" -- "A [collapsed reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument, followed by the string `[]`.\nThe contents of the link label are parsed as inlines,\nwhich are used as the link's text." -- "The link's URI and title are\nprovided by the matching reference link definition. Thus,\n`[foo][]` is equivalent to `[foo][foo]`." +- "A [collapsed reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument, followed by the string `[]`.\nThe contents of the link label are parsed as inlines,\nwhich are used as the link'" +- "s text. The link's URI and title are\nprovided by the matching reference link definition. Thus,\n`[foo][]` is equivalent to `[foo][foo]`." - "```````````````````````````````` example\n[foo][]\n\n[foo]: /url \"title\"\n.\n

    foo

    \n````````````````````````````````" - "```````````````````````````````` example\n[*foo* bar][]\n\n[*foo* bar]: /url \"title\"\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThe link labels are case-insensitive:" - "```````````````````````````````` example\n[Foo][]\n\n[foo]: /url \"title\"\n.\n

    Foo

    \n````````````````````````````````" - "As with full reference links, spaces, tabs, or line endings are not\nallowed between the two sets of brackets:" - "```````````````````````````````` example\n[foo] \n[]\n\n[foo]: /url \"title\"\n.\n

    foo\n[]

    \n````````````````````````````````" -- "A [shortcut reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument and is not followed by `[]` or a link label.\nThe contents of the link label are parsed as inlines," -- "which are used as the link's text. The link's URI and title\nare provided by the matching link reference definition.\nThus, `[foo]` is equivalent to `[foo][]`." +- "A [shortcut reference link](@)\nconsists of a [link label] that [matches] a\n[link reference definition] elsewhere in the\ndocument and is not followed by `[]` or a link label.\nThe contents of the link label are parsed as inlines,\nwhich are used as the link'" +- "s text. The link's URI and title\nare provided by the matching link reference definition.\nThus, `[foo]` is equivalent to `[foo][]`." - "```````````````````````````````` example\n[foo]\n\n[foo]: /url \"title\"\n.\n

    foo

    \n````````````````````````````````" - "```````````````````````````````` example\n[*foo* bar]\n\n[*foo* bar]: /url \"title\"\n.\n

    foo bar

    \n````````````````````````````````" - "```````````````````````````````` example\n[[*foo* bar]]\n\n[*foo* bar]: /url \"title\"\n.\n

    [foo bar]

    \n````````````````````````````````" @@ -940,8 +973,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n![foo *bar*]\n\n[foo *bar*]: train.jpg \"train & tracks\"\n.\n

    \"foo

    \n````````````````````````````````" - "```````````````````````````````` example\n![foo ![bar](/url)](/url2)\n.\n

    \"foo

    \n````````````````````````````````" - "```````````````````````````````` example\n![foo [bar](/url)](/url2)\n.\n

    \"foo

    \n````````````````````````````````" -- "Though this spec is concerned with parsing, not rendering, it is\nrecommended that in rendering to HTML, only the plain string content\nof the [image description] be used. Note that in\nthe above example, the alt attribute's value is `foo bar`, not `foo" -- "[bar](/url)` or `foo bar`. Only the plain string\ncontent is rendered, without formatting." +- "Though this spec is concerned with parsing, not rendering, it is\nrecommended that in rendering to HTML, only the plain string content\nof the [image description] be used. Note that in\nthe above example, the alt attribute's value is `foo bar`, not" +- "`foo\n[bar](/url)` or `foo bar`. Only the plain string\ncontent is rendered, without formatting." - "```````````````````````````````` example\n![foo *bar*][]\n\n[foo *bar*]: train.jpg \"train & tracks\"\n.\n

    \"foo

    \n````````````````````````````````" - "```````````````````````````````` example\n![foo *bar*][foobar]\n\n[FOOBAR]: train.jpg \"train & tracks\"\n.\n

    \"foo

    \n````````````````````````````````" - "```````````````````````````````` example\n![foo](train.jpg)\n.\n

    \"foo\"

    \n````````````````````````````````" @@ -1032,8 +1065,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\nfoo \n.\n

    foo

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n### foo\\\n.\n

    foo\\

    \n````````````````````````````````" - "```````````````````````````````` example\n### foo \n.\n

    foo

    \n````````````````````````````````" - "## Soft line breaks" -- "A regular line ending (not in a code span or HTML tag) that is not\npreceded by two or more spaces or a backslash is parsed as a\n[softbreak](@). (A soft line break may be rendered in HTML either as a\n[line ending] or as a space." -- "The result will be the same in\nbrowsers. In the examples here, a [line ending] will be used.)\n\n```````````````````````````````` example\nfoo\nbaz\n.\n

    foo\nbaz

    \n````````````````````````````````" +- "A regular line ending (not in a code span or HTML tag) that is not\npreceded by two or more spaces or a backslash is parsed as a\n[softbreak](@). (A soft line break may be rendered in HTML either as a\n[line ending]" +- " or as a space. The result will be the same in\nbrowsers. In the examples here, a [line ending] will be used.)\n\n```````````````````````````````` example\nfoo\nbaz\n.\n

    foo\nbaz

    \n````````````````````````````````" - "Spaces at the end of the line and beginning of the next line are\nremoved:\n\n```````````````````````````````` example\nfoo \n baz\n.\n

    foo\nbaz

    \n````````````````````````````````" - "A conforming parser may render a soft line break in HTML either as a\nline ending or as a space.\n\nA renderer may also provide an option to render soft line breaks\nas hard line breaks." - "## Textual content\n\nAny characters not given an interpretation by the above rules will\nbe parsed as plain textual content.\n\n```````````````````````````````` example\nhello $.;'there\n.\n

    hello $.;'there

    \n````````````````````````````````" @@ -1041,20 +1074,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\nMultiple spaces\n.\n

    Multiple spaces

    \n````````````````````````````````\n\n\n" - "# Appendix: A parsing strategy\n\nIn this appendix we describe some features of the parsing strategy\nused in the CommonMark reference implementations." - "## Overview\n\nParsing has two phases:" -- "1. In the first phase, lines of input are consumed and the block\nstructure of the document---its division into paragraphs, block quotes,\nlist items, and so on---is constructed. Text is assigned to these\nblocks but not parsed." -- "Link reference definitions are parsed and a\nmap of links is constructed." +- "1." +- "In the first phase, lines of input are consumed and the block\nstructure of the document---its division into paragraphs, block quotes,\nlist items, and so on---is constructed. Text is assigned to these" +- "blocks but not parsed. Link reference definitions are parsed and a\nmap of links is constructed." - "2. In the second phase, the raw text contents of paragraphs and headings\nare parsed into sequences of Markdown inline elements (strings,\ncode spans, links, emphasis, and so on), using the map of link\nreferences constructed in phase 1." -- "At each point in processing, the document is represented as a tree of\n**blocks**. The root of the tree is a `document` block. The `document`\nmay have any number of other blocks as **children**. These children\nmay, in turn, have other blocks as children." -- "The last child of a block\nis normally considered **open**, meaning that subsequent lines of input\ncan alter its contents. (Blocks that are not open are **closed**.)\nHere, for example, is a possible document tree, with the open blocks\nmarked by arrows:" -- "``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item" -- " -> paragraph\n \"aliquando id\"\n```" +- "At each point in processing, the document is represented as a tree of\n**blocks**. The root of the tree is a `document` block. The `document`\nmay have any number of other blocks as **children**. These children" +- "may, in turn, have other blocks as children. The last child of a block\nis normally considered **open**, meaning that subsequent lines of input\ncan alter its contents. (Blocks that are not open are **closed**.)" +- "Here, for example, is a possible document tree, with the open blocks\nmarked by arrows:" +- "``` tree" +- "-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item\n -> paragraph" +- " \"aliquando id\"\n```" - "## Phase 1: block structure\n\nEach line that is processed has an effect on this tree. The line is\nanalyzed and, depending on its contents, the document may be altered\nin one or more of the following ways:" - "1. One or more open blocks may be closed.\n2. One or more new blocks may be created as children of the\n last open block.\n3. Text may be added to the last (deepest) open block remaining\n on the tree." - "Once a line has been incorporated into the tree in this way,\nit can be discarded, so input can be read in a stream.\n\nFor each line, we follow this procedure:" -- "1. First we iterate through the open blocks, starting with the\nroot document, and descending through last children down to the last\nopen block. Each block imposes a condition that the line must satisfy\nif the block is to remain open." -- "For example, a block quote requires a\n`>` character. A paragraph requires a non-blank line.\nIn this phase we may match all or just some of the open\nblocks. But we cannot close unmatched blocks yet, because we may have a\n[lazy continuation line]." -- "2. Next, after consuming the continuation markers for existing\nblocks, we look for new block starts (e.g. `>` for a block quote).\nIf we encounter a new block start, we close any blocks unmatched" -- "in step 1 before creating the new block as a child of the last\nmatched container block." +- "1." +- "First we iterate through the open blocks, starting with the\nroot document, and descending through last children down to the last\nopen block. Each block imposes a condition that the line must satisfy" +- "if the block is to remain open. For example, a block quote requires a\n`>` character. A paragraph requires a non-blank line.\nIn this phase we may match all or just some of the open\nblocks. But we cannot close unmatched blocks yet, because we may have a\n[" +- "lazy continuation line]." +- "2." +- "Next, after consuming the continuation markers for existing\nblocks, we look for new block starts (e.g. `>` for a block quote).\nIf we encounter a new block start, we close any blocks unmatched\nin step 1 before creating the new block as a child of the last" +- matched container block. - "3. Finally, we look at the remainder of the line (after block\nmarkers like `>`, list markers, and indentation have been consumed).\nThis is text that can be incorporated into the last open\nblock (a paragraph, code block, heading, or raw HTML)." - "Setext headings are formed when we see a line of a paragraph\nthat is a [setext heading underline]." - "Reference link definitions are detected when a paragraph is closed;\nthe accumulated text lines are parsed to see if they begin with\none or more reference link definitions. Any remainder becomes a\nnormal paragraph." @@ -1063,29 +1102,31 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "causes a `block_quote` block to be created as a child of our\nopen `document` block, and a `paragraph` block as a child of\nthe `block_quote`. Then the text is added to the last open\nblock, the `paragraph`:" - "``` tree\n-> document\n -> block_quote\n -> paragraph\n \"Lorem ipsum dolor\"\n```\n\nThe next line,\n\n``` markdown\nsit amet.\n```\n\nis a \"lazy continuation\" of the open `paragraph`, so it gets added\nto the paragraph's text:" - "``` tree\n-> document\n -> block_quote\n -> paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n```\n\nThe third line,\n\n``` markdown\n> - Qui *quodsi iracundia*\n```" -- "causes the `paragraph` block to be closed, and a new `list` block\nopened as a child of the `block_quote`. A `list_item` is also\nadded as a child of the `list`, and a `paragraph` as a child of\nthe `list_item`." -- "The text is then added to the new `paragraph`:" -- "``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n -> list_item\n -> paragraph\n \"Qui *quodsi iracundia*\"\n```\n\nThe fourth line," +- "causes the `paragraph` block to be closed, and a new `list` block\nopened as a child of the `block_quote`. A `list_item` is also\nadded as a child of the `list`, and a `paragraph` as a child of\nthe `list_item`. The text is then added to the new `paragraph`" +- ":\n\n``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n -> list_item\n -> paragraph\n \"Qui *quodsi iracundia*\"\n```\n\nThe fourth line," - "``` markdown\n> - aliquando id\n```" - "causes the `list_item` (and its child the `paragraph`) to be closed,\nand a new `list_item` opened up as child of the `list`. A `paragraph`\nis added as a child of the new `list_item`, to contain the text.\nWe thus obtain the final tree:" -- "``` tree\n-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item" -- " -> paragraph\n \"aliquando id\"\n```" +- "``` tree" +- "-> document\n -> block_quote\n paragraph\n \"Lorem ipsum dolor\\nsit amet.\"\n -> list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n \"Qui *quodsi iracundia*\"\n -> list_item\n -> paragraph" +- " \"aliquando id\"\n```" - "## Phase 2: inline structure\n\nOnce all of the input has been parsed, all open blocks are closed." - "We then \"walk the tree,\" visiting every node, and parse raw\nstring contents of paragraphs and headings as inlines. At this\npoint we have seen all the link reference definitions, so we can\nresolve reference links as we go." -- "``` tree\ndocument\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"\n softbreak\n str \"sit amet.\"\n list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n str \"Qui \"\n emph" +- "``` tree" +- "document\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"\n softbreak\n str \"sit amet.\"\n list (type=bullet tight=true bullet_char=-)\n list_item\n paragraph\n str \"Qui \"\n emph" - " str \"quodsi iracundia\"\n list_item\n paragraph\n str \"aliquando id\"\n```\n\nNotice how the [line ending] in the first paragraph has\nbeen parsed as a `softbreak`, and the asterisks in the first list item\nhave become an `emph`." - "### An algorithm for parsing nested emphasis and links\n\nBy far the trickiest part of inline parsing is handling emphasis,\nstrong emphasis, links, and images. This is done using the following\nalgorithm.\n\nWhen we're parsing inlines and we hit either" - "- a run of `*` or `_` characters, or\n- a `[` or `![`" - "we insert a text node with these symbols as its literal content, and we\nadd a pointer to this text node to the [delimiter stack](@).\n\nThe [delimiter stack] is a doubly linked list. Each\nelement contains a pointer to a text node, plus information about" -- "- the type of delimiter (`[`, `![`, `*`, `_`)\n- the number of delimiters,\n- whether the delimiter is \"active\" (all are active to start), and\n- whether the delimiter is a potential opener, a potential closer," -- " or both (which depends on what sort of characters precede\n and follow the delimiters).\n\nWhen we hit a `]` character, we call the *look for link or image*\nprocedure (see below)." -- "When we hit the end of the input, we call the *process emphasis*\nprocedure (see below), with `stack_bottom` = NULL." +- "- the type of delimiter (`[`, `![`, `*`, `_`)\n- the number of delimiters,\n- whether the delimiter is \"active\" (all are active to start), and" +- "- whether the delimiter is a potential opener, a potential closer,\n or both (which depends on what sort of characters precede\n and follow the delimiters)." +- "When we hit a `]` character, we call the *look for link or image*\nprocedure (see below).\n\nWhen we hit the end of the input, we call the *process emphasis*\nprocedure (see below), with `stack_bottom` = NULL." - "#### *look for link or image*\n\nStarting at the top of the delimiter stack, we look backwards\nthrough the stack for an opening `[` or `![` delimiter." -- "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`.\n\n- If we find one and it's active, then we parse ahead to see if" -- " we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image." -- "+ If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`.\n\n + If we do, then\n\n * We return a link or image node whose children are the inlines" -- " after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n * We remove the opening delimiter." -- " * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)" +- "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`." +- "- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image." +- "+ If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`." +- "+ If we do, then" +- "* We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`." +- " * We remove the opening delimiter.\n\n * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)" - "#### *process emphasis*\n\nParameter `stack_bottom` sets a lower bound to how far we\ndescend in the [delimiter stack]. If it is NULL, we can\ngo all the way to the bottom. Otherwise, we stop before\nvisiting `stack_bottom`." - "Let `current_position` point to the element on the [delimiter stack]\njust above `stack_bottom` (or the first element if `stack_bottom`\nis NULL)." - "We keep track of the `openers_bottom` for each delimiter\ntype (`*`, `_`), indexed to the length of the closing delimiter run\n(modulo 3) and to whether the closing delimiter can also be an\nopener. Initialize this to `stack_bottom`." @@ -1094,8 +1135,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n- If one is found:" - "+ Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener." - " + Remove any delimiters between the opener and closer from\n the delimiter stack." -- " + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack." -- "If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:" +- + +- "Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element" +- "of the delimiter stack. If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:" - "+ Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)" - " + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack." - "After we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack." diff --git a/tests/snapshots/text_splitter_snapshots__markdown_trim@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__markdown_trim@commonmark_spec.md.snap index 1828f1d..345adb0 100644 --- a/tests/snapshots/text_splitter_snapshots__markdown_trim@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__markdown_trim@commonmark_spec.md.snap @@ -19,8 +19,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - org/licenses/by- - "sa/4.0/)'\n..." - "# Introduction" -- "## What is" -- Markdown? +- "##" +- What is Markdown +- "?" - Markdown is a - plain text - format for @@ -48,9 +49,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - et/projects/ - markdown/syntax) - and a Perl -- "script (`" -- "Markdown.pl`)" -- for converting +- script ( +- "`Markdown.pl`" +- ) for converting - Markdown to - HTML. - In the next @@ -102,10 +103,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - readability. - As Gruber writes - ":" -- "> The overriding" +- ">" +- The overriding - design goal for -- "Markdown's" -- formatting +- "Markdown'" +- s formatting - "syntax is\n>" - to make it as - readable as @@ -119,12 +121,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "as-is, as\n>" - "plain text," - without looking -- "like it's been" -- marked up with -- "tags\n>" +- "like it'" +- s been marked up +- "with tags\n>" - or formatting - "instructions.\n>" -- "() @@ -143,7 +146,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - AsciiDoc from - the AsciiDoc - "manual:" -- "```\n1." +- "```" +- "1." - "List item one.\n+" - List item one - continued with a @@ -185,7 +189,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - And here is the - equivalent in - "Markdown:" -- "```\n1." +- "```" +- "1." - List item one. - List item one - continued with a @@ -237,13 +242,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "source, not just" - in the processed - document. -- "## Why is a spec" +- "##" +- Why is a spec - needed? -- "John Gruber's [" +- "John Gruber's" +- "[" - canonical - description of - "Markdown's" -- "syntax](https://" +- syntax +- "](https://" - daringfireball.n - et/projects/ - markdown/syntax) @@ -273,8 +281,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - think that - "they, too, must" - be indented four -- "spaces, but `" -- "Markdown.pl`" +- "spaces, but" +- "`Markdown.pl`" - does - not require that - "." @@ -288,7 +296,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - surprises for - users in real - documents. (See -- "[this comment by" +- "[" +- this comment by - "John\n Gruber" - "](https://" - web.archive.org/ @@ -326,8 +335,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - while others do - not). - (John Gruber has -- "also spoken [in" -- favor of +- also spoken +- "[" +- in favor of - requiring the - "blank\n lines" - "](https://" @@ -512,7 +522,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Can list items - be empty? - "``` markdown" -- " * a\n *" +- "* a\n *" - "* b\n ```" - "13." - Can link @@ -540,13 +550,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - In the absence - "of a spec, early" - implementers -- "consulted `" -- "Markdown.pl`" +- consulted +- "`Markdown.pl`" - to resolve these - ambiguities. -- "But `Markdown.pl" -- "` was quite" -- "buggy, and" +- But +- "`Markdown.pl`" +- "was quite buggy," +- and - gave manifestly - bad results in - "many cases, so" @@ -579,12 +590,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "worse, because" - nothing in - Markdown counts -- "as a \"syntax" -- "error,\" the" -- divergence often -- "isn't discovered" +- "as a \"" +- "syntax error,\"" +- the divergence +- "often isn'" +- t discovered - right away. -- "## About this" +- "##" +- About this - document - This document - attempts to @@ -602,8 +615,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - conformance - tests. An - accompanying -- "script `" -- "spec_tests.py`" +- script +- "`spec_tests.py`" - can be used to - run the tests - against any @@ -655,9 +668,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - spec says what - counts as a link - "destination, but" -- "it doesn't" -- mandate that non -- "-ASCII" +- "it doesn'" +- t mandate that +- non-ASCII - characters in - the URL be - percent-encoded. @@ -687,20 +700,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - URLs. - This document is - generated from a -- "text file, `" -- "spec.txt`," -- written +- "text file," +- "`spec.txt`" +- ", written" - in Markdown with - a small - extension for - the side-by-side - tests. -- "The script `" -- tools/ -- "makespec.py` can" -- be used to -- "convert `" -- "spec.txt` into" +- The script +- "`tools/" +- "makespec.py`" +- can be used to +- convert +- "`spec.txt` into" - HTML or - CommonMark ( - which can then @@ -713,11 +726,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - used to - represent tabs. - "# Preliminaries" -- "## Characters" -- and lines +- "##" +- Characters and +- lines - Any sequence of -- "[characters] is" -- a valid +- "[characters]" +- is a valid - CommonMark - document. - "A [character](@)" @@ -749,22 +763,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - limited - to a certain - encoding. -- "A [line](@) is a" -- sequence of zero -- "or more [" +- "A [line](@)" +- is a sequence of +- "zero or more [" - "characters]" - other than line -- "feed (`U+000A`)" -- or carriage +- "feed (`U+000A`" +- ) or carriage - "return (`U+000D`" - "),\nfollowed by a" -- "[line ending] or" -- by the end of +- "[line ending]" +- or by the end of - file. -- "A [line ending](" -- "@) is a line" -- "feed (`U+000A`)," -- a carriage +- A +- "[line ending](@)" +- is a line feed ( +- "`U+000A`" +- "), a carriage" - "return\n(`U+000D`" - ) not followed - "by a line feed," @@ -777,28 +792,30 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "characters, or a" - line containing - "only spaces\n(" -- "`U+0020`) or" -- "tabs (`U+0009`)," -- "is called a [" -- "blank line](@)." +- "`U+0020`" +- ) or tabs ( +- "`U+0009`" +- "), is called a" +- "[blank line](@)." - The following - definitions of - character - classes will be - used in this - "spec:" -- "A [Unicode" +- A +- "[Unicode" - whitespace -- "character](@) is" -- a character in -- "the Unicode `Zs`" -- general +- "character](@)" +- is a character +- in the Unicode +- "`Zs` general" - "category, or a" -- "tab (`U+0009`)," -- "line feed (`U+" -- "000A`), form" -- "feed (`U+000C`)," -- or +- "tab (`U+0009`" +- "), line feed (" +- "`U+000A`" +- "), form feed (" +- "`U+000C`), or" - carriage return - "(`U+000D`)." - "[Unicode" @@ -808,44 +825,50 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Unicode - whitespace - "characters]." -- "A [tab](@) is `U" -- "+0009`." +- "A [tab](@) is" +- "`U+0009`." - "A [space](@) is" - "`U+0020`." -- "An [ASCII" -- control -- "character](@) is" -- a character -- "between `U+0000–" -- "1F` (both" -- "including) or `U" -- "+007F`." -- "An [ASCII" +- An +- "[ASCII control" +- "character](@)" +- is a character +- between +- "`U+0000–1F`" +- (both +- including) or +- "`U+007F`." +- An +- "[ASCII" - punctuation - "character](@)\nis" - "`!`, `\"`, `#`," -- "`$`, `%`, `&`, `" -- "'`, `(`, `)`," -- "`*`, `+`, `,`, `" -- "-`, `.`, `/`" +- "`$`, `%`, `&`," +- "`'`, `(`, `)`," +- "`*`, `+`, `,`," +- "`-`, `.`, `/`" - "(U+0021–2F)," -- "`:`, `;`, `<`, `" -- "=`, `>`, `?`," -- "`@` (U+003A–0040" -- "),\n`[`, `\\`, `]`" -- ", `^`, `_`, `` `" -- "`` (U+005B–0060)" -- ", \n`{`, `|`, `}`" -- ", or `~` (U+007B" -- –007E). -- "A [Unicode" +- "`:`, `;`, `<`," +- "`=`, `>`, `?`," +- "`@`" +- "(U+003A–0040)," +- "`[`, `\\`, `]`," +- "`^`, `_`," +- "`` ` ``" +- "(U+005B–0060)," +- "`{`, `|`, `}`" +- ", or `~`" +- (U+007B–007E). +- A +- "[Unicode" - punctuation -- "character](@) is" -- a character in -- "the Unicode `P`" +- "character](@)" +- is a character +- in the Unicode +- "`P`" - (puncuation) or -- "`S` (symbol)" -- general +- "`S`" +- (symbol) general - categories. - "## Tabs" - Tabs in lines @@ -947,8 +970,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - part of the - content. - In the following -- "case `>` is" -- followed by a +- "case `>`" +- is followed by a - "tab," - which is treated - as if it were @@ -1024,19 +1047,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*→*→*→\n.\n
    " - "````````````````" - "````````````````" -- "## Insecure" +- "##" +- Insecure - characters - For security - "reasons, the" - Unicode -- "character `U+" -- "0000` must be" -- replaced +- character +- "`U+0000`" +- must be replaced - with the - REPLACEMENT -- "CHARACTER (`U+" -- "FFFD`)." -- "## Backslash" +- CHARACTER ( +- "`U+FFFD`)." +- "##" +- Backslash - escapes - Any ASCII - punctuation @@ -1130,8 +1155,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - A backslash at - the end of the -- "line is a [hard" -- "line break]:" +- "line is a [" +- "hard line break]" +- ":" - "````````````````" - "````````````````" - example @@ -1199,8 +1225,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - including URLs - "and link titles," - "link references," -- "and [info" -- "strings] in [" +- "and [" +- "info strings] in" +- "[" - fenced code - "blocks]:" - "````````````````" @@ -1235,7 +1262,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    " - "````````````````" - "````````````````" -- "## Entity and" +- "##" +- Entity and - numeric - character - references @@ -1253,14 +1281,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - with the - following - "exceptions:" -- "- Entity and" +- "-" +- Entity and - character - references are - not recognized - in code - blocks and code - spans. -- "- Entity and" +- "-" +- Entity and - character - references - cannot stand in @@ -1276,9 +1306,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - can be used - in place of a - "literal `*`" -- "character, `*" -- ";` cannot" -- "replace\n `*`" +- "character," +- "`*`" +- cannot replace +- "`*`" - in emphasis - "delimiters," - bullet list @@ -1302,11 +1333,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "." - "[Entity" - "references](@)" -- "consist of `&` +" -- any of the valid +- "consist of `&`" +- + any of the +- valid - HTML5 entity -- "names + `;`" -- ". The\ndocument" +- "names + `;`. The" +- document - "" - "````````````````" - "````````````````" -- "[Decimal numeric" +- "[" +- Decimal numeric - character - "references](@)" - "consist of `&#`" -- + a string of 1- -- "-7 arabic digits" -- "+ `;`" -- ". A" +- + a string of 1 +- "--" +- 7 arabic digits +- "+ `;`. A" - numeric - character - reference is @@ -1355,15 +1388,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - code points will - be replaced by - the REPLACEMENT -- "CHARACTER (`U+" -- "FFFD`" +- CHARACTER ( +- "`U+FFFD`" - ). - For security - "reasons," -- "the code point `" -- "U+0000` will" -- also be replaced -- "by `U+FFFD`." +- the code point +- "`U+0000`" +- will also be +- replaced by +- "`U+FFFD`." - "````````````````" - "````````````````" - example @@ -1372,17 +1406,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    # Ӓ Ϡ �

    " - "````````````````" - "````````````````" -- "[Hexadecimal" +- "[" +- Hexadecimal - numeric - character - "references](@)" - "consist of `&#`" -- "+\neither `X` or" -- "`x` + a string" -- of 1-6 -- hexadecimal -- "digits + `;`" -- "." +- " +\neither `X` or" +- "`x`" +- + a string of 1- +- 6 hexadecimal +- "digits + `;`." - They too are - parsed as the - corresponding @@ -1430,8 +1464,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - without a - trailing - semicolon (such -- "as `©`)," -- these are not +- "as `©`" +- "), these are not" - "recognized here," - because it makes - the grammar too @@ -1470,10 +1504,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - code spans or - "code blocks," - "including\nURLs," -- "[link titles]," -- "and [fenced code" -- "block][] [info" -- "strings]:" +- "[link titles]" +- ", and [" +- fenced code +- "block][] [" +- "info strings]:" - "````````````````" - "````````````````" - example @@ -1601,7 +1636,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ;tit")

    - "````````````````" - "````````````````" -- "# Blocks and" +- "#" +- Blocks and - inlines - We can think of - a document as a @@ -1624,9 +1660,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - like - headings and - paragraphs) -- "contain [inline]" -- (@) content--- -- "text," +- contain +- "[inline](@)" +- "content---text," - "links," - "emphasized text," - "images, code" @@ -1700,9 +1736,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - affect the - inline parsing - of any other. -- "## Container" -- blocks and leaf -- blocks +- "##" +- Container blocks +- and leaf blocks - We can divide - blocks into two - "types:" @@ -1712,8 +1748,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ")," - which can - contain other -- "blocks, and [" -- "leaf blocks](#" +- "blocks, and" +- "[leaf blocks](#" - "leaf-blocks)," - which cannot. - "# Leaf blocks" @@ -1724,8 +1760,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - that make up a - Markdown - document. -- "## Thematic" -- breaks +- "##" +- Thematic breaks - A line - consisting of - optionally up to @@ -1734,8 +1770,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - followed by a - sequence of - three or more -- "matching `-`, `_" -- "`, or `*`" +- "matching `-`," +- "`_`, or `*`" - "characters, each" - followed - optionally by @@ -1920,7 +1956,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - interpreted as - the underline of - "a [setext" -- "heading], the" +- "heading]" +- ", the" - interpretation - "as a\n[" - "setext heading]" @@ -1978,8 +2015,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## ATX headings" -- "An [ATX heading]" -- (@) +- An +- "[ATX heading](@)" - consists of a - string of - "characters," @@ -1987,8 +2024,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "content, between" - an - opening sequence -- of 1--6 -- "unescaped `#`" +- of 1-- +- "6 unescaped `#`" - characters and - an optional - closing sequence @@ -2005,7 +2042,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - line. - The optional - closing sequence -- "of `#`s must be" +- "of `#`" +- s must be - preceded by - spaces or tabs - and may be @@ -2030,8 +2068,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - The heading - level is equal - "to the number\nof" -- "`#` characters" -- in the opening +- "`#`" +- characters in +- the opening - sequence. - "Simple headings:" - "````````````````" @@ -2049,8 +2088,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    foo
    " - "````````````````" - "````````````````" -- "More than six `#" -- "` characters is" +- More than six +- "`#`" +- characters is - "not a heading:" - "````````````````" - "````````````````" @@ -2101,8 +2141,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - This is not a - "heading, because" -- "the first `#` is" -- "escaped:" +- "the first `#`" +- "is escaped:" - "````````````````" - "````````````````" - example @@ -2209,12 +2249,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ".\n

    foo

    " - "````````````````" - "````````````````" -- "A sequence of `#" -- "` characters" -- with anything -- but spaces or -- tabs following -- it +- A sequence of +- "`#`" +- characters with +- anything but +- spaces or tabs +- following it - is not a closing - "sequence, but" - counts as part @@ -2295,10 +2335,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    " - "````````````````" - "````````````````" -- "## Setext" -- headings -- "A [setext" -- "heading](@)" +- "##" +- Setext headings +- A +- "[setext heading]" +- (@) - consists of one - or more - "lines of text," @@ -2338,15 +2379,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "list items], or" - "[HTML block][" - "HTML blocks]." -- "A [setext" -- heading -- "underline](@) is" -- a sequence of -- "`=` characters" -- or a sequence of -- "`-` characters," -- with no more -- than 3 +- A +- "[setext heading" +- "underline](@)" +- is a sequence of +- "`=`" +- characters or a +- "sequence of `-`" +- "characters, with" +- no more than 3 - spaces of - indentation and - any number of @@ -2358,8 +2399,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - characters are - "used in\nthe [" - setext heading -- "underline], and" -- a level 2 +- "underline]" +- ", and a level 2" - "heading if `-`" - characters are - used. @@ -2420,8 +2461,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "headings's raw" - content as - inlines. -- "The heading's" -- raw content is +- "The heading'" +- s raw content is - formed by - concatenating - the lines and @@ -2574,11 +2615,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - The setext - heading - underline cannot -- "be a [lazy" +- "be a [" +- lazy - continuation -- "line] in a list" -- item or block -- "quote:" +- "line]" +- in a list item +- "or block quote:" - "````````````````" - "````````````````" - example @@ -2615,8 +2657,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - since otherwise - the paragraph - becomes part -- "of the heading's" -- "content:" +- "of the heading'" +- "s content:" - "````````````````" - "````````````````" - example @@ -2697,10 +2739,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - If you want a -- "heading with `>" -- "foo` as its" -- "literal text," -- you can +- heading with +- "`> foo`" +- as its literal +- "text, you can" - use backslash - "escapes:" - "````````````````" @@ -2712,8 +2754,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "**Compatibility" -- "note:** Most" -- existing +- "note:**" +- Most existing - Markdown - implementations - do not allow the @@ -2723,8 +2765,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - But there is no - consensus about - how to interpret -- "``` markdown\nFoo" -- "bar\n---\nbaz\n```" +- "``` markdown" +- "Foo\nbar\n---\nbaz" +- "```" - One can find - four different - "interpretations:" @@ -2733,9 +2776,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "bar\", paragraph" - "\"baz\"" - "2. paragraph \"" -- "Foo bar\"," -- "thematic break," -- "paragraph \"baz\"" +- "Foo bar\"" +- ", thematic break" +- ", paragraph \"baz" +- "\"" - "3. paragraph \"" - "Foo bar --- baz\"" - "4. heading \"Foo" @@ -2787,8 +2831,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - that cannot - "count as a [" - setext heading -- "underline], such" -- as +- "underline]" +- ", such as" - "````````````````" - "````````````````" - example @@ -2811,19 +2855,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - baz

    - "````````````````" - "````````````````" -- "## Indented code" +- "##" +- Indented code - blocks -- "An [indented" -- "code block](@)" +- An +- "[indented code" +- "block](@)" - is composed of - "one or more\n[" - "indented chunks]" - separated by - "blank lines.\nAn" - "[indented chunk]" -- (@) is a -- sequence of non- -- "blank lines," +- (@) +- is a sequence of +- "non-blank lines," - each preceded by - four or more - spaces of @@ -2835,8 +2881,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "of the lines," - including - "trailing\n[" -- "line endings]," -- minus four +- "line endings]" +- ", minus four" - spaces of - indentation. - An indented code @@ -2882,8 +2928,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - indicating that - material belongs - "to a [list\nitem]" -- "[list items]," -- the list item +- "[list items]" +- ", the list item" - interpretation - takes precedence - ":" @@ -3054,8 +3100,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Trailing spaces - or tabs are - included in the -- "code block's" -- "content:" +- "code block'" +- "s content:" - "````````````````" - "````````````````" - example @@ -3064,17 +3110,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- "## Fenced code" +- "##" +- Fenced code - blocks -- "A [code fence](@" -- ) is a sequence +- A +- "[code fence](@)" +- is a sequence - of at least - three - consecutive - backtick -- "characters (`` `" -- "``) or\ntildes (" -- "`~`" +- characters ( +- "`` ` ``) or" +- "tildes (`~`" - ). - (Tildes and - backticks cannot @@ -3098,8 +3146,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - leading and - trailing - spaces or tabs -- "and called the [" -- "info string](@)" +- and called the +- "[info string](@)" - ". If the [" - "info string]" - comes @@ -3125,10 +3173,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - consists of all - subsequent lines - ", until" -- "a closing [code" -- "fence] of the" -- same type as the -- code block +- "a closing [" +- "code fence]" +- of the same type +- as the code +- block - began with ( - backticks or - "tildes), and" @@ -3222,10 +3271,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - not parsed - as inlines. - The first word -- "of the [info" -- "string] is" -- typically used -- to +- "of the [" +- "info string]" +- is typically +- used to - specify the - language of the - "code sample, and" @@ -3327,11 +3376,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - end of the - document - (or the -- "enclosing [block" -- "quote][block" -- "quotes] or [list" -- "item][list items" -- "]):" +- "enclosing [" +- "block quote][" +- "block quotes] or" +- "[list item][" +- "list items]):" - "````````````````" - "````````````````" - example @@ -3545,8 +3594,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - opening code - fence. - Although this -- "spec doesn't" -- mandate any +- "spec doesn'" +- t mandate any - particular - treatment of - "the info string," @@ -3636,8 +3685,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - Closing code - fences cannot -- "have [info" -- "strings]:" +- "have [" +- "info strings]:" - "````````````````" - "````````````````" - example @@ -3649,8 +3698,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## HTML blocks" -- "An [HTML block](" -- "@) is a group of" +- An +- "[HTML block](@)" +- is a group of - lines that is - treated - as raw HTML (and @@ -3658,26 +3708,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - escaped in HTML - output). - There are seven -- "kinds of [HTML" -- "block], which" -- can be defined -- by their +- "kinds of [" +- "HTML block]" +- ", which can be" +- defined by their - start and end - conditions. - The block begins - with a line that - meets a - "[start condition" -- "](@) (after up" -- to three -- optional spaces -- of indentation). +- "](@)" +- (after up to +- three optional +- spaces of +- indentation). - It ends with the - first subsequent - line that meets - a matching - "[end condition](" -- "@), or the last" +- "@)" +- ", or the last" - line of the - "document, or the" - "last line of\nthe" @@ -3689,51 +3741,56 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "block, if no" - line is - encountered that -- "meets the [end" -- "condition]. If" +- "meets the [" +- "end condition]" +- ". If" - the first line - "meets both the [" - "start condition]" - "and the [end" -- "condition], the" -- block will +- "condition]" +- ", the block will" - contain just - that line. - "1." - "**Start" - "condition:**" - line begins with -- "the string ``, or" -- the end of the -- "line.\\" +- "string `>`" +- ", or the end of" +- "the line.\\" - "**End condition:" -- "** line" -- contains an end -- "tag\n``," -- "``, ``, or `` (case" -- "-insensitive; it" +- "**" +- line contains an +- "end tag\n``" +- ", ``," +- "``, or" +- "``" +- (case- +- insensitive; it - need not match - the start tag). - "2." - "**Start" - "condition:**" - line begins with -- "the string ``." +- "**" +- line contains +- "the string `-->`" +- "." - "3." - "**Start" - "condition:**" @@ -3741,7 +3798,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the string ``." - "4." - "**Start" @@ -3751,9 +3809,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - followed by an - "ASCII letter.\\" - "**End condition:" -- "** line contains" -- "the character `>" -- "`." +- "**" +- line contains +- the character +- "`>`." - "5." - "**Start" - "condition:**" @@ -3761,7 +3820,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the string - "``" - "." - "6." @@ -3774,65 +3834,71 @@ input_file: tests/inputs/markdown/commonmark_spec.md - of the strings ( - case-insensitive - ") `address`," -- "`article`, `" -- "aside`, `base`," -- "`basefont`, `" -- "blockquote`, `" -- "body`,\n`caption`" -- ", `center`, `col" -- "`, `colgroup`, `" -- "dd`, `details`," +- "`article`," +- "`aside`, `base`," +- "`basefont`," +- "`blockquote`," +- "`body`," +- "`caption`," +- "`center`, `col`," +- "`colgroup`, `dd`" +- ", `details`," - "`dialog`,\n`dir`," -- "`div`, `dl`, `dt" -- "`, `fieldset`, `" -- "figcaption`, `" -- "figure`," +- "`div`, `dl`," +- "`dt`, `fieldset`" +- ", `figcaption`," +- "`figure`," - "`footer`, `form`" -- ", `frame`, `" -- "frameset`,\n`h1`," -- "`h2`, `h3`, `h4`" -- ", `h5`, `h6`, `" -- "head`, `header`," -- "`hr`,\n`html`," -- "`iframe`, `" -- "legend`, `li`, `" -- "link`, `main`, `" -- "menu`, `menuitem" -- "`,\n`nav`," +- ", `frame`," +- "`frameset`,\n`h1`" +- ", `h2`, `h3`," +- "`h4`, `h5`, `h6`" +- ", `head`," +- "`header`, `hr`," +- "`html`, `iframe`" +- ", `legend`, `li`" +- ", `link`, `main`" +- ", `menu`," +- "`menuitem`," +- "`nav`," - "`noframes`, `ol`" -- ", `optgroup`, `" -- "option`, `p`, `" -- "param`,\n`search`" -- ", `section`, `" -- "summary`, `table" -- "`, `tbody`, `td`" -- ",\n`tfoot`, `th`," -- "`thead`, `title`" -- ", `tr`, `track`," -- "`ul`" +- ", `optgroup`," +- "`option`, `p`," +- "`param`," +- "`search`," +- "`section`," +- "`summary`," +- "`table`, `tbody`" +- ", `td`,\n`tfoot`," +- "`th`, `thead`," +- "`title`, `tr`," +- "`track`, `ul`" - ", followed" - "by a space, a" - "tab, the end of" - "the line, the" -- "string `>`" -- ", or\nthe string" -- "`/>`.\\" +- "string `>`, or" +- "the string `/>`." +- "\\" - "**End condition:" -- "** line is" -- "followed by a [" -- "blank line]." +- "**" +- line is followed +- "by a [blank line" +- "]." - "7." - "**Start" - "condition:**" - line begins with -- "a complete [open" -- "tag]\n(with any [" -- "tag name] other" -- "than `pre`, `" -- "script`,\n`style`" -- ", or `textarea`)" -- "or a complete [" -- "closing tag]," +- "a complete [" +- "open tag]" +- "(with any [" +- "tag name]" +- "other than `pre`" +- ", `script`," +- "`style`, or" +- "`textarea`" +- ) or a complete +- "[closing tag]," - followed by zero - or more spaces - "and tabs," @@ -3840,25 +3906,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - end of the line. - "\\" - "**End condition:" -- "** line is" -- "followed by a [" -- "blank line]." +- "**" +- line is followed +- "by a [blank line" +- "]." - HTML blocks - continue until - they are closed - by their - "appropriate\n[" -- "end condition]," -- or the last line -- of the document -- "or other [" -- "container\nblock" +- "end condition]" +- ", or the last" +- line of the +- document or +- other +- "[container\nblock" - "](#container-" - blocks) - "." - This means any -- HTML **within an -- "HTML\nblock**" +- HTML +- "**within an HTML" +- block** - that might - otherwise be - recognised as a @@ -3869,14 +3938,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - passed through - "as-is, without" - changing -- "the parser's" -- state. -- "For instance, `<" -- "pre>` within an" -- HTML block -- "started by `<" -- "table>` will not" -- affect +- "the parser'" +- s state. +- "For instance," +- "`
    `"
    +- within an HTML
    +- block started by
    +- "`
    `" +- will not affect - the parser state - ; as the HTML - block was @@ -3913,10 +3982,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - regular parsing - "resumes, with a" - "paragraph," -- "emphasised `" -- "world` and" -- inline and block -- HTML following. +- emphasised +- "`world`" +- and inline and +- block HTML +- following. - "All types of [" - "HTML blocks]" - except type 7 @@ -4060,9 +4130,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - The initial tag -- "doesn't even" -- need to be a -- valid +- "doesn'" +- t even need to +- be a valid - "tag, as long as" - it starts like - "one:" @@ -4136,9 +4206,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "To start an [" -- "HTML block] with" -- a tag that is * -- not* in the +- "HTML block]" +- with a tag that +- is *not* in the - list of block- - level tags in (6 - "), you must put" @@ -4192,20 +4262,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - either block- - level or inline- - "level tags.\nThe" -- "`` tag is a" -- nice example. +- "``" +- tag is a nice +- example. - We can surround - content with -- "`` tags in" -- three different -- ways. +- "``" +- tags in three +- different ways. - "In this case, we" - get a raw - "HTML block," -- "because the `<" -- "del>` tag is on" -- a line by itself -- ":" +- because the +- "``" +- tag is on a line +- "by itself:" - "````````````````" - "````````````````" - example @@ -4218,9 +4289,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - get a raw HTML - block that just - "includes\nthe" -- "`` tag (" -- because it ends -- with the +- "``" +- tag (because it +- ends with the - following blank - line). - So the contents @@ -4236,12 +4307,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "Finally, in this" -- "case, the `" -- "` tags are" +- "case, the" +- "``" +- tags are - "interpreted\nas [" -- "raw HTML] *" -- inside* the -- CommonMark +- "raw HTML]" +- "*inside*" +- the CommonMark - paragraph. - (Because - the tag is not @@ -4263,9 +4335,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - designed to - contain literal - "content\n(`pre`," -- "`script`, `style" -- "`, `textarea`)," -- "comments," +- "`script`," +- "`style`," +- "`textarea`" +- "), comments," - processing - "instructions," - and declarations @@ -4375,11 +4448,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - end at the - end of the - document (or the -- "enclosing [block" -- "quote][block" -- "quotes]\nor [" -- "list item][list" -- "items]):" +- "enclosing [" +- "block quote][" +- "block quotes]\nor" +- "[list item][" +- "list items]):" - "````````````````" - "````````````````" - example @@ -4545,9 +4618,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - An HTML block of -- types 1--6 can -- interrupt a -- "paragraph, and" +- types 1-- +- 6 can interrupt +- "a paragraph, and" - need not be - preceded by a - blank line. @@ -4594,19 +4667,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - This rule - differs from -- "John Gruber's" -- original +- "John Gruber'" +- s original - Markdown syntax - "specification," - "which says:" -- "> The only" +- ">" +- The only - restrictions are - that block-level - HTML elements — - "> e.g. `
    `," -- "`
    `, ``, `

    `, etc." -- — must be +- "`

    `," +- "`
    `, `

    `" +- ", etc. — must be" - "separated from\n>" - surrounding - content by blank @@ -4617,20 +4691,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - be indented with - spaces or tabs. - In some ways -- "Gruber's rule is" -- more restrictive -- than the one -- "given\nhere:" -- "- It requires" -- that an HTML -- block be +- "Gruber'" +- s rule is more +- restrictive than +- the one given +- "here:" +- "-" +- It requires that +- an HTML block be - preceded by a - blank line. -- "- It does not" +- "-" +- It does not - allow the start - tag to be - indented. -- "- It requires a" +- "-" +- It requires a - matching end tag - ", which it also" - does not allow @@ -4639,16 +4716,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Most Markdown - implementations - (including some -- "of Gruber's own)" -- do not +- "of Gruber'" +- s own) do not - respect all of - these - restrictions. - There is one - "respect, however" - ", in which" -- "Gruber's rule is" -- more liberal +- "Gruber'" +- s rule is more +- liberal - than the one - "given here," - since it allows @@ -4716,8 +4794,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - content inside - tags as text if - the open tag has -- "the attribute `" -- "markdown=1`" +- the attribute +- "`markdown=1`" - "." - The rule given - above seems a @@ -4798,51 +4876,56 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "inside `

    `"
     - "tags, but as"
     - "described\n[above"
    -- "][HTML blocks],"
    -- raw HTML blocks
    -- "starting with `<"
    -- "pre>`\n*can*"
    +- "][HTML blocks]"
    +- ", raw HTML"
    +- blocks starting
    +- "with `
    `"
    +- "*can*"
     - contain blank
     - lines.
    -- "## Link"
    -- reference
    +- "##"
    +- Link reference
     - definitions
    -- "A [link"
    -- reference
    +- A
    +- "[link reference"
     - "definition](@)"
     - "consists of a ["
    -- "link label],"
    -- optionally
    +- "link label]"
    +- ", optionally"
     - preceded by up
     - to three spaces
     - of
     - "indentation,"
     - followed
    -- "by a colon (`:`)"
    -- ", optional"
    +- "by a colon (`:`"
    +- "), optional"
     - spaces or tabs (
     - including up to
     - "one\n[line ending"
    -- "]), a [link"
    -- "destination],"
    +- "]), a ["
    +- link destination
    +- "],"
     - optional spaces
     - or tabs (
     - including up to
     - "one\n[line ending"
    -- "]), and an"
    +- "]"
    +- "), and an"
     - "optional [link"
    -- "title], which if"
    -- it is present
    -- must be
    +- "title]"
    +- ", which if it is"
    +- present must be
     - separated
    -- "from the [link"
    -- "destination] by"
    -- spaces or tabs.
    +- "from the ["
    +- link destination
    +- "]"
    +- by spaces or
    +- tabs.
     - No further
     - character may
     - occur.
    -- "A [link"
    -- reference
    +- "A ["
    +- link reference
     - "definition]"
     - does not
     - correspond to a
    @@ -4859,8 +4942,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - elsewhere in the
     - "document.  [Link"
     - reference
    -- "definitions] can"
    -- come either
    +- "definitions]"
    +- can come either
     - before or after
     - the links that
     - "use\nthem."
    @@ -5048,8 +5131,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````"
     - As noted in the
     - "section on ["
    -- "Links], matching"
    -- of labels is
    +- "Links]"
    +- ", matching of"
    +- labels is
     - case-insensitive
     - "(see [matches])."
     - "````````````````"
    @@ -5172,8 +5256,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "

    [foo]

    " - "````````````````" - "````````````````" -- "A [link" -- reference +- "A [" +- link reference - "definition]" - cannot interrupt - a paragraph. @@ -5228,8 +5312,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - foo

    - "````````````````" - "````````````````" -- "Several [link" -- reference +- "Several [" +- link reference - "definitions]" - can occur one - "after another," @@ -5256,9 +5340,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "url\">baz

    " - "````````````````" - "````````````````" -- "[Link reference" -- "definitions] can" -- occur +- "[" +- Link reference +- "definitions]" +- can occur - inside block - "containers, like" - lists and block @@ -5288,17 +5373,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - interpreted as - other - kinds of blocks -- "forms a [" -- "paragraph](@)." +- forms a +- "[paragraph](@)." - The contents of - the paragraph - are the result - of parsing the -- "paragraph's raw" -- content as +- "paragraph'" +- s raw content as - inlines. -- "The paragraph's" -- raw content +- "The paragraph'" +- s raw content - is formed by - concatenating - the lines and @@ -5401,8 +5486,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - that ends with - two or more - spaces will not -- "end with a [hard" -- "line\nbreak]:" +- "end with a [" +- "hard line\nbreak]" +- ":" - "````````````````" - "````````````````" - example @@ -5436,13 +5522,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    aaa

    " - "````````````````" - "````````````````" -- "# Container" -- blocks -- "A [container" -- "block](#" -- container-blocks -- ) is a block -- that has other +- "#" +- Container blocks +- A +- "[container block" +- "](#container-" +- blocks) +- is a block that +- has other - blocks as its - contents. - There are two @@ -5461,7 +5548,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - The general - form of the - "definition is:" -- "> If X is a" +- ">" +- If X is a - sequence of - "blocks, then the" - "result of\n>" @@ -5479,15 +5567,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - list item by - explaining - how these can be -- "*generated* from" -- their contents. +- "*generated*" +- from their +- contents. - This should - suffice - to define the - "syntax, although" - it does not give -- a recipe for * -- parsing* +- a recipe for +- "*parsing*" - these - constructions. - (A recipe is @@ -5500,17 +5589,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - parsing-strategy - ).) - "## Block quotes" -- "A [block quote" +- A +- "[block quote" - "marker](@)," - optionally - preceded by up - to three spaces - "of indentation," - consists of (a) -- "the character `>" -- "` together with" -- a following -- space of +- the character +- "`>`" +- together with a +- following space +- of - "indentation, or" - (b) a single - "character `>`" @@ -5521,47 +5612,52 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "rules define [" - "block quotes]:" - "1." -- "**Basic case." -- "** If a string" -- of lines *Ls* +- "**Basic case.**" +- If a string of +- lines *Ls* - constitute a - sequence -- "of blocks *Bs*," -- then the result -- of prepending a -- "[block quote" -- "marker] to the" -- beginning of -- each line in *Ls -- "*\n is a" +- of blocks *Bs* +- ", then the" +- result of +- "prepending a [" +- block quote +- "marker]" +- to the beginning +- of each line in +- "*Ls*\n is a" - "[block quote](#" - block-quotes) - containing *Bs*. -- 2. **Laziness. -- "** If a string" -- of lines *Ls* -- "constitute a [" -- "block\n quote" +- "2." +- "**Laziness.**" +- If a string of +- lines *Ls* +- constitute a +- "[block\n quote" - "](#block-quotes)" -- with contents * -- "Bs*, then the" +- with contents +- "*Bs*" +- ", then the" - result of - deleting - "the initial [" - block quote -- "marker] from one" -- or +- "marker]" +- from one or - more lines in - which the next - character other - than a space or - tab after the -- "[block quote" +- "[" +- block quote - "marker] is [" - paragraph - continuation -- "text] is a block" -- quote with *Bs* +- "text]" +- is a block quote +- with *Bs* - as its content. - "[Paragraph" - continuation @@ -5577,16 +5673,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "3." - "**" - Consecutiveness. -- "** A document" +- "**" +- A document - cannot contain - "two [block" -- "quotes] in a row" -- unless there is -- "a [blank line]" +- "quotes]" +- in a row unless +- "there is a [" +- "blank line]" - between them. - Nothing else -- "counts as a [" -- "block quote](#" +- counts as a +- "[block quote](#" - block-quotes). - Here is a simple - "example:" @@ -5651,7 +5749,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - The Laziness - clause allows us - "to omit the `>`" -- "before\n[" +- " before\n[" - paragraph - continuation - "text]:" @@ -5693,7 +5791,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - block quote - "markers]." - "For example, the" -- "`> ` cannot be" +- "`> `" +- cannot be - omitted in the - second line of - "``` markdown" @@ -5711,9 +5810,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "Similarly, if we" -- "omit the `> ` in" -- the second line -- of +- "omit the `> `" +- in the second +- line of - "``` markdown" - "> - foo\n> - bar" - "```" @@ -5735,9 +5834,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - For the same -- "reason, we can't" -- "omit the `> ` in" -- front of +- "reason, we can'" +- "t omit the `> `" +- in front of - subsequent lines - of an indented - or fenced code @@ -5787,10 +5886,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```markdown" - "> foo" - "> - bar\n```" -- "the `- bar` is" -- indented too far -- "to start a list," -- "and can't" +- "the `- bar`" +- is indented too +- far to start a +- "list, and can't" - be an indented - code block - because indented @@ -5798,7 +5897,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - cannot - interrupt - "paragraphs, so" -- "it is [paragraph" +- "it is [" +- paragraph - continuation - "text]." - A block quote @@ -5851,11 +5951,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "implementations," - including John - "Gruber's" -- "original `" -- "Markdown.pl`," -- will parse this -- example as a -- single block +- original +- "`Markdown.pl`" +- ", will parse" +- this example as +- a single block - quote - with two - paragraphs. @@ -5966,8 +6066,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the Laziness - rule that any - number -- "of initial `>`s" -- may be omitted +- "of initial `>`" +- s may be omitted - on a - continuation - line of a @@ -6006,10 +6106,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - block in a block - "quote," - remember that -- "the [block quote" +- "the [" +- block quote - "marker] includes" -- "both the `>` and" -- a following +- "both the `>`" +- and a following - space of - indentation. So - "*five spaces*" @@ -6030,25 +6131,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "## List items" -- "A [list marker](" -- "@) is a\n[" +- A +- "[list marker](@)" +- " is a\n[" - bullet list - "marker] or an [" - ordered list - "marker]." -- "A [bullet list" +- A +- "[bullet list" - "marker](@)\nis a" - "`-`, `+`, or `*`" - character. -- "An [ordered list" +- An +- "[ordered list" - "marker](@)" - is a sequence of -- 1--9 arabic -- "digits (`0-9`)," -- followed by +- 1-- +- 9 arabic digits +- "(`0-9`" +- "), followed by" - "either a\n`.`" -- "character or a `" -- ")`" +- character or a +- "`)`" - character. - (The reason for - the length @@ -6063,9 +6168,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "rules define [" - "list items]:" - "1." -- "**Basic case." -- "** If a" -- sequence of +- "**Basic case.**" +- If a sequence of - lines *Ls* - constitute a - sequence of @@ -6073,26 +6177,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - starting with a - character other - than a space or -- "tab, and *M*" -- is +- "tab, and *M* is" - a list marker of - width *W* - followed by 1 ≤ -- "*N* ≤ 4 spaces" -- "of indentation," +- "*N*" +- ≤ 4 spaces of +- "indentation," - then the result -- of prepending *M -- "* and the" +- of prepending +- "*M*" +- and the - following spaces - to the first - "line\n of *Ls*" - ", and indenting" - subsequent lines -- of *Ls* by *W + -- "N* spaces, is a" -- list item with * -- Bs* as its -- contents. +- of *Ls* by +- "*W + N*" +- "spaces, is a" +- list item with +- "*Bs*" +- as its contents. - The type of the - list item - (bullet or @@ -6117,7 +6223,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - starts on a line - that would - otherwise count -- "as [paragraph" +- "as [" +- paragraph - continuation - "text]---then (a)" - the lines *Ls* @@ -6136,8 +6243,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - that line is not - a list item. - "For example, let" -- "*Ls* be the" -- lines +- "*Ls*" +- be the lines - "````````````````" - "````````````````" - example @@ -6157,9 +6264,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````" - "````````````````" -- And let *M* be -- "the marker `1.`," -- and *N* +- And let *M* +- be the marker +- "`1.`, and *N*" - "= 2." - "Then rule #1" - says @@ -6321,8 +6428,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - occurs in the - same column as - the list marker -- "`1.`" -- "," +- "`1.`," - but is actually - contained in the - "list item," @@ -6341,8 +6447,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - occurs far to - the right of the - initial text of -- "the list item, `" -- "one`, but" +- "the list item," +- "`one`, but" - it is not - considered part - of the list item @@ -6512,18 +6618,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - one space of - "indentation," - then the result -- of prepending *M -- "* and the" +- of prepending +- "*M* and the" - following space - to the first -- "line of *Ls*," -- and indenting +- line of *Ls* +- ", and indenting" - subsequent lines -- of *Ls* by *W + -- "1* spaces, is a" -- list item with * -- Bs* as its -- contents. +- of *Ls* by +- "*W + 1*" +- "spaces, is a" +- list item with +- "*Bs*" +- as its contents. - If a line is - "empty, then it" - need not be @@ -6590,8 +6697,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "then by rule #2," - the contents - must be preceded -- by *one* space -- of indentation +- by *one* +- space of +- indentation - after the list - "marker:" - "````````````````" @@ -6735,30 +6843,31 @@ input_file: tests/inputs/markdown/commonmark_spec.md - If a sequence of - lines *Ls* - starting with a -- "single [blank" -- "line] constitute" -- a (possibly -- empty) +- "single [" +- "blank line]" +- constitute a ( +- possibly empty) - sequence of - "blocks *Bs*, and" -- "*M* is a list" -- marker of width -- "*W*" -- "," +- "*M*" +- is a list marker +- "of width *W*," - then the result -- of prepending *M -- "* to the first" -- "line of *Ls*," -- and +- of prepending +- "*M*" +- to the first +- line of *Ls* +- ", and" - preceding - subsequent lines -- of *Ls* by *W + -- 1* spaces of +- of *Ls* by +- "*W + 1*" +- spaces of - "indentation, is" - a -- list item with * -- Bs* as its -- contents. +- list item with +- "*Bs*" +- as its contents. - If a line is - "empty, then it" - need not be @@ -6807,8 +6916,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - spaces - following the - list marker -- "doesn't change" -- the required +- "doesn'" +- t change the +- required - "indentation:" - "````````````````" - "````````````````" @@ -7016,14 +7126,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    " - "````````````````" - "````````````````" -- 5. **Laziness. -- "** If a string" -- of lines *Ls* -- "constitute a [" -- "list\n item" +- "5." +- "**Laziness.**" +- If a string of +- lines *Ls* +- constitute a +- "[list\n item" - "](#list-items)" -- with contents * -- "Bs*, then the" +- with contents +- "*Bs*" +- ", then the" - result of - deleting - some or all of @@ -7036,7 +7148,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - space or tab - after the - indentation is -- "[paragraph" +- "[" +- paragraph - continuation - "text] is a" - list item with @@ -7133,9 +7246,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - not counted as a - list item by - "rules\n #1--" -- "5 counts as a [" -- "list item](#list" -- "-items)." +- 5 counts as a +- "[list item](#" +- list-items). - The rules for - sublists follow - from the general @@ -7260,13 +7373,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - "### Motivation" -- "John Gruber's" -- Markdown spec +- "John Gruber'" +- s Markdown spec - says the - following about - "list items:" - "1." -- "\"List markers" +- "\"" +- List markers - typically start - at the left - "margin, but may" @@ -7279,16 +7393,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - spaces or a tab. - "\"" - "2." -- "\"To make lists" +- "\"" +- To make lists - "look nice, you" - can wrap items - with hanging - indents.... -- "But if you don't" -- "want to, you" +- "But if you don'" +- "t want to, you" - "don't have to.\"" - "3." -- "\"List items may" +- "\"" +- List items may - consist of - multiple - paragraphs. @@ -7299,17 +7415,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - either 4 spaces - "or one\n tab.\"" - "4." -- "\"It looks nice" -- if you indent -- every line of -- the subsequent +- "\"" +- It looks nice if +- you indent every +- line of the +- subsequent - "paragraphs," - "but here again," - Markdown will - allow you to be - "lazy.\"" - "5." -- "\"To put a" +- "\"" +- To put a - blockquote - within a list - "item, the" @@ -7317,7 +7435,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - delimiters need - "to be indented.\"" - "6." -- "\"To put a code" +- "\"" +- To put a code - block within a - "list item, the" - code block needs @@ -7378,15 +7497,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "and principled," - and if the - reference -- "implementation `" -- "Markdown.pl` had" -- "followed it, it" -- probably would -- have +- implementation +- "`Markdown.pl`" +- "had followed it," +- it probably +- would have - become the - standard. -- "However, `" -- "Markdown.pl`" +- "However," +- "`Markdown.pl`" - allowed - paragraphs and - sublists to @@ -7422,7 +7541,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "python-Markdown," - "for example," - stuck with -- "Gruber's syntax" +- "Gruber'" +- s syntax - description and - the four-space - "rule, while" @@ -7431,8 +7551,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "marked, PHP" - "Markdown, and" - "others\nfollowed" -- "`Markdown.pl`'s" -- behavior more +- "`Markdown.pl`'" +- s behavior more - closely.) - "Unfortunately," - given the @@ -7457,8 +7577,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - four-space rule - or - the more -- "forgiving `" -- "Markdown.pl`" +- forgiving +- "`Markdown.pl`" - "behavior," - provided they - are laid out @@ -7520,7 +7640,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - with an - intervening - "paragraph," -- "``` html\n
      " +- "``` html" +- "
        " - "
      • foo
      • " - "
      \n

      bar

      " - "
        " @@ -7531,8 +7652,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "demands, rather" - than a single - "list," -- "``` html\n
          " -- "
        • \n

          foo

          " +- "``` html" +- "
            \n
          • " +- "

            foo

            " - "

            bar

            \n
              " - "
            • baz
            • " - "
            \n
          • " @@ -7561,8 +7683,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the initial list - "marker, allows" - text that is -- indented *less -- than* the +- indented +- "*less than* the" - original list - marker to be - included in the @@ -7577,8 +7699,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "item, with `two`" - a continuation - "paragraph:" -- "``` html\n
              " -- "
            • \n

              one

              " +- "``` html" +- "
                \n
              • " +- "

                one

                " - "

                two

                \n
              • " - "
              \n```" - and similarly @@ -7632,7 +7755,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Arguably this - text does read - like a list item -- "with `bar` as a" +- "with `bar`" +- as a - "subparagraph," - which may count - in favor of the @@ -7662,18 +7786,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - parse this text - "as expected," - since the code -- "block's" -- indentation is +- "block'" +- s indentation is - measured - from the -- "beginning of `" -- "foo`." +- beginning of +- "`foo`." - The one case - that needs - special - treatment is a -- list item that * -- starts* +- list item that +- "*starts*" - with indented - code. - How much @@ -7682,8 +7806,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "case, since" - "we don't have a" - "\"first paragraph" -- "\" to measure" -- from? +- "\"" +- to measure from? - "Rule #2 simply" - stipulates - that in such @@ -7709,23 +7833,26 @@ input_file: tests/inputs/markdown/commonmark_spec.md - diverge in other - cases. - "## Lists" -- "A [list](@) is a" -- sequence of one -- or more -- "list items [of" -- "the same type]" +- "A [list](@)" +- is a sequence of +- one or more +- "list items [" +- of the same type +- "]" - "." - The list items - may be separated - by any number of - blank lines. - Two list items -- "are [of the same" +- are +- "[of the same" - "type](@)" - if they begin -- "with a [list" -- "marker] of the" -- same type. +- "with a [" +- "list marker]" +- of the same type +- "." - Two list markers - are of the - same type if (a) @@ -7733,30 +7860,34 @@ input_file: tests/inputs/markdown/commonmark_spec.md - list markers - using the same - "character\n(`-`," -- "`+`, or `*`) or" -- (b) they are -- ordered list +- "`+`, or `*`" +- ) or (b) they +- are ordered list - numbers with the - same - delimiter ( -- "either `.` or `)" -- "`)." -- "A list is an [" -- "ordered list](@)" +- "either `.` or" +- "`)`)." +- A list is an +- "[ordered list](@" +- ) - if its - constituent list - items begin with -- "[ordered list" +- "[" +- ordered list - "markers], and a" - "[bullet list](@)" - if its - constituent list - items begin with -- "[bullet list" +- "[" +- bullet list - "markers]." -- "The [start" -- "number](@)\nof an" -- "[ordered list]" +- The +- "[start number](@" +- ")\nof an [" +- "ordered list]" - is determined by - the list number - of @@ -7766,9 +7897,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - subsequent list - items are - disregarded. -- "A list is [loose" -- "](@) if any of" -- its constituent +- A list is +- "[loose](@)" +- if any of its +- constituent - list items are - separated by - "blank lines, or" @@ -7781,8 +7913,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - blank line - between them. - Otherwise a list -- "is [tight](@)" -- "." +- "is [tight](@)." - (The difference - in HTML output - is that @@ -7859,9 +7990,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "house is\n14." - The number of - "doors is 6.\n```" -- "Oddly, though, `" -- "Markdown.pl` *" -- does* allow a +- "Oddly, though," +- "`Markdown.pl`" +- "*does*" +- allow a - blockquote to - interrupt a - "paragraph, even" @@ -7889,7 +8021,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```" - "Second, we are" - attracted to a -- "> [principle of" +- ">" +- "[principle of" - "uniformity](@):" - ">" - if a chunk of @@ -7899,14 +8032,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - continue to have - the same meaning - when put into a -- "> container" -- block (such as a -- list item or +- ">" +- container block +- (such as a list +- item or - blockquote). - "(Indeed, the" -- "spec for [list" -- "items] and [" -- "block quotes]" +- "spec for [" +- "list items] and" +- "[block quotes]" - presupposes - this principle.) - This principle @@ -7930,8 +8064,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - may be rendered - "without `

              `" - "tags, since the" -- "list is \"tight\")" -- ",\nthen" +- "list is \"tight\"" +- "),\nthen" - "``` markdown" - I need to buy - "- new shoes" @@ -7957,7 +8091,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - allow this - outside list - items as -- "well. ([" +- well. ( +- "[" - reStructuredText - "](https://" - docutils.sourcef @@ -7981,8 +8116,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - hard-wrapped - "numerals, we" - allow only lists -- "starting with `1" -- "` to" +- starting with +- "`1` to" - interrupt - paragraphs. - "Thus," @@ -8153,8 +8288,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - than - three spaces of - indentation. -- "Here `- e` is" -- treated as a +- "Here `- e`" +- is treated as a - paragraph - continuation - "line, because it" @@ -8436,20 +8571,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "code>lo`

              " - "````````````````" - "````````````````" -- "`hi` is parsed" -- "as code, leaving" +- "`hi`" +- is parsed as +- "code, leaving" - the backtick at - the end as a - literal - backtick. - "## Code spans" -- "A [backtick" -- "string](@)" +- A +- "[backtick string" +- "](@)" - is a string of - one or more - backtick -- "characters (`` `" -- "``) that is" +- characters ( +- "`` ` ``" +- ) that is - neither - preceded nor - followed by a @@ -8471,14 +8609,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - normalized in - the - "following ways:" -- "- First, [line" -- "endings] are" -- "converted to [" -- "spaces]." -- "- If the" -- resulting string -- both begins *and -- "* ends with a [" +- "- First, [" +- "line endings]" +- are converted to +- "[spaces]." +- "-" +- If the resulting +- string both +- begins *and* +- "ends with a [" - "space]" - "character, but" - does not consist @@ -8546,8 +8685,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - code>

              - "````````````````" - "````````````````" -- Note that only * -- one* space is +- Note that only +- "*one*" +- space is - "stripped:" - "````````````````" - "````````````````" @@ -8571,10 +8711,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - code>

              - "````````````````" - "````````````````" -- "Only [spaces]," -- "and not [unicode" -- "whitespace] in" -- "general, are" +- "Only [spaces]" +- ", and not [" +- unicode +- "whitespace]" +- "in general, are" - stripped in this - "way:" - "````````````````" @@ -8639,10 +8780,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - collapse - consecutive - spaces -- "when rendering `" -- "` elements" -- ", so it is" -- recommended that +- when rendering +- "``" +- "elements, so it" +- is recommended +- that - the following - "CSS be used:" - "code{white-space" @@ -8708,7 +8850,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - not parsed as - "emphasized text," - since the second -- "`*` is part of a" +- "`*`" +- is part of a - "code\nspan:" - "````````````````" - "````````````````" @@ -8823,34 +8966,38 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

              " - "````````````````" - "````````````````" -- "## Emphasis and" +- "##" +- Emphasis and - strong emphasis -- "John Gruber's" -- "original [" -- Markdown syntax -- "description](" -- "https://" +- "John Gruber'" +- s original +- "[Markdown syntax" +- description +- "](https://" - daringfireball.n - et/projects/ - "markdown/syntax#" - "em) says:" -- "> Markdown" -- treats asterisks -- "(`*`) and" +- ">" +- Markdown treats +- "asterisks (`*`" +- ) and - "underscores (`_`" - ) as indicators - "of\n>" - emphasis. - Text wrapped - "with one `*` or" -- "`_` will be" -- wrapped with an -- "HTML\n> ``" +- "`_`" +- will be wrapped +- "with an HTML\n>" +- "``" - "tag; double `*`'" -- "s or `_`'s will" -- be wrapped with -- "an HTML ``\n> tag." +- "s or `_`'" +- s will be +- wrapped with an +- "HTML ``" +- "> tag." - This is enough - "for most users," - but these rules @@ -8864,8 +9011,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - test suite makes - it clear that - "triple `***` and" -- "`___` delimiters" -- can be used for +- "`___`" +- delimiters can +- be used for - "strong emphasis," - and most - implementations @@ -8907,8 +9055,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - restricted - intraword - "emphasis to\nthe" -- "`*` forms, to" -- avoid unwanted +- "`*`" +- "forms, to avoid" +- unwanted - emphasis in - words containing - internal @@ -8948,15 +9097,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "escaped `*`" - "character, or a" - sequence -- "of one or more `" -- "_` characters" -- that is not -- preceded or -- followed by +- of one or more +- "`_`" +- characters that +- is not preceded +- or followed by - a non-backslash- - "escaped `_`" - character. -- "A [left-flanking" +- A +- "[left-flanking" - "delimiter run](@" - ") is\na [" - "delimiter run]" @@ -8966,17 +9116,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "whitespace]," - and either (2a) - not followed by -- "a [Unicode" +- "a [" +- Unicode - punctuation - "character], or" - (2b) followed by -- "a [Unicode" +- "a [" +- Unicode - punctuation - "character] and" - "preceded by [" - Unicode - "whitespace] or a" -- "[Unicode" +- "[" +- Unicode - punctuation - "character]." - For purposes of @@ -8986,8 +9139,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the line count - as Unicode - whitespace. -- "A [right-" -- flanking +- A +- "[right-flanking" - "delimiter run](@" - ") is\na [" - "delimiter run]" @@ -8997,17 +9150,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "whitespace]," - and either (2a) - not preceded by -- "a [Unicode" +- "a [" +- Unicode - punctuation - "character], or" - (2b) preceded by -- "a [Unicode" +- "a [" +- Unicode - punctuation - "character] and" - "followed by [" - Unicode - "whitespace] or a" -- "[Unicode" +- "[" +- Unicode - punctuation - "character]." - For purposes of @@ -9057,8 +9213,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - and the - character - after comes from -- Roopesh -- "Chander's" +- "Roopesh Chander'" +- s - "[vfmd](https://" - web.archive.org/ - web/ @@ -9091,28 +9247,30 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "strong emphasis:" - "1." - "A single `*`" -- "character [can" -- "open emphasis](@" -- ) +- character +- "[can open" +- "emphasis](@)" - iff (if and only - if) it is part -- "of a [left-" -- flanking +- "of a [" +- left-flanking - "delimiter run]." - "2." - "A single `_`" -- "character [can" -- "open emphasis]" -- iff +- "character [" +- can open +- "emphasis] iff" - it is part of a -- "[left-flanking" +- "[" +- left-flanking - "delimiter run]" - and either (a) - "not part of a [" - right-flanking - "delimiter run]" - or (b) part of a -- "[right-flanking" +- "[" +- right-flanking - "delimiter run]" - "preceded by a [" - Unicode @@ -9120,79 +9278,86 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "character]." - "3." - "A single `*`" -- "character [can" -- "close emphasis](" -- "@)" +- character +- "[can close" +- "emphasis](@)" - iff it is part -- "of a [right-" -- flanking +- "of a [" +- right-flanking - "delimiter run]." - "4." - "A single `_`" -- "character [can" -- "close emphasis]" -- iff +- "character [" +- can close +- "emphasis] iff" - it is part of a -- "[right-flanking" +- "[" +- right-flanking - "delimiter run]" - and either (a) - "not part of a [" - left-flanking - "delimiter run]" - or (b) part of a -- "[left-flanking" +- "[" +- left-flanking - "delimiter run]" - "followed by a [" - Unicode - punctuation - "character]." - "5." -- "A double `**` [" -- can open strong +- "A double `**`" +- "[can open strong" - "emphasis](@)" - iff it is part -- "of a [left-" -- flanking +- "of a [" +- left-flanking - "delimiter run]." - "6." - "A double `__` [" - can open strong - "emphasis] iff" - it is part of a -- "[left-flanking" +- "[" +- left-flanking - "delimiter run]" - and either (a) - "not part of a [" - right-flanking - "delimiter run]" - or (b) part of a -- "[right-flanking" +- "[" +- right-flanking - "delimiter run]" - "preceded by a [" - Unicode - punctuation - "character]." - "7." -- "A double `**` [" -- can close strong -- "emphasis](@)" +- "A double `**`" +- "[can close" +- "strong emphasis]" +- (@) - iff it is part -- "of a [right-" -- flanking +- "of a [" +- right-flanking - "delimiter run]." - "8." - "A double `__` [" - can close strong - "emphasis] iff" - it is part of a -- "[right-flanking" +- "[" +- right-flanking - "delimiter run]" - and either (a) - "not part of a [" - left-flanking - "delimiter run]" - or (b) part of a -- "[left-flanking" +- "[" +- left-flanking - "delimiter run]" - "followed by a [" - Unicode @@ -9201,17 +9366,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "9." - Emphasis begins - with a delimiter -- "that [can open" -- "emphasis] and" -- ends +- "that [" +- can open +- "emphasis]" +- and ends - with a delimiter -- "that [can close" -- "emphasis], and" -- that uses the -- same +- "that [" +- can close +- "emphasis]" +- ", and that uses" +- the same - "character (`_`" -- "or `*`) as the" -- opening +- "or `*`" +- ) as the opening - delimiter. The - opening and - closing @@ -9241,17 +9408,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Strong emphasis - begins with a - delimiter that -- "[can open strong" -- "emphasis] and" -- ends with a +- "[" +- can open strong +- "emphasis]" +- and ends with a - delimiter that -- "[can close" -- "strong emphasis]" +- "[" +- can close strong +- "emphasis]" - ", and that uses" - the same - "character\n (" -- "`_` or `*`) as" -- the opening +- "`_` or `*`" +- ) as the opening - delimiter. The - opening and - closing @@ -9320,22 +9489,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Thus, for" - "example," - an -- "interpretation `" -- "...` is" -- always preferred -- to +- interpretation +- "`...`" +- is always +- preferred to - "`...`." - "14." - An -- "interpretation `" -- "...<" -- "/strong>`" +- interpretation +- "`..." +- "`" - is always -- "preferred to `<" -- strong>...`." +- preferred to +- "`..." +- "`." - "15." - When two - potential @@ -9353,11 +9522,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Thus, for" - "example," - "`*foo _bar* baz_" -- "` is parsed as `" -- "foo _bar baz_` rather" -- "than `*foo " -- "bar* baz`." +- "` is parsed as" +- "`foo _bar baz_` rather" +- than +- "`*foo bar*" +- "baz`." - "16." - When there are - two potential @@ -9379,9 +9549,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`**foo " - bar baz - "`" -- "rather than `<" -- strong>foo **bar -- "baz`." +- rather than +- "`foo **" +- bar baz +- "`." - "17." - Inline code - "spans, links," @@ -9401,12 +9572,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - former always - wins. - "Thus, for" -- "example, `*[foo*" -- "](bar)` is" -- "parsed as `*foo*<" -- "/a>` rather than" -- as +- "example," +- "`*[foo*](bar)`" +- is +- parsed as +- "`*" +- "foo*`" +- rather than as - "`[foo](" - "bar)`." - These rules can @@ -9425,12 +9597,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - This is not - "emphasis," - because the -- "opening `*` is" -- followed by +- "opening `*`" +- is followed by - "whitespace, and" - hence not part -- "of a [left-" -- flanking +- "of a [" +- left-flanking - "delimiter run]:" - "````````````````" - "````````````````" @@ -9443,8 +9615,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - This is not - "emphasis," - because the -- "opening `*` is" -- preceded +- "opening `*`" +- is preceded - by an - alphanumeric and - followed by @@ -9489,8 +9661,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - Intraword -- "emphasis with `*" -- "` is permitted:" +- emphasis with +- "`*`" +- "is permitted:" - "````````````````" - "````````````````" - example @@ -9519,8 +9692,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - This is not - "emphasis," - because the -- "opening `_` is" -- followed by +- "opening `_`" +- is followed by - "whitespace:" - "````````````````" - "````````````````" @@ -9533,8 +9706,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - This is not - "emphasis," - because the -- "opening `_` is" -- preceded +- "opening `_`" +- is preceded - by an - alphanumeric and - followed by @@ -9547,8 +9720,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - quot;_

              - "````````````````" - "````````````````" -- "Emphasis with `_" -- "` is not allowed" +- Emphasis with +- "`_`" +- is not allowed - "inside words:" - "````````````````" - "````````````````" @@ -9574,8 +9748,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - тся_

              - "````````````````" - "````````````````" -- "Here `_` does" -- not generate +- "Here `_`" +- does not +- generate - "emphasis," - because the - first delimiter @@ -9628,8 +9803,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - This is not - "emphasis," - because the -- "closing `*` is" -- preceded by +- "closing `*`" +- is preceded by - "whitespace:" - "````````````````" - "````````````````" @@ -9653,15 +9828,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - This is not - "emphasis," - because the -- "second `*`" -- is +- "second `*` is" - preceded by - punctuation and - followed by an - alphanumeric - (hence it is not -- "part of a [right" -- "-flanking" +- "part of a [" +- right-flanking - "delimiter run]:" - "````````````````" - "````````````````" @@ -9685,8 +9859,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - Intraword -- "emphasis with `*" -- "` is allowed:" +- emphasis with +- "`*` is allowed:" - "````````````````" - "````````````````" - example @@ -9699,8 +9873,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - This is not - "emphasis," - because the -- "closing `_` is" -- preceded by +- "closing `_`" +- is preceded by - "whitespace:" - "````````````````" - "````````````````" @@ -9713,8 +9887,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - This is not - "emphasis," - because the -- "second `_`" -- is +- "second `_` is" - preceded by - punctuation and - followed by an @@ -9738,8 +9911,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - Intraword - emphasis is -- "disallowed for `" -- "_`:" +- disallowed for +- "`_`:" - "````````````````" - "````````````````" - example @@ -9810,8 +9983,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - This is not - "strong emphasis," - because the -- "opening `**` is" -- preceded +- "opening `**`" +- is preceded - by an - alphanumeric and - followed by @@ -9829,8 +10002,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - Intraword strong -- "emphasis with `*" -- "*` is permitted:" +- emphasis with +- "`**`" +- "is permitted:" - "````````````````" - "````````````````" - example @@ -9877,8 +10051,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - This is not - "strong emphasis," - because the -- "opening `__` is" -- preceded +- "opening `__`" +- is preceded - by an - alphanumeric and - followed by @@ -9893,8 +10067,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - Intraword strong - emphasis is -- "forbidden with `" -- "__`:" +- forbidden with +- "`__`:" - "````````````````" - "````````````````" - example @@ -9968,15 +10142,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - (Nor can it be - interpreted as -- "an emphasized `*" -- "foo bar *`," -- because of +- an emphasized +- "`*foo bar *`" +- ", because of" - Rule 11.) - This is not - "strong emphasis," - because the -- "second `**`" -- is +- "second `**` is" - preceded by - punctuation and - followed by an @@ -10062,8 +10235,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - This is not - "strong emphasis," - because the -- "second `__`" -- is +- "second `__` is" - preceded by - punctuation and - followed by an @@ -10092,8 +10264,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - Intraword strong - emphasis is -- "forbidden with `" -- "__`:" +- forbidden with +- "`__`:" - "````````````````" - "````````````````" - example @@ -10244,8 +10416,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - that - can both open - and close (like -- "the `*` after `" -- "foo`)" +- "the `*` after" +- "`foo`)" - cannot form - emphasis if the - sum of the @@ -10260,8 +10432,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - both lengths are - multiples of 3. - For the same -- "reason, we don't" -- get two +- "reason, we don'" +- t get two - consecutive - emphasis - sections in this @@ -11015,10 +11187,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "## Links" - A link contains -- "[link text] (the" -- "visible text), a" -- "[link" -- "destination]" +- "[link text]" +- (the visible +- "text), a [" +- link destination +- "]" - (the URI that is - the link - "destination)," @@ -11052,7 +11225,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ). The - following rules - "apply:" -- "- Links may not" +- "-" +- Links may not - contain other - "links, at any" - level of nesting @@ -11065,26 +11239,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "other, the inner" - "-most definition" - is used. -- "- Brackets are" +- "-" +- Brackets are - "allowed in the [" -- "link text] only" -- if (a) they +- "link text]" +- only if (a) they - are backslash- - escaped or (b) - they appear as a - matched pair of - "brackets," - with an open -- "bracket `[`, a" -- sequence of zero -- "or more inlines," -- and +- "bracket `[`" +- ", a sequence of" +- zero or more +- "inlines, and" - a close bracket - "`]`." -- "- Backtick [code" -- "spans], [" -- "autolinks], and" -- "raw [HTML tags]" +- "-" +- "Backtick [" +- "code spans], [" +- "autolinks]" +- ", and raw [" +- "HTML tags]" - bind more - tightly - than the @@ -11098,41 +11275,46 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "the second `]`" - is part of a - code span. -- "- The brackets" -- in link text -- bind more -- tightly than -- "markers for\n [" +- "-" +- The brackets in +- link text bind +- more tightly +- than markers for +- "[" - emphasis and - "strong emphasis]" - "." - "Thus, for" -- "example, `*[foo*" -- "](url)` is a" -- link. -- "A [link" +- "example," +- "`*[foo*](url)`" +- is a link. +- A +- "[link" - "destination](@)" - consists of - either -- "- a sequence of" +- "-" +- a sequence of - zero or more - characters - between an -- "opening `<` and" -- "a\n closing `>`" +- "opening `<`" +- " and a\n closing" +- "`>`" - that contains no - line endings or - "unescaped\n `<`" - "or `>`" - "characters, or" -- "- a nonempty" +- "-" +- a nonempty - sequence of - characters that - does not start -- "with `<`" -- "," +- "with `<`," - does not include -- "[ASCII control" +- "[" +- ASCII control - "characters][" - ASCII control - "character]\n or" @@ -11161,101 +11343,112 @@ input_file: tests/inputs/markdown/commonmark_spec.md - nesting - should be - supported.) -- "A [link title](@" -- ) consists of +- A +- "[link title](@)" +- consists of - either -- "- a sequence of" +- "-" +- a sequence of - zero or more - characters - between straight - double-quote -- "characters (`\"`)" -- ", including a `\"" -- "` character only" +- "characters (`\"`" +- "), including a" +- "`\"`" +- character only - if it is - backslash- - "escaped, or" -- "- a sequence of" +- "-" +- a sequence of - zero or more - characters - between straight - single-quote -- "characters (`'`)" -- ", including a `'" -- "` character only" +- "characters (`'`" +- "), including a" +- "`'`" +- character only - if it is - backslash- - "escaped, or" -- "- a sequence of" +- "-" +- a sequence of - zero or more - characters - between matching - "parentheses\n (" -- "`(...)`)," -- "including a `(`" -- "or `)` character" -- only if it is +- "`(...)`" +- "), including a" +- "`(` or `)`" +- character only +- if it is - backslash- - escaped. -- "Although [link" -- "titles] may span" +- "Although [" +- "link titles]" +- may span - "multiple lines," - they may not - "contain\na [" - "blank line]." -- "An [inline link]" -- (@) consists of -- "a [link text]" +- An +- "[inline link](@)" +- "consists of a [" +- "link text]" - followed - immediately - by a left -- "parenthesis `(`," -- "an optional [" +- "parenthesis `(`" +- ", an optional [" - link destination - "], an optional\n[" -- "link title], and" -- a right -- "parenthesis `)`" -- "." +- "link title]" +- ", and a right" +- "parenthesis `)`." - These four - components may - be separated by - "spaces, tabs," - and up to one - "line\nending." -- "If both [link" -- "destination] and" -- "[link title] are" -- "present, they *" -- must* be +- "If both [" +- link destination +- "] and [" +- "link title]" +- "are present," +- they *must* be - separated by - "spaces, tabs," - and up to one - line ending. -- "The link's text" -- consists of the -- inlines +- "The link'" +- s text consists +- of the inlines - "contained\nin the" -- "[link text] (" -- excluding the +- "[link text]" +- (excluding the - enclosing square - brackets). -- "The link's URI" -- consists of the -- link destination -- ", excluding" +- "The link'" +- s URI consists +- of the link +- "destination," +- excluding - enclosing -- "`<...>` if" -- "present, with" +- "`<...>`" +- "if present, with" - backslash- - escapes in - effect as - described - above. The link -- "'s title" -- consists of the -- "link title," -- excluding its +- "'" +- s title consists +- of the link +- "title, excluding" +- its - enclosing - "delimiters, with" - backslash- @@ -11656,11 +11849,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "spaces, tabs," - and up to one - "line\nending." -- "Other [Unicode" -- "whitespace] like" -- non-breaking -- "space doesn't" -- work. +- "Other [" +- Unicode +- "whitespace]" +- like non- +- breaking space +- "doesn't work." - "````````````````" - "````````````````" - example @@ -11705,9 +11899,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - /p> - "````````````````" - "````````````````" -- "(Note: `" -- "Markdown.pl` did" -- allow double +- "(Note:" +- "`Markdown.pl`" +- did allow double - quotes inside a - double-quoted - "title, and its" @@ -11723,7 +11917,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "brings, since" - there are - already many -- ways---backslash +- ways--- +- backslash - "escaping," - entity and - numeric @@ -11733,12 +11928,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - different - quote type for - the enclosing -- title---to write -- titles +- title--- +- to write titles - containing - double quotes. -- "`Markdown.pl`'s" -- handling of +- "`Markdown.pl`'" +- s handling of - titles has a - number - of other strange @@ -11754,9 +11949,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - but not inline - "links, it allows" - a title to begin -- "with `\"` and end" -- "with `)`." -- "`Markdown.pl`" +- "with `\"`" +- "and end with `)`" +- ". `Markdown.pl`" - 1.0.1 even - allows - titles with no @@ -11943,10 +12138,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - Note that -- brackets that * -- "aren't* part of" -- links do not -- "take\nprecedence:" +- brackets that +- "*aren't*" +- part of links do +- not take +- "precedence:" - "````````````````" - "````````````````" - example @@ -11999,21 +12195,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - There are three -- "kinds of [" -- "reference link](" -- "@)s:" +- kinds of +- "[reference link]" +- "(@)s:" - "[full](#full-" - "reference-link)," - "[collapsed](#" - collapsed- - "reference-link)," -- "and [shortcut](#" +- and +- "[shortcut](#" - shortcut- - reference-link). -- "A [full" -- "reference link](" -- "@)\nconsists of a" -- "[link text]" +- A +- "[full reference" +- "link](@)" +- "consists of a [" +- "link text]" - immediately - "followed by a [" - "link label]\nthat" @@ -12022,13 +12220,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "definition]" - elsewhere in the - document. -- "A [link label](@" -- ) begins with a -- "left bracket (`[" -- "`) and ends" +- A +- "[link label](@)" +- begins with a +- left bracket ( +- "`[`) and ends" - with the first -- "right bracket (`" -- "]`) that is not" +- right bracket ( +- "`]`" +- ) that is not - backslash- - escaped. - Between these @@ -12054,8 +12254,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - characters - inside the - "square\nbrackets." -- "One label [" -- "matches](@)" +- One label +- "[matches](@)" - another just in - case their - normalized forms @@ -12066,9 +12266,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - closing brackets - ",\nperform the" - "*Unicode case" -- "fold*, strip" -- leading and -- trailing +- fold* +- ", strip leading" +- and trailing - "spaces, tabs," - and line endings - ", and collapse" @@ -12090,10 +12290,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - (It is desirable - in such cases to - emit a warning.) -- "The link's URI" -- and title are -- provided by the -- "matching [link" +- "The link'" +- s URI and title +- are provided by +- "the matching [" +- link - reference - "definition]." - Here is a simple @@ -12206,7 +12407,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - (In the examples - "above, we have" -- "two [shortcut" +- "two [" +- shortcut - "reference links]" - "instead of one [" - full reference @@ -12330,9 +12532,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - or line endings - are allowed - "between the [" -- "link text] and" -- "the\n[link label]" -- ":" +- "link text]" +- " and the\n[" +- "link label]:" - "````````````````" - "````````````````" - example @@ -12359,8 +12561,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - This is a - departure from -- "John Gruber's" -- original +- "John Gruber'" +- s original - Markdown syntax - "description," - which explicitly @@ -12372,9 +12574,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - It brings - reference links - "in line with\n[" -- "inline links]," -- which (according -- to both original +- "inline links]" +- ", which (" +- according to +- both original - Markdown and - this spec) - cannot have @@ -12413,8 +12616,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - by Gruber - himself in a - beta version of -- "`Markdown.pl`," -- but never +- "`Markdown.pl`" +- ", but never" - included - in the official - syntax @@ -12439,8 +12642,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - results.) - When there are - multiple -- "matching [link" -- reference +- "matching [" +- link reference - "definitions]," - the first is - "used:" @@ -12562,25 +12765,27 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

              [\n]: /uri

              " - "````````````````" - "````````````````" -- "A [collapsed" +- A +- "[collapsed" - "reference link](" - "@)\nconsists of a" - "[link label]" - "that [matches] a" -- "[link reference" +- "[" +- link reference - "definition]" - elsewhere in the - "document," - followed by the -- "string `[]`" -- "." +- "string `[]`." - The contents of - the link label - are parsed as - "inlines," - which are used -- "as the link's" -- "text. The link'" +- "as the link'" +- s text. +- "The link'" - s URI and title - are - provided by the @@ -12646,33 +12851,36 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo\n[]

              " - "````````````````" - "````````````````" -- "A [shortcut" +- A +- "[shortcut" - "reference link](" - "@)\nconsists of a" - "[link label]" - "that [matches] a" -- "[link reference" +- "[" +- link reference - "definition]" - elsewhere in the - document and is - not followed by -- "`[]` or a link" -- label. +- "`[]`" +- or a link label. - The contents of - the link label - are parsed as - "inlines," - which are used -- "as the link's" -- "text. The link'" +- "as the link'" +- s text. +- "The link'" - s URI and title - are provided by - the matching - link reference - definition. -- "Thus, `[foo]` is" -- "equivalent to `[" -- "foo][]`." +- "Thus, `[foo]`" +- is equivalent to +- "`[foo][]`." - "````````````````" - "````````````````" - example @@ -12830,11 +13038,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "````````````````" - In the following -- "case `[bar][baz]" -- "` is parsed as a" +- case +- "`[bar][baz]`" +- is parsed as a - "reference," -- "`[foo]` as" -- "normal text:" +- "`[foo]`" +- "as normal text:" - "````````````````" - "````````````````" - example @@ -12845,12 +13054,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - p> - "````````````````" - "````````````````" -- "Here, though, `[" -- "foo][bar]` is" -- parsed as a +- "Here, though," +- "`[foo][bar]`" +- is parsed as a - "reference, since" -- "`[bar]` is" -- "defined:" +- "`[bar]`" +- "is defined:" - "````````````````" - "````````````````" - example @@ -12863,9 +13072,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

              " - "````````````````" - "````````````````" -- "Here `[foo]` is" -- not parsed as a -- shortcut +- "Here `[foo]`" +- is not parsed as +- a shortcut - "reference," - because it - is followed by a @@ -12889,23 +13098,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the syntax for - "links, with one" - difference. -- "Instead of [link" -- "text], we have" -- an +- "Instead of [" +- "link text]" +- ", we have an" - "[image" - "description](@)" - "." - The rules for - this are the - "same as for [" -- "link text]," -- except that (a) -- an +- "link text]" +- ", except that (a" +- ) an - image - description - "starts with `![`" -- "rather than `[`," -- and +- "rather than `[`" +- ", and" - (b) an image - description may - contain links. @@ -12918,8 +13127,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "," - this is - standardly used -- "as the image's `" -- "alt` attribute." +- "as the image's" +- "`alt` attribute." - "````````````````" - "````````````````" - example @@ -12979,14 +13188,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - plain string - "content\nof the [" - image -- "description] be" -- used. +- "description]" +- be used. - Note that in - the above - "example, the alt" -- "attribute's" -- "value is `foo" -- "bar`, not" +- "attribute'" +- s value is +- "`foo bar`, not" - "`foo" - "[bar](/url)` or" - "`foo `" @@ -13255,50 +13464,51 @@ input_file: tests/inputs/markdown/commonmark_spec.md - address - as the link - label. -- "A [URI autolink]" -- (@) consists of -- "`<`, followed by" -- "an\n[absolute URI" -- "] followed by `>" -- "`" +- A +- "[URI autolink](@" +- ) consists of +- "`<`" +- ", followed by an" +- "[absolute URI]" +- "followed by `>`" - "." - It is parsed as - a link to the - "URI, with the" -- URI as the -- "link's label." -- "An [absolute URI" -- "](@)," +- "URI as the link'" +- s label. +- An +- "[absolute URI](@" +- ")," - for these - "purposes," - "consists of a [" -- "scheme] followed" -- "by a colon (`:`" -- ) +- "scheme]" +- followed by a +- "colon (`:`)" - followed by zero - or more - characters other -- "than [ASCII" -- control +- "than [" +- ASCII control - "characters][" - ASCII control - "character], [" - "space], `<`, and" -- "`>`" -- "." +- "`>`." - If the URI - includes these - "characters, they" - must be percent- - "encoded\n(e.g." -- "`%20` for a" -- space). +- "`%20`" +- for a space). - For purposes of -- "this spec, a [" -- "scheme](@) is" -- any sequence -- of 2--32 -- characters +- "this spec, a" +- "[scheme](@)" +- is any sequence +- of 2-- +- 32 characters - beginning with - an ASCII letter - and followed @@ -13307,9 +13517,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "ASCII letters," - "digits, or the" - "symbols plus\n(\"+" -- "\"), period (\".\")" -- ", or hyphen (\"-\"" -- ). +- "\"), period (\".\"" +- "), or hyphen (\"-" +- "\")." - Here are some - "valid autolinks:" - "````````````````" @@ -13460,26 +13670,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - /a>

              - "````````````````" - "````````````````" -- "An [email" -- "autolink](@)" -- "consists of `<`," -- "followed by an [" -- "email address]," +- An +- "[email autolink]" +- "(@)\nconsists of" +- "`<`" +- ", followed by an" +- "[email address]," - "followed by `>`" - ". The link'" - s label is the - "email address," -- "and the URL is `" -- "mailto:`" +- and the URL is +- "`mailto:`" - followed by the - email address. -- "An [email" -- "address](@)," +- An +- "[email address](" +- "@)," - for these - "purposes, is" - anything that - "matches\nthe" -- "[non-normative" +- "[" +- non-normative - regex from the - "HTML5\nspec" - "](https://" @@ -13600,9 +13813,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "## Raw HTML" - "Text between `<`" -- "and `>` that" -- looks like an -- HTML tag is +- "and `>`" +- that looks like +- an HTML tag is - parsed as a - raw HTML tag and - will be rendered @@ -13628,8 +13841,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "letters, digits," - "or\nhyphens (`-`" - ). -- "An [attribute](@" -- ) consists of +- An +- "[attribute](@)" +- consists of - "spaces, tabs," - and up to one - "line ending,\nan" @@ -13638,16 +13852,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "optional\n[" - attribute value - "specification]." -- "An [attribute" -- "name](@)" +- An +- "[attribute name]" +- (@) - consists of an -- "ASCII letter, `_" -- "`, or `:`," -- followed by zero -- or more ASCII +- "ASCII letter," +- "`_`, or `:`" +- ", followed by" +- zero or more +- ASCII - "letters, digits," -- "`_`, `.`, `:`," -- "or `-`" +- "`_`, `.`, `:`" +- ", or `-`" - "." - "(Note: This is" - the XML @@ -13655,33 +13871,37 @@ input_file: tests/inputs/markdown/commonmark_spec.md - restricted to - ASCII. - HTML5 is laxer.) -- "An [attribute" -- value +- An +- "[attribute value" - "specification](@" - ) - consists of - "optional spaces," - "tabs, and up to" - "one line ending," -- "a `=` character," +- "a `=`" +- "character," - "optional spaces," - "tabs, and up to" - "one line ending," - "and an [" - "attribute value]" - "." -- "An [attribute" -- "value](@)" +- An +- "[attribute value" +- "](@)" - "consists of an [" - unquoted - "attribute value]" - ",\na [" - single-quoted - "attribute value]" -- ", or a [double-" -- quoted attribute -- "value]." -- "An [unquoted" +- ", or a [" +- double-quoted +- "attribute value]" +- "." +- An +- "[unquoted" - "attribute value]" - (@) - is a nonempty @@ -13689,57 +13909,65 @@ input_file: tests/inputs/markdown/commonmark_spec.md - characters not - including spaces - ", tabs, line" -- "endings, `\"`, `'" -- "`, `=`, `<`, `>`" -- ", or `` ` ``." -- "A [single-quoted" +- "endings, `\"`," +- "`'`, `=`, `<`," +- "`>`, or `` ` ``." +- A +- "[single-quoted" - "attribute value]" - "(@)\nconsists of" -- "`'`, zero or" -- more +- "`'`" +- ", zero or more" - characters not -- "including `'`," -- "and a final `'`." -- "A [double-quoted" +- "including `'`" +- ", and a final" +- "`'`." +- A +- "[double-quoted" - "attribute value]" - "(@)\nconsists of" -- "`\"`, zero or" -- more +- "`\"`" +- ", zero or more" - characters not -- "including `\"`," -- "and a final `\"`." +- "including `\"`" +- ", and a final" +- "`\"`." - "An [open tag](@)" -- "consists of a `<" -- "` character, a [" -- "tag name]," +- consists of a +- "`<` character, a" +- "[tag name]," - "zero or more [" -- "attributes]," -- "optional spaces," -- "tabs, and up to" -- "one line ending," +- "attributes]" +- ", optional" +- "spaces, tabs," +- and up to one +- "line ending," - "an optional `/`" - "character, and a" - "`>` character." -- "A [closing tag](" -- "@) consists of" -- "the string ``." -- "An [HTML comment" -- "](@) consists of" -- "``, `" -- "`, or ``," +- "``, or" +- "``," -- "and `-->` (see" -- the +- "string `-->`" +- ", and `-->`" +- (see the - "[HTML spec](" - "https://" - html.spec.whatwg @@ -13748,7 +13976,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - markup- - declaration-open - "-state))." -- "A [processing" +- A +- "[processing" - "instruction](@)" - consists of the - "string ``" - ", and the string" - "`?>`." -- "A [declaration](" -- "@) consists of" -- "the string ``," -- and the +- "character `>`" +- ", and the" - "character `>`." -- "A [CDATA section" -- "](@) consists of" +- A +- "[CDATA section](" +- "@) consists of" - the string - "` - "````````````````" - "````````````````" -- "## Hard line" -- breaks +- "##" +- Hard line breaks - A line ending ( - not in a code - span or HTML tag @@ -14047,11 +14279,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - spaces and does - not occur at the - end of a block -- "is parsed as a [" -- "hard line break]" -- (@) (rendered -- "in HTML as a `<" -- "br />` tag):" +- is parsed as a +- "[hard line break" +- "](@) (rendered" +- in HTML as a +- "`
              ` tag):" - "````````````````" - "````````````````" - example @@ -14065,7 +14297,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "alternative, a" - backslash before - "the\n[line ending" -- "] may be used" +- "]" +- may be used - instead of two - "or more spaces:" - "````````````````" @@ -14209,8 +14442,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

              foo

              " - "````````````````" - "````````````````" -- "## Soft line" -- breaks +- "##" +- Soft line breaks - A regular line - ending (not in a - code span or @@ -14226,15 +14459,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - break may be - rendered in HTML - "either as a\n[" -- "line ending] or" -- as a space. +- "line ending]" +- or as a space. - The result will - be the same in - browsers. - In the examples -- "here, a [line" -- "ending] will be" -- used.) +- "here, a [" +- "line ending]" +- will be used.) - "````````````````" - "````````````````" - example @@ -14267,8 +14500,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - soft line breaks - as hard line - breaks. -- "## Textual" -- content +- "##" +- Textual content - Any characters - not given an - interpretation @@ -14307,7 +14540,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````" - "`.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n5. **Start condition:** line begins with the string\n``.\n\n6. **Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:** line is followed by a [blank line].\n\n" -- "7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line].\n\n" +- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n5. **Start condition:** line begins with the string\n``.\n\n" +- "6. " +- "**Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:**" +- " line is followed by a [blank line].\n\n7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line].\n\n" - "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\nblock** that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state.\n\nFor instance, `
    ` within an HTML block started by `
    ` will not affect\nthe parser state; as the HTML block was started in by start condition 6, it\nwill end at any blank line. This can be surprising:\n\n```````````````````````````````` example\n
    \n
    \n**Hello**,\n\n_world_.\n
    \n
    \n.\n
    \n
    \n**Hello**,\n

    world.\n

    \n
    \n````````````````````````````````" - "\n\nIn this case, the HTML block is terminated by the blank line — the `**Hello**`\ntext remains verbatim — and regular parsing resumes, with a paragraph,\nemphasised `world` and inline and block HTML following.\n\nAll types of [HTML blocks] except type 7 may interrupt\na paragraph. Blocks of type 7 may not interrupt a paragraph.\n(This restriction is intended to prevent unwanted interpretation\nof long tags inside a wrapped paragraph as starting HTML blocks.)\n\nSome simple examples follow. Here are some basic HTML blocks\nof type 6:\n\n```````````````````````````````` example\n\n \n \n \n
    \n hi\n
    \n\nokay.\n.\n\n \n \n \n
    \n hi\n
    \n

    okay.

    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n
    \n*foo*\n````````````````````````````````\n\n\nHere we have two HTML blocks with a Markdown paragraph between them:\n\n```````````````````````````````` example\n
    \n\n*Markdown*\n\n
    \n.\n
    \n

    Markdown

    \n
    \n````````````````````````````````\n\n\nThe tag on the first line can be partial, as long\nas it is split where there would be whitespace:\n" @@ -138,8 +140,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n```````````````````````````````` example\n> > > foo\nbar\n.\n
    \n
    \n
    \n

    foo\nbar

    \n
    \n
    \n
    \n````````````````````````````````\n\n\n```````````````````````````````` example\n>>> foo\n> bar\n>>baz\n.\n
    \n
    \n
    \n

    foo\nbar\nbaz

    \n
    \n
    \n
    \n````````````````````````````````\n\n\nWhen including an indented code block in a block quote,\nremember that the [block quote marker] includes\nboth the `>` and a following space of indentation. So *five spaces* are needed\nafter the `>`:\n" - "\n```````````````````````````````` example\n> code\n\n> not code\n.\n
    \n
    code\n
    \n
    \n
    \n

    not code

    \n
    \n````````````````````````````````\n\n\n\n" - "## List items\n\nA [list marker](@) is a\n[bullet list marker] or an [ordered list marker].\n\nA [bullet list marker](@)\nis a `-`, `+`, or `*` character.\n\nAn [ordered list marker](@)\nis a sequence of 1--9 arabic digits (`0-9`), followed by either a\n`.` character or a `)` character. (The reason for the length\nlimit is that with 10 digits we start seeing integer overflows\nin some browsers.)\n\nThe following rules define [list items]:\n\n" -- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n\n 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n" -- " the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\n" +- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n" +- "\n 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item.\n\n" - "For example, let *Ls* be the lines\n\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    \n````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:\n\n```````````````````````````````` example\n1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n````````````````````````````````" - "\n\n\nThe most important thing to notice is that the position of\nthe text after the list marker determines how much indentation\nis needed in subsequent blocks in the list item. If the list\nmarker takes up two spaces of indentation, and there are three spaces between\nthe list marker and the next character other than a space or tab, then blocks\nmust be indented five spaces in order to fall under the list\nitem.\n\nHere are some examples showing how far content must be indented to be\nput under the list item:\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • one
    • \n
    \n

    two

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n````````````````````````````````" - "\n\n\n```````````````````````````````` example\n - one\n\n two\n.\n
      \n
    • one
    • \n
    \n
     two\n
    \n````````````````````````````````\n\n\n```````````````````````````````` example\n - one\n\n two\n.\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n````````````````````````````````\n\n\nIt is tempting to think of this in terms of columns: the continuation\nblocks must be indented at least to the column of the first character other than\na space or tab after the list marker. However, that is not quite right.\nThe spaces of indentation after the list marker determine how much relative\nindentation is needed. Which column this indentation reaches will depend on\nhow the list item is embedded in other constructions, as shown by\nthis example:\n" @@ -203,25 +205,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\nFirst, some definitions. A [delimiter run](@) is either\na sequence of one or more `*` characters that is not preceded or\nfollowed by a non-backslash-escaped `*` character, or a sequence\nof one or more `_` characters that is not preceded or followed by\na non-backslash-escaped `_` character.\n\nA [left-flanking delimiter run](@) is\na [delimiter run] that is (1) not followed by [Unicode whitespace],\nand either (2a) not followed by a [Unicode punctuation character], or\n(2b) followed by a [Unicode punctuation character] and\npreceded by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n" - "\nA [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n - Both left and right-flanking:\n\n ```\n abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```\n\n" - "(The idea of distinguishing left-flanking and right-flanking\ndelimiter runs based on the character before and the character\nafter comes from Roopesh Chander's\n[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\" and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:\n\n" -- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n6. A double `__` [can open strong emphasis] iff\n" -- " it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n" +- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n" +- "6. A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n" - "9. Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are\n multiples of 3.\n\n10. Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing\n delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3.\n\n" - "11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n12. A literal `_` character cannot occur at the beginning or end of\n `_`-delimited emphasis or `__`-delimited strong emphasis, unless it\n is backslash-escaped.\n\nWhere rules 1--12 above are compatible with multiple parsings,\nthe following principles resolve ambiguity:\n\n" -- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. " -- "So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*
    foo*` rather than as\n `[foo](bar)`.\n\nThese rules can be illustrated through a series of examples.\n\nRule 1:\n\n```````````````````````````````` example\n*foo bar*\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na * foo bar*\n.\n

    a * foo bar*

    \n````````````````````````````````" -- "\n\n\nThis is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na*\"foo\"*\n.\n

    a*"foo"*

    \n````````````````````````````````\n\n\nUnicode nonbreaking spaces count as whitespace, too:\n\n```````````````````````````````` example\n* a *\n.\n

    * a *

    \n````````````````````````````````\n\n\nUnicode symbols count as punctuation, too:\n\n```````````````````````````````` example\n*$*alpha.\n\n*£*bravo.\n\n*€*charlie.\n.\n

    *$*alpha.

    \n

    *£*bravo.

    \n

    *€*charlie.

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is permitted:\n" -- "\n```````````````````````````````` example\nfoo*bar*\n.\n

    foobar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5*6*78\n.\n

    5678

    \n````````````````````````````````\n\n\nRule 2:\n\n```````````````````````````````` example\n_foo bar_\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is followed by\nwhitespace:\n\n```````````````````````````````` example\n_ foo bar_\n.\n

    _ foo bar_

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is preceded\nby an alphanumeric and followed by punctuation:\n" -- "\n```````````````````````````````` example\na_\"foo\"_\n.\n

    a_"foo"_

    \n````````````````````````````````\n\n\nEmphasis with `_` is not allowed inside words:\n\n```````````````````````````````` example\nfoo_bar_\n.\n

    foo_bar_

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5_6_78\n.\n

    5_6_78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням_стремятся_\n.\n

    пристаням_стремятся_

    \n````````````````````````````````\n\n\nHere `_` does not generate emphasis, because the first delimiter run\nis right-flanking and the second left-flanking:\n" -- "\n```````````````````````````````` example\naa_\"bb\"_cc\n.\n

    aa_"bb"_cc

    \n````````````````````````````````\n\n\nThis is emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:\n\n```````````````````````````````` example\nfoo-_(bar)_\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\nRule 3:\n\nThis is not emphasis, because the closing delimiter does\nnot match the opening delimiter:\n\n```````````````````````````````` example\n_foo*\n.\n

    _foo*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the closing `*` is preceded by\nwhitespace:\n" -- "\n```````````````````````````````` example\n*foo bar *\n.\n

    *foo bar *

    \n````````````````````````````````\n\n\nA line ending also counts as whitespace:\n\n```````````````````````````````` example\n*foo bar\n*\n.\n

    *foo bar\n*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the second `*` is\npreceded by punctuation and followed by an alphanumeric\n(hence it is not part of a [right-flanking delimiter run]:\n\n```````````````````````````````` example\n*(*foo)\n.\n

    *(*foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith this example:\n" -- "\n```````````````````````````````` example\n*(*foo*)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is allowed:\n\n```````````````````````````````` example\n*foo*bar\n.\n

    foobar

    \n````````````````````````````````\n\n\n\nRule 4:\n\nThis is not emphasis, because the closing `_` is preceded by\nwhitespace:\n\n```````````````````````````````` example\n_foo bar _\n.\n

    _foo bar _

    \n````````````````````````````````\n\n\nThis is not emphasis, because the second `_` is\npreceded by punctuation and followed by an alphanumeric:\n" -- "\n```````````````````````````````` example\n_(_foo)\n.\n

    _(_foo)

    \n````````````````````````````````\n\n\nThis is emphasis within emphasis:\n\n```````````````````````````````` example\n_(_foo_)_\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis is disallowed for `_`:\n\n```````````````````````````````` example\n_foo_bar\n.\n

    _foo_bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n_пристаням_стремятся\n.\n

    _пристаням_стремятся

    \n````````````````````````````````" -- "\n\n\n```````````````````````````````` example\n_foo_bar_baz_\n.\n

    foo_bar_baz

    \n````````````````````````````````\n\n\nThis is emphasis, even though the closing delimiter is\nboth left- and right-flanking, because it is followed by\npunctuation:\n\n```````````````````````````````` example\n_(bar)_.\n.\n

    (bar).

    \n````````````````````````````````\n\n\nRule 5:\n\n```````````````````````````````` example\n**foo bar**\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n\n```````````````````````````````` example\n** foo bar**\n.\n

    ** foo bar**

    \n````````````````````````````````" -- "\n\n\nThis is not strong emphasis, because the opening `**` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na**\"foo\"**\n.\n

    a**"foo"**

    \n````````````````````````````````\n\n\nIntraword strong emphasis with `**` is permitted:\n\n```````````````````````````````` example\nfoo**bar**\n.\n

    foobar

    \n````````````````````````````````\n\n\nRule 6:\n\n```````````````````````````````` example\n__foo bar__\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n" -- "\n```````````````````````````````` example\n__ foo bar__\n.\n

    __ foo bar__

    \n````````````````````````````````\n\n\nA line ending counts as whitespace:\n```````````````````````````````` example\n__\nfoo bar__\n.\n

    __\nfoo bar__

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening `__` is preceded\nby an alphanumeric and followed by punctuation:\n\n```````````````````````````````` example\na__\"foo\"__\n.\n

    a__"foo"__

    \n````````````````````````````````\n\n\nIntraword strong emphasis is forbidden with `__`:\n\n```````````````````````````````` example\nfoo__bar__\n.\n

    foo__bar__

    \n````````````````````````````````" -- "\n\n\n```````````````````````````````` example\n5__6__78\n.\n

    5__6__78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням__стремятся__\n.\n

    пристаням__стремятся__

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__foo, __bar__, baz__\n.\n

    foo, bar, baz

    \n````````````````````````````````\n\n\nThis is strong emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:\n" -- "\n```````````````````````````````` example\nfoo-__(bar)__\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\n\nRule 7:\n\nThis is not strong emphasis, because the closing delimiter is preceded\nby whitespace:\n\n```````````````````````````````` example\n**foo bar **\n.\n

    **foo bar **

    \n````````````````````````````````\n\n\n(Nor can it be interpreted as an emphasized `*foo bar *`, because of\nRule 11.)\n\nThis is not strong emphasis, because the second `**` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n**(**foo)\n.\n

    **(**foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith these examples:\n" -- "\n```````````````````````````````` example\n*(**foo**)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**Gomphocarpus (*Gomphocarpus physocarpus*, syn.\n*Asclepias physocarpa*)**\n.\n

    Gomphocarpus (Gomphocarpus physocarpus, syn.\nAsclepias physocarpa)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**foo \"*bar*\" foo**\n.\n

    foo "bar" foo

    \n````````````````````````````````\n\n\nIntraword emphasis:\n" +- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n" +- "17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`.\n\n" +- "These rules can be illustrated through a series of examples.\n\nRule 1:\n\n```````````````````````````````` example\n*foo bar*\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na * foo bar*\n.\n

    a * foo bar*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na*\"foo\"*\n.\n

    a*"foo"*

    \n````````````````````````````````\n\n\nUnicode nonbreaking spaces count as whitespace, too:\n" +- "\n```````````````````````````````` example\n* a *\n.\n

    * a *

    \n````````````````````````````````\n\n\nUnicode symbols count as punctuation, too:\n\n```````````````````````````````` example\n*$*alpha.\n\n*£*bravo.\n\n*€*charlie.\n.\n

    *$*alpha.

    \n

    *£*bravo.

    \n

    *€*charlie.

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is permitted:\n\n```````````````````````````````` example\nfoo*bar*\n.\n

    foobar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5*6*78\n.\n

    5678

    \n````````````````````````````````" +- "\n\n\nRule 2:\n\n```````````````````````````````` example\n_foo bar_\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is followed by\nwhitespace:\n\n```````````````````````````````` example\n_ foo bar_\n.\n

    _ foo bar_

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is preceded\nby an alphanumeric and followed by punctuation:\n\n```````````````````````````````` example\na_\"foo\"_\n.\n

    a_"foo"_

    \n````````````````````````````````\n\n\nEmphasis with `_` is not allowed inside words:\n\n```````````````````````````````` example\nfoo_bar_\n.\n

    foo_bar_

    \n````````````````````````````````" +- "\n\n\n```````````````````````````````` example\n5_6_78\n.\n

    5_6_78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням_стремятся_\n.\n

    пристаням_стремятся_

    \n````````````````````````````````\n\n\nHere `_` does not generate emphasis, because the first delimiter run\nis right-flanking and the second left-flanking:\n\n```````````````````````````````` example\naa_\"bb\"_cc\n.\n

    aa_"bb"_cc

    \n````````````````````````````````\n\n\nThis is emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:\n" +- "\n```````````````````````````````` example\nfoo-_(bar)_\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\nRule 3:\n\nThis is not emphasis, because the closing delimiter does\nnot match the opening delimiter:\n\n```````````````````````````````` example\n_foo*\n.\n

    _foo*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the closing `*` is preceded by\nwhitespace:\n\n```````````````````````````````` example\n*foo bar *\n.\n

    *foo bar *

    \n````````````````````````````````\n\n\nA line ending also counts as whitespace:\n\n```````````````````````````````` example\n*foo bar\n*\n.\n

    *foo bar\n*

    \n````````````````````````````````" +- "\n\n\nThis is not emphasis, because the second `*` is\npreceded by punctuation and followed by an alphanumeric\n(hence it is not part of a [right-flanking delimiter run]:\n\n```````````````````````````````` example\n*(*foo)\n.\n

    *(*foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith this example:\n\n```````````````````````````````` example\n*(*foo*)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is allowed:\n\n```````````````````````````````` example\n*foo*bar\n.\n

    foobar

    \n````````````````````````````````\n\n\n\nRule 4:\n\nThis is not emphasis, because the closing `_` is preceded by\nwhitespace:\n" +- "\n```````````````````````````````` example\n_foo bar _\n.\n

    _foo bar _

    \n````````````````````````````````\n\n\nThis is not emphasis, because the second `_` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n_(_foo)\n.\n

    _(_foo)

    \n````````````````````````````````\n\n\nThis is emphasis within emphasis:\n\n```````````````````````````````` example\n_(_foo_)_\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis is disallowed for `_`:\n\n```````````````````````````````` example\n_foo_bar\n.\n

    _foo_bar

    \n````````````````````````````````" +- "\n\n\n```````````````````````````````` example\n_пристаням_стремятся\n.\n

    _пристаням_стремятся

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n_foo_bar_baz_\n.\n

    foo_bar_baz

    \n````````````````````````````````\n\n\nThis is emphasis, even though the closing delimiter is\nboth left- and right-flanking, because it is followed by\npunctuation:\n\n```````````````````````````````` example\n_(bar)_.\n.\n

    (bar).

    \n````````````````````````````````\n\n\nRule 5:\n\n```````````````````````````````` example\n**foo bar**\n.\n

    foo bar

    \n````````````````````````````````" +- "\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n\n```````````````````````````````` example\n** foo bar**\n.\n

    ** foo bar**

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening `**` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na**\"foo\"**\n.\n

    a**"foo"**

    \n````````````````````````````````\n\n\nIntraword strong emphasis with `**` is permitted:\n\n```````````````````````````````` example\nfoo**bar**\n.\n

    foobar

    \n````````````````````````````````\n\n\nRule 6:\n" +- "\n```````````````````````````````` example\n__foo bar__\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n\n```````````````````````````````` example\n__ foo bar__\n.\n

    __ foo bar__

    \n````````````````````````````````\n\n\nA line ending counts as whitespace:\n```````````````````````````````` example\n__\nfoo bar__\n.\n

    __\nfoo bar__

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening `__` is preceded\nby an alphanumeric and followed by punctuation:\n\n```````````````````````````````` example\na__\"foo\"__\n.\n

    a__"foo"__

    \n````````````````````````````````" +- "\n\n\nIntraword strong emphasis is forbidden with `__`:\n\n```````````````````````````````` example\nfoo__bar__\n.\n

    foo__bar__

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5__6__78\n.\n

    5__6__78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням__стремятся__\n.\n

    пристаням__стремятся__

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__foo, __bar__, baz__\n.\n

    foo, bar, baz

    \n````````````````````````````````" +- "\n\n\nThis is strong emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:\n\n```````````````````````````````` example\nfoo-__(bar)__\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\n\nRule 7:\n\nThis is not strong emphasis, because the closing delimiter is preceded\nby whitespace:\n\n```````````````````````````````` example\n**foo bar **\n.\n

    **foo bar **

    \n````````````````````````````````\n\n\n(Nor can it be interpreted as an emphasized `*foo bar *`, because of\nRule 11.)\n\nThis is not strong emphasis, because the second `**` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n**(**foo)\n.\n

    **(**foo)

    \n````````````````````````````````" +- "\n\n\nThe point of this restriction is more easily appreciated\nwith these examples:\n\n```````````````````````````````` example\n*(**foo**)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**Gomphocarpus (*Gomphocarpus physocarpus*, syn.\n*Asclepias physocarpa*)**\n.\n

    Gomphocarpus (Gomphocarpus physocarpus, syn.\nAsclepias physocarpa)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**foo \"*bar*\" foo**\n.\n

    foo "bar" foo

    \n````````````````````````````````\n\n\nIntraword emphasis:\n" - "\n```````````````````````````````` example\n**foo**bar\n.\n

    foobar

    \n````````````````````````````````\n\n\nRule 8:\n\nThis is not strong emphasis, because the closing delimiter is\npreceded by whitespace:\n\n```````````````````````````````` example\n__foo bar __\n.\n

    __foo bar __

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the second `__` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n__(__foo)\n.\n

    __(__foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith this example:\n" - "\n```````````````````````````````` example\n_(__foo__)_\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword strong emphasis is forbidden with `__`:\n\n```````````````````````````````` example\n__foo__bar\n.\n

    __foo__bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__пристаням__стремятся\n.\n

    __пристаням__стремятся

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__foo__bar__baz__\n.\n

    foo__bar__baz

    \n````````````````````````````````" - "\n\n\nThis is strong emphasis, even though the closing delimiter is\nboth left- and right-flanking, because it is followed by\npunctuation:\n\n```````````````````````````````` example\n__(bar)__.\n.\n

    (bar).

    \n````````````````````````````````\n\n\nRule 9:\n\nAny nonempty sequence of inline elements can be the contents of an\nemphasized span.\n\n```````````````````````````````` example\n*foo [bar](/url)*\n.\n

    foo bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n*foo\nbar*\n.\n

    foo\nbar

    \n````````````````````````````````\n\n\nIn particular, emphasis and strong emphasis can be nested\ninside emphasis:\n" @@ -331,5 +333,6 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "#### *look for link or image*\n\nStarting at the top of the delimiter stack, we look backwards\nthrough the stack for an opening `[` or `![` delimiter.\n\n" - "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`.\n\n- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image.\n\n + If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`.\n\n + If we do, then\n\n * We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)\n\n" - "#### *process emphasis*\n\nParameter `stack_bottom` sets a lower bound to how far we\ndescend in the [delimiter stack]. If it is NULL, we can\ngo all the way to the bottom. Otherwise, we stop before\nvisiting `stack_bottom`.\n\nLet `current_position` point to the element on the [delimiter stack]\njust above `stack_bottom` (or the first element if `stack_bottom`\nis NULL).\n\nWe keep track of the `openers_bottom` for each delimiter\ntype (`*`, `_`), indexed to the length of the closing delimiter run\n(modulo 3) and to whether the closing delimiter can also be an\nopener. Initialize this to `stack_bottom`.\n\nThen we repeat the following until we run out of potential\nclosers:\n\n" -- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. If the closing node is removed, reset\n" -- " `current_position` to the next element in the stack.\n\n- If none is found:\n\n + Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack.\n" +- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n" +- "- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:\n\n " +- "+ Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack.\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@commonmark_spec.md.snap index 24a1460..804d6f5 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@commonmark_spec.md.snap @@ -3,7 +3,8 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/commonmark_spec.md --- -- "---\ntitle: CommonMark Spec\nauthor: John MacFarlane\n" +- "---\n" +- "title: CommonMark Spec\nauthor: John MacFarlane\n" - "version: '0.31.2'\n" - "date: '2024-01-28'\n" - "license: '[CC-BY-SA 4.0](https://creativecommons.org" @@ -16,8 +17,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - help from Aaron Swartz) and released in 2004 in the form of - " a\n" - "[syntax description](https://daringfireball.net/projects/markdown/syntax" -- ")\nand a Perl script (`Markdown.pl`" -- ") for converting Markdown to\n" +- ")\nand a Perl script (`Markdown.pl`) for converting Markdown to\n" - "HTML. In the next decade, dozens of implementations were\n" - "developed in many languages. Some extended the original\n" - "Markdown syntax with conventions for footnotes, tables, and\n" @@ -29,17 +29,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "What distinguishes Markdown from many other lightweight markup\n" - "syntaxes, which are often easier to write, is its readability.\n" - "As Gruber writes:\n\n" -- "> The overriding design goal for Markdown's formatting syntax is\n" -- "> to make it as readable as possible. The idea is that a" -- "\n> Markdown-formatted document should be publishable as-is, as" -- "\n> plain text, without looking like it's been marked up with tags" -- "\n> or formatting instructions.\n> (" +- "> " +- "The overriding design goal for Markdown's formatting syntax is\n> " +- "to make it as readable as possible. The idea is that a\n> " +- "Markdown-formatted document should be publishable as-is, as\n> " +- "plain text, without looking like it's been marked up with tags\n> " +- "or formatting instructions.\n> (" - ")\n\n" - "The point can be illustrated by comparing a sample of\n" -- "[AsciiDoc](https://asciidoc.org/)" -- " with\nan equivalent sample of Markdown. Here is a sample of\n" +- "[AsciiDoc](https://asciidoc.org/) with\n" +- "an equivalent sample of Markdown. Here is a sample of\n" - "AsciiDoc from the AsciiDoc manual:\n\n" -- "```\n1. List item one.\n+\n" +- "```\n" +- "1. List item one.\n+\n" - "List item one continued with a second paragraph followed by an\nIndented block.\n+\n" - ".................\n$ ls *.sh\n$ mv *.sh ~/tmp\n.................\n" - "+\nList item continued with a third paragraph.\n\n2. " @@ -49,7 +51,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This paragraph is part of the preceding list item.\n\nb. List item b.\n\n" - "This paragraph belongs to item two of the outer list.\n--\n```\n\n" - "And here is the equivalent in Markdown:\n" -- "```\n1. List item one.\n\n" +- "```\n" +- "1. List item one.\n\n" - " List item one continued with a second paragraph followed by an\n" - " Indented block.\n\n $ ls *.sh\n" - " $ mv *.sh ~/tmp\n\n" @@ -60,96 +63,105 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " This paragraph is part of the preceding list item.\n\n 2. " - "List item b.\n\n This paragraph belongs to item two of the outer list.\n" - "```\n\n" -- "The AsciiDoc version is, arguably, easier to write. " -- "You don't need\n" -- "to worry about indentation. But the Markdown version is much easier\n" +- "The AsciiDoc version is, arguably, easier to write. You don'" +- "t need\nto worry about indentation. But the Markdown version is much easier" +- "\n" - to read. The nesting of list items is apparent to the eye in the - "\nsource, not just in the processed document.\n\n" - "## Why is a spec needed?\n\n" -- "John Gruber's [canonical description of Markdown's\n" -- "syntax](https://daringfireball.net/projects/markdown/syntax)\n" +- "John Gruber's " +- "[canonical description of Markdown's\nsyntax" +- "](https://daringfireball.net/projects/markdown/syntax)\n" - does not specify the syntax unambiguously. Here are some examples of - "\nquestions it does not answer:\n\n" -- "1. How much indentation is needed for a sublist? " -- "The spec says that\n " +- "1. " +- "How much indentation is needed for a sublist? The spec says that\n " - "continuation paragraphs need to be indented four spaces, but is\n " - "not fully explicit about sublists. It is natural to think that\n " - "they, too, must be indented four spaces, but `Markdown.pl`" - " does\n not require that. This is hardly a \"corner case,\"" - " and divergences\n between implementations on this issue often lead to surprises for" - "\n users in real documents. (See " -- "[this comment by John\n" -- " Gruber](https://web.archive.org/web/20170611172104" -- "/http://article.gmane.org/gmane.text.markdown.general/" -- "1997).)\n\n" -- "2. Is a blank line needed before a block quote or heading?\n" -- " Most implementations do not require the blank line. However,\n " +- "[this comment by John\n Gruber" +- "](https://web.archive.org/web/20170611172104/http://" +- article.gmane.org/gmane.text.markdown.general/1997) +- ".)\n\n" +- "2. " +- "Is a blank line needed before a block quote or heading?\n " +- "Most implementations do not require the blank line. However,\n " - "this can lead to unexpected results in hard-wrapped text, and\n " - "also to ambiguities in parsing (note that some implementations\n " - "put the heading inside the blockquote, while others do not).\n " - "(John Gruber has also spoken " -- "[in favor of requiring the blank\n" -- " lines](https://web.archive.org/web/20170611172104/http" -- "://article.gmane.org/gmane.text.markdown.general/2146" -- ").)\n\n" -- "3. Is a blank line needed before an indented code block?\n" -- " (`Markdown.pl`" -- " requires it, but this is not mentioned in the\n " +- "[in favor of requiring the blank\n lines" +- "](https://web.archive.org/web/20170611172104/http://" +- article.gmane.org/gmane.text.markdown.general/2146) +- ".)\n\n" +- "3. " +- "Is a blank line needed before an indented code block?\n (" +- "`Markdown.pl` requires it, but this is not mentioned in the\n " - "documentation, and some implementations do not require it.)\n\n " - "``` markdown\n paragraph\n code?\n ```\n\n" -- "4. What is the exact rule for determining when list items get\n" -- " wrapped in `

    `" -- " tags? Can a list be partially \"loose\" and partially\n " -- "\"tight\"? What should we do with a list like this?\n\n " -- "``` markdown\n 1. one\n\n 2. two\n" -- " 3. three\n ```\n\n Or this?\n" +- "4. " +- "What is the exact rule for determining when list items get\n wrapped in " +- "`

    ` tags? Can a list be partially \"loose\"" +- " and partially\n \"tight\"" +- "? What should we do with a list like this?\n\n " +- "``` markdown\n 1. one\n\n 2. two\n " +- "3. three\n ```\n\n Or this?\n" - "\n ``` markdown\n 1. one\n - a\n\n" - " - b\n 2. two\n ```\n\n " -- "(There are some relevant comments by John Gruber\n" -- " [here](https://web.archive.org/web/20170611172104" -- "/http://article.gmane.org/gmane.text.markdown.general/" -- "2554).)\n\n" -- "5. Can list markers be indented? " -- "Can ordered list markers be right-aligned?\n\n " -- "``` markdown\n 8. item 1\n" -- " 9. item 2\n " -- "10. item 2a\n ```\n\n" -- "6. Is this one list with a thematic break in its second item,\n" -- " or two lists separated by a thematic break?\n\n " -- "``` markdown\n * a\n * * * * *\n" -- " * b\n ```\n\n" -- "7. When list markers change from numbers to bullets, do we have\n" -- " two lists or one? (The Markdown syntax description suggests two,\n " +- "(There are some relevant comments by John Gruber\n " +- "[here](https://web.archive.org/web/20170611172104/http" +- "://article.gmane.org/gmane.text.markdown.general/2554" +- ").)\n\n" +- "5. " +- "Can list markers be indented? Can ordered list markers be right-aligned?\n" +- "\n ``` markdown\n 8. item 1\n " +- " 9. item 2\n 10. item 2a\n" +- " ```\n\n" +- "6. " +- "Is this one list with a thematic break in its second item,\n " +- "or two lists separated by a thematic break?\n\n " +- "``` markdown\n * a\n * * * * *\n " +- "* b\n ```\n\n" +- "7. " +- "When list markers change from numbers to bullets, do we have\n " +- "two lists or one? (The Markdown syntax description suggests two,\n " - "but the perl scripts and many other implementations produce one.)\n\n " -- "``` markdown\n 1. fee\n 2. fie\n" -- " - foe\n - fum\n ```\n\n" -- "8. What are the precedence rules for the markers of inline structure?\n" -- " For example, is the following a valid link, or does the code span" +- "``` markdown\n 1. fee\n 2. fie\n " +- "- foe\n - fum\n ```\n\n" +- "8. " +- "What are the precedence rules for the markers of inline structure?\n " +- "For example, is the following a valid link, or does the code span" - "\n take precedence ?\n\n " -- "``` markdown\n" -- " [a backtick (`)](/url) and [another backtick (`" -- ")](/url).\n ```\n\n" -- "9. What are the precedence rules for markers of emphasis and strong\n" -- " emphasis? For example, how should the following be parsed?\n\n " +- "``` markdown\n " +- "[a backtick (`)](/url) and [another backtick (`)](/" +- "url).\n ```\n\n" +- "9. " +- "What are the precedence rules for markers of emphasis and strong\n " +- "emphasis? For example, how should the following be parsed?\n\n " - "``` markdown\n *foo *bar* baz*\n ```\n\n" -- "10. What are the precedence rules between block-level and inline-level\n" -- " structure? For example, how should the following be parsed?\n\n " -- "``` markdown\n" -- " - `a long code span can contain a hyphen like this\n " +- "10. " +- "What are the precedence rules between block-level and inline-level\n " +- "structure? For example, how should the following be parsed?\n\n " +- "``` markdown\n " +- "- `a long code span can contain a hyphen like this\n " - " - and it can screw things up`\n ```\n\n" -- "11. Can list items include section headings? " -- "(`Markdown.pl`" -- " does not\n allow this, but does allow blockquotes to include headings.)" -- "\n\n ``` markdown\n - # Heading\n ```\n\n" -- "12. Can list items be empty?\n\n ``` markdown\n * a\n" -- " *\n * b\n ```\n\n" -- "13. Can link references be defined inside block quotes or list items?\n\n" -- " ``` markdown\n > Blockquote [foo].\n >\n" -- " > [foo]: /url\n ```\n\n" -- "14. If there are multiple definitions for the same reference, which takes\n" -- " precedence?\n\n " -- "``` markdown\n [foo]: /url1\n" -- " [foo]: /url2\n\n [foo][]\n ```\n\n" +- "11. " +- "Can list items include section headings? (`Markdown.pl` does not\n " +- "allow this, but does allow blockquotes to include headings.)\n\n " +- "``` markdown\n - # Heading\n ```\n\n" +- "12. Can list items be empty?\n" +- "\n ``` markdown\n * a\n *\n * b\n" +- " ```\n\n" +- "13. Can link references be defined inside block quotes or list items?\n" +- "\n ``` markdown\n > Blockquote [foo].\n >\n " +- "> [foo]: /url\n ```\n\n" +- "14. " +- "If there are multiple definitions for the same reference, which takes\n precedence?\n" +- "\n ``` markdown\n [foo]: /url1\n " +- "[foo]: /url2\n\n [foo][]\n ```\n\n" - "In the absence of a spec, early implementers consulted `Markdown.pl`\n" - "to resolve these ambiguities. But `Markdown.pl`" - " was quite buggy, and\n" @@ -191,49 +203,50 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "# Preliminaries\n\n" - "## Characters and lines\n\n" - "Any sequence of [characters] is a valid CommonMark\ndocument.\n" -- "\nA [character](@) is a Unicode code point. " -- "Although some\ncode points (for example, combining accents) do not correspond to" -- "\ncharacters in an intuitive sense, all code points count as characters\n" +- "\nA [character](@) is a Unicode code point. Although some" +- "\ncode points (for example, combining accents) do not correspond to\n" +- "characters in an intuitive sense, all code points count as characters\n" - "for purposes of this spec.\n\n" - "This spec does not specify an encoding; it thinks of lines as composed\n" - "of [characters] rather than bytes. A conforming parser may be limited" - "\nto a certain encoding.\n\n" - "A [line](@) is a sequence of zero or more [characters]\n" -- "other than line feed (`U+000A`) or carriage return (`U+" -- "000D`),\nfollowed by a [line ending]" +- "other than line feed (`U+000A`) or carriage return (" +- "`U+000D`),\nfollowed by a [line ending]" - " or by the end of file.\n\n" -- "A [line ending](@) is a line feed (`U+000A" -- "`), a carriage return\n(`U+000D`" +- "A [line ending](@) is a line feed (" +- "`U+000A`), a carriage return\n(" +- "`U+000D`" - ") not followed by a line feed, or a carriage return and a\n" - "following line feed.\n\n" -- "A line containing no characters, or a line containing only spaces\n" -- "(`U+0020`) or tabs (`U+0009`), is" -- " called a [blank line](@).\n\n" +- "A line containing no characters, or a line containing only spaces\n(" +- "`U+0020`) or tabs (`U+0009`" +- "), is called a [blank line](@).\n\n" - "The following definitions of character classes will be used in this spec:\n" -- "\n" -- "A [Unicode whitespace character](@) is a character in the Unicode `" -- "Zs` general\ncategory, or a tab (`U+0009`" -- "), line feed (`U+000A`), form feed (`U+" -- "000C`), or\ncarriage return (`U+000D`).\n\n" -- "[Unicode whitespace](@) is a sequence of one or more\n" -- "[Unicode whitespace characters].\n\n" +- "\nA [Unicode whitespace character](@) is a character in the Unicode " +- "`Zs` general\ncategory, or a tab (" +- "`U+0009`), line feed (`U+000A`" +- "), form feed (`U+000C`), or\ncarriage return (" +- "`U+000D`).\n\n" +- "[Unicode whitespace](@) is a sequence of one or more\n[" +- "Unicode whitespace characters].\n\n" - "A [tab](@) is `U+0009`.\n" - "\nA [space](@) is `U+0020`.\n" -- "\n" -- "An [ASCII control character](@) is a character between `U+" -- "0000–1F` (both\nincluding) or " +- "\nAn [ASCII control character](@) is a character between " +- "`U+0000–1F` (both\nincluding) or " - "`U+007F`.\n\n" -- "An [ASCII punctuation character](@)\nis `!" -- "`, `\"`, `#`, `$`, `%`, `&`, `'`, `(" -- "`, `)`,\n`*`, `+`, `,`, `-`, `.`" -- ", `/` (U+0021–2F), \n`:`, " -- "`;`, `<`, `=`, `>`, `?`, `@`" +- "An [ASCII punctuation character](@)\nis `!`, `\"`, " +- "`#`, `$`, `%`, `&`, `'`, `(`, " +- "`)`,\n`*`, `+`, `,`, `-`, `.`, `/`" +- " (U+0021–2F), \n`:`, `;`, " +- "`<`, `=`, `>`, `?`, `@`" - " (U+003A–0040),\n`[`, `\\`, " -- "`]`, `^`, `_`, `` ` `` (U+005B–" -- "0060), \n`{`, `|`, `}`, or `~`" -- " (U+007B–007E).\n\n" -- "A [Unicode punctuation character](@) is a character in the Unicode `P" -- "`\n(puncuation) or `S` (symbol) general categories.\n\n" +- "`]`, `^`, `_`, `` ` ``" +- " (U+005B–0060), \n`{`, `|`" +- ", `}`, or `~` (U+007B–007E).\n\n" +- "A [Unicode punctuation character](@) is a character in the Unicode " +- "`P`\n(puncuation) or `S`" +- " (symbol) general categories.\n\n" - "## Tabs\n\n" - "Tabs in lines are not expanded to [spaces]. However,\n" - "in contexts where spaces help to define block structure,\n" @@ -280,8 +293,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\nNormally the `>` that begins a block quote may be followed\n" - "optionally by a space, which is not considered part of the\n" -- "content. In the following case `>`" -- " is followed by a tab,\n" +- "content. In the following case `>` is followed by a tab,\n" - "which is treated as if it were expanded into three spaces.\n" - "Since one of these spaces is considered part of the\ndelimiter, `foo`" - " is considered to be indented six spaces\n" @@ -363,9 +375,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " example\n" - "\\\\*emphasis*\n.\n

    \\emphasis

    \n" - "````````````````````````````````\n" -- "\n\n" -- "A backslash at the end of the line is a [hard line break]:\n" -- "\n" +- "\n\nA backslash at the end of the line is a [hard line break" +- "]:\n\n" - "````````````````````````````````" - " example\nfoo\\\nbar\n.\n

    foo
    \nbar

    \n" - "````````````````````````````````\n" @@ -427,12 +438,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Entity and numeric character references\n\n" - "Valid HTML entity references and numeric character references\n" - "can be used in place of the corresponding Unicode character,\nwith the following exceptions:\n\n" -- "- Entity and character references are not recognized in code\n" -- " blocks and code spans.\n\n" -- "- Entity and character references cannot stand in place of\n" -- " special characters that define structural elements in\n " -- "CommonMark. For example, although `*`" -- " can be used\n in place of a literal `*` character, " +- "- " +- "Entity and character references are not recognized in code\n blocks and code spans.\n\n" +- "- " +- "Entity and character references cannot stand in place of\n " +- "special characters that define structural elements in\n " +- "CommonMark. For example, although `*` can be used" +- "\n in place of a literal `*` character, " - "`*` cannot replace\n `*`" - " in emphasis delimiters, bullet list markers, or thematic\n breaks.\n\n" - "Conforming CommonMark parsers need not store information about\n" @@ -440,8 +452,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "using a Unicode character or an entity reference.\n\n" - "[Entity references](@) consist of `&` + any of the valid" - "\nHTML5 entity names + `;`. The\ndocument " -- "" -- "\nis used as an authoritative source for the valid entity\n" +- "\n" +- "is used as an authoritative source for the valid entity\n" - "references and their corresponding code points.\n\n" - "````````````````````````````````" - " example\n" @@ -452,23 +464,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

      & © Æ Ď\n" - "¾ ℋ ⅆ\n∲ ≧̸

    \n" - "````````````````````````````````\n" -- "\n\n[Decimal numeric character\nreferences](@)\n" -- "consist of `&#` + a string of 1--7 arabic" -- " digits + `;`" -- ". A\nnumeric character reference is parsed as the corresponding\n" +- "\n\n[Decimal numeric character\nreferences](@)\nconsist of `&#`" +- " + a string of 1--7 arabic digits + `;`. A" +- "\nnumeric character reference is parsed as the corresponding\n" - "Unicode character. Invalid Unicode code points will be replaced by\n" -- "the REPLACEMENT CHARACTER (`U+FFFD`" -- "). For security reasons,\nthe code point `U+0000`" -- " will also be replaced by `U+FFFD`.\n\n" +- "the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons,\n" +- "the code point `U+0000` will also be replaced by " +- "`U+FFFD`.\n\n" - "````````````````````````````````" - " example\n" - "# Ӓ Ϡ �\n.\n" - "

    # Ӓ Ϡ �

    \n" - "````````````````````````````````\n" -- "\n\n[Hexadecimal numeric character\n" -- "references](@) consist of `&#` +\neither `X` or " -- "`x` + a string of 1-6 hexadecimal digits + `;" -- "`.\nThey too are parsed as the corresponding Unicode character (this\n" +- "\n\n[Hexadecimal numeric character\nreferences](@) consist of `&#`" +- " +\neither `X` or `x`" +- " + a string of 1-6 hexadecimal digits + `;`.\n" +- "They too are parsed as the corresponding Unicode character (this\n" - "time specified with a hexadecimal numeral instead of decimal).\n\n" - "````````````````````````````````" - " example\n" @@ -576,27 +587,24 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n\n" - "# Blocks and inlines\n\n" -- "We can think of a document as a sequence of\n" -- "[blocks](@)" +- "We can think of a document as a sequence of\n[blocks](@)" - "---structural elements like paragraphs, block\n" - "quotations, lists, headings, rules, and code blocks. " - "Some blocks (like\n" - "block quotes and list items) contain other blocks; others (like\n" -- "headings and paragraphs) contain [inline](@)" -- " content---text,\n" +- "headings and paragraphs) contain [inline](@) content---text,\n" - "links, emphasized text, images, code spans, and so on.\n\n" - "## Precedence\n\n" -- "Indicators of block structure always take precedence over indicators\nof inline structure. " -- "So, for example, the following is a list with\n" -- "two items, not a list with one item containing a code span:\n\n" +- "Indicators of block structure always take precedence over indicators\n" +- "of inline structure. So, for example, the following is a list with" +- "\ntwo items, not a list with one item containing a code span:\n\n" - "````````````````````````````````" - " example\n" - "- `one\n- two`\n.\n
      \n
    • `one
    • \n" - "
    • two`
    • \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "This means that parsing can proceed in two steps: first, the block\n" -- "structure of the document can be discerned; second, text lines inside\n" +- "\n\nThis means that parsing can proceed in two steps: first, the block" +- "\nstructure of the document can be discerned; second, text lines inside\n" - "paragraphs, headings, and other block constructs can be parsed for inline\n" - "structure. The second step requires information about link reference\n" - "definitions that will be available only at the end of the first\n" @@ -605,16 +613,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "one block element does not affect the inline parsing of any other.\n\n" - "## Container blocks and leaf blocks\n\n" - "We can divide blocks into two types:\n" -- "[container blocks](#container-blocks)" -- ",\nwhich can contain other blocks, and " -- "[leaf blocks](#leaf-blocks),\nwhich cannot.\n\n" +- "[container blocks](#container-blocks),\n" +- "which can contain other blocks, and [leaf blocks](#leaf-blocks),\n" +- "which cannot.\n\n" - "# Leaf blocks\n\n" - "This section describes the different kinds of leaf block that make up a\n" - "Markdown document.\n\n" - "## Thematic breaks\n\n" - "A line consisting of optionally up to three spaces of indentation, followed by a\n" -- "sequence of three or more matching `-`, `_`, or `*` characters," -- " each followed\noptionally by any number of spaces or tabs, forms a\n" +- "sequence of three or more matching `-`, `_`, or `*`" +- " characters, each followed\n" +- "optionally by any number of spaces or tabs, forms a\n" - "[thematic break](@).\n\n" - "````````````````````````````````" - " example\n***\n---\n___\n.\n
    \n
    \n
    \n" @@ -729,8 +738,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "An [ATX heading](@)\n" - "consists of a string of characters, parsed as inline content, between an\n" - "opening sequence of 1--6 unescaped `#` characters and an optional" -- "\nclosing sequence of any number of unescaped `#`" -- " characters.\nThe opening sequence of `#`" +- "\nclosing sequence of any number of unescaped `#` characters.\n" +- "The opening sequence of `#`" - " characters must be followed by spaces or tabs, or\n" - "by the end of line. The optional closing sequence of `#`" - "s must be preceded by\n" @@ -754,9 +763,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n####### foo\n.\n

    ####### foo

    \n" - "````````````````````````````````\n" -- "\n\n" -- "At least one space or tab is required between the `#` characters and the" -- "\nheading's contents, unless the heading is empty. Note that many\n" +- "\n\nAt least one space or tab is required between the `#`" +- " characters and the\nheading'" +- "s contents, unless the heading is empty. Note that many\n" - "implementations currently do not require the space. However, the\n" - "space was required by the\n" - "[original ATX implementation](http://www.aaronsw.com/2002" @@ -823,10 +832,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n### foo ### \n.\n

    foo

    \n" - "````````````````````````````````\n" -- "\n\n" -- "A sequence of `#` characters with anything but spaces or tabs following it\n" -- "is not a closing sequence, but counts as part of the contents of the\n" -- "heading:\n\n" +- "\n\nA sequence of `#` characters with anything but spaces or tabs following it" +- "\nis not a closing sequence, but counts as part of the contents of the" +- "\nheading:\n\n" - "````````````````````````````````" - " example\n" - "### foo ### b\n.\n

    foo ### b

    \n" @@ -877,12 +885,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "interpretable as a [code fence], [ATX heading][ATX headings" - "],\n[block quote][block quotes], [thematic break][thematic breaks],\n" - "[list item][list items], or [HTML block][HTML blocks].\n\n" -- "A [setext heading underline](@) is a sequence of\n" -- "`=` characters or a sequence of `-` characters, with no more than " -- "3\nspaces of indentation and any number of trailing spaces or tabs.\n\n" +- "A [setext heading underline](@) is a sequence of\n`=`" +- " characters or a sequence of `-` characters, with no more than 3\n" +- "spaces of indentation and any number of trailing spaces or tabs.\n\n" - "The heading is a level 1 heading if `=` characters are used in\n" -- "the [setext heading underline], and a level 2 heading if `-`" -- "\ncharacters are used. The contents of the heading are the result\n" +- "the [setext heading underline], and a level 2 heading if `-`\n" +- "characters are used. The contents of the heading are the result\n" - "of parsing the preceding lines of text as CommonMark inline\ncontent.\n\n" - "In general, a setext heading need not be preceded or followed by a\n" - "blank line. However, it cannot interrupt a paragraph, so when a\n" @@ -917,9 +925,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Foo\n-------------------------\n\nFoo\n=\n.\n

    Foo

    \n" - "

    Foo

    \n" - "````````````````````````````````\n" -- "\n\n" -- "The heading content can be preceded by up to three spaces of indentation, and\n" -- "need not line up with the underlining:\n\n" +- "\n\nThe heading content can be preceded by up to three spaces of indentation, and" +- "\nneed not line up with the underlining:\n\n" - "````````````````````````````````" - " example\n" - " Foo\n---\n\n Foo\n-----\n\n Foo\n ===\n.\n" @@ -970,8 +977,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    <a title="a lot

    \n" - "

    of dashes"/>

    \n" - "````````````````````````````````\n" -- "\n\nThe setext heading underline cannot be a [lazy continuation\n" -- "line] in a list item or block quote:\n\n" +- "\n\nThe setext heading underline cannot be a [lazy continuation\nline]" +- " in a list item or block quote:\n\n" - "````````````````````````````````" - " example\n" - "> Foo\n---\n.\n
    \n

    Foo

    \n" @@ -1033,9 +1040,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "> foo\n-----\n.\n
    \n

    foo

    \n" - "
    \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "If you want a heading with `> foo` as its literal text, you" -- " can\nuse backslash escapes:\n\n" +- "\n\nIf you want a heading with `> foo`" +- " as its literal text, you can\nuse backslash escapes:\n\n" - "````````````````````````````````" - " example\n" - "\\> foo\n------\n.\n

    > foo

    \n" @@ -1083,8 +1089,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Indented code blocks\n\n" - "An [indented code block](@) is composed of one or more\n" - "[indented chunks] separated by blank lines.\nAn " -- "[indented chunk](@)" -- " is a sequence of non-blank lines,\n" +- "[indented chunk](@) is a sequence of non-blank lines,\n" - "each preceded by four or more spaces of indentation. The contents of the code\n" - "block are the literal contents of the lines, including trailing\n[line endings]" - ", minus four spaces of indentation.\nAn indented code block has no [" @@ -1115,9 +1120,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n
      \n
    • bar
    • \n" - "
    \n\n\n" - "````````````````````````````````\n" -- "\n\n\n" -- "The contents of a code block are literal text, and do not get parsed\n" -- "as Markdown:\n\n" +- "\n\n\nThe contents of a code block are literal text, and do not get parsed" +- "\nas Markdown:\n\n" - "````````````````````````````````" - " example\n" - " \n *hi*\n\n - one\n.\n" @@ -1144,9 +1148,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\nFoo\n bar\n\n.\n

    Foo\nbar

    \n" - "````````````````````````````````\n" -- "\n\n" -- "However, any non-blank line with fewer than four spaces of indentation ends\n" -- "the code block immediately. So a paragraph may occur immediately\n" +- "\n\nHowever, any non-blank line with fewer than four spaces of indentation ends" +- "\nthe code block immediately. So a paragraph may occur immediately\n" - "after indented code:\n\n" - "````````````````````````````````" - " example\n" @@ -1184,16 +1187,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n\n" - "## Fenced code blocks\n\n" - "A [code fence](@) is a sequence\n" -- "of at least three consecutive backtick characters (`` ` ``" -- ") or\ntildes (`~`" +- "of at least three consecutive backtick characters (`` ` ``) or\n" +- "tildes (`~`" - "). (Tildes and backticks cannot be mixed.)\nA " -- "[fenced code block](@)" -- "\n" +- "[fenced code block](@)\n" - "begins with a code fence, preceded by up to three spaces of indentation.\n\n" - "The line with the opening code fence may optionally contain some text\n" - "following the code fence; this is trimmed of leading and trailing\n" -- "spaces or tabs and called the [info string](@)" -- ". If the [info string] comes\n" +- "spaces or tabs and called the [info string](@). If the [" +- "info string] comes\n" - "after a backtick fence, it may not contain any backtick\n" - "characters. (The reason for this restriction is that otherwise\n" - "some inline code would be incorrectly interpreted as the\n" @@ -1227,8 +1229,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The content of a code fence is treated as literal text, not parsed\n" - "as inlines. The first word of the [info string]" - " is typically used to\n" -- "specify the language of the code sample, and rendered in the `class`" -- "\nattribute of the `code`" +- "specify the language of the code sample, and rendered in the `class`\n" +- "attribute of the `code`" - " tag. However, this spec does not mandate any\n" - "particular treatment of the [info string].\n\n" - "Here is a simple example with backticks:\n" @@ -1278,8 +1280,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "~~~\n
    \n" - "````````````````````````````````\n" - "\n\nUnclosed code blocks are closed by the end of the document\n" -- "(or the enclosing [block quote][block quotes] or [list item][list" -- " items]):\n\n" +- "(or the enclosing [block quote][block quotes] or [list item][" +- "list items]):\n\n" - "````````````````````````````````" - " example\n```\n.\n
    \n" - "````````````````````````````````\n" @@ -1307,8 +1309,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n```\n```\n.\n
    \n" - "````````````````````````````````\n" -- "\n\nFences can be indented. " -- "If the opening fence is indented,\n" +- "\n\n" +- "Fences can be indented. If the opening fence is indented,\n" - "content lines will have equivalent opening indentation removed,\nif present:\n\n" - "````````````````````````````````" - " example\n" @@ -1334,9 +1336,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " ```\n aaa\n ```\n.\n
    ```\naaa\n"
     - "```\n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "Closing fences may be preceded by up to three spaces of indentation, and their\n" -- "indentation need not match that of the opening fence:\n\n" +- "\n\nClosing fences may be preceded by up to three spaces of indentation, and their" +- "\nindentation need not match that of the opening fence:\n\n" - "````````````````````````````````" - " example\n" - "```\naaa\n ```\n.\n
    aaa\n"
    @@ -1417,9 +1418,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "``` aa ```\nfoo\n.\n

    aa\n" - "foo

    \n" - "````````````````````````````````\n" -- "\n\n" -- "[Info strings] for tilde code blocks can contain backticks and " -- "tildes:\n\n" +- "\n\n[Info strings]" +- " for tilde code blocks can contain backticks and tildes:\n\n" - "````````````````````````````````" - " example\n" - "~~~ aa ``` ~~~\nfoo\n~~~\n.\n" @@ -1438,66 +1438,70 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "as raw HTML (and will not be escaped in HTML output).\n\n" - "There are seven kinds of [HTML block], which can be defined by their\n" - "start and end conditions. The block begins with a line that meets a\n" -- "[start condition](@)" -- " (after up to three optional spaces of indentation).\n" +- "[start condition](@) (after up to three optional spaces of indentation).\n" - "It ends with the first subsequent line that meets a matching\n" -- "[end condition](@), or the last line of the document, or the last" -- " line of\nthe [container block](#container-blocks)" -- " containing the current HTML\nblock, if no line is encountered that meets the [" -- "end condition]. If\nthe first line meets both the [start condition]" -- " and the [end\ncondition], the block will contain just that line.\n\n" -- "1. **Start condition:** line begins with the string ``" -- ", or the end of the line.\\\n**End condition:**" +- "[end condition](@)" +- ", or the last line of the document, or the last line of\n" +- "the [container block](#container-blocks) containing the current HTML\n" +- "block, if no line is encountered that meets the [end condition]. If" +- "\nthe first line meets both the [start condition] and the [end\n" +- "condition], the block will contain just that line.\n\n" +- "1. " +- "**Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:**" - " line contains an end tag\n`
    `, ``" -- ", ``, or `` (case-insensitive;" -- " it\nneed not match the start tag).\n\n" -- "2. **Start condition:** line begins with the string ``.\n\n" -- "3. **Start condition:** line begins with the string ``.\n\n" -- "4. **Start condition:** line begins with the string ``.\n\n" -- "5. **Start condition:** line begins with the string\n``.\n\n" +- "5. " +- "**Start condition:** line begins with the string\n``.\n\n" - "6. " - "**Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:**" +- "`article`, `aside`, `base`, `basefont`, `blockquote`" +- ", `body`,\n`caption`, `center`, `col`, " +- "`colgroup`, `dd`, `details`, `dialog`,\n`dir`" +- ", `div`, `dl`, `dt`, `fieldset`, `figcaption`" +- ", `figure`,\n`footer`, `form`, `frame`, " +- "`frameset`,\n`h1`, `h2`, `h3`" +- ", `h4`, `h5`, `h6`, `head`" +- ", `header`, `hr`,\n`html`, `iframe`, `legend`" +- ", `li`, `link`, `main`, `menu`, " +- "`menuitem`,\n`nav`, `noframes`, `ol`, " +- "`optgroup`, `option`, `p`, `param`,\n`search`, " +- "`section`, `summary`, `table`, `tbody`, `td`,\n" +- "`tfoot`, `th`, `thead`, `title`, `tr`, " +- "`track`, `ul`, followed\n" +- "by a space, a tab, the end of the line, the string " +- "`>`, or\nthe string `/>`.\\\n**End condition:**" - " line is followed by a [blank line].\n\n" - "7. " - "**Start condition:** line begins with a complete [open tag]\n" - "(with any [tag name] other than `pre`, `script`,\n" -- "`style`, or `textarea`" -- ") or a complete [closing tag],\n" +- "`style`, or `textarea`) or a complete [closing tag],\n" - "followed by zero or more spaces and tabs, followed by the end of the" - " line.\\\n**End condition:** line is followed by a [blank line].\n\n" -- "HTML blocks continue until they are closed by their appropriate\n" -- "[end condition], or the last line of the document or other " +- "HTML blocks continue until they are closed by their appropriate\n[end condition]" +- ", or the last line of the document or other " - "[container\nblock](#container-blocks). This means any HTML " - "**within an HTML\nblock**" - " that might otherwise be recognised as a start condition will\n" - "be ignored by the parser and passed through as-is, without changing\nthe parser" - "'s state.\n\n" -- "For instance, `
    ` within an HTML block started by `` will"
    -- " not affect\n"
    +- "For instance, `
    ` within an HTML block started by `
    `" +- " will not affect\n" - the parser state; as the HTML block was started in by start condition 6 - ", it\nwill end at any blank line. This can be surprising:\n\n" - "````````````````````````````````" @@ -1509,8 +1513,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n" - "````````````````````````````````\n" - "\n" -- "In this case, the HTML block is terminated by the blank line — the `" -- "**Hello**`\n" +- "In this case, the HTML block is terminated by the blank line — the " +- "`**Hello**`\n" - "text remains verbatim — and regular parsing resumes, with a paragraph,\n" - "emphasised `world` and inline and block HTML following.\n\n" - "All types of [HTML blocks] except type 7 may interrupt\n" @@ -1606,9 +1610,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n``` c\nint x = 33;\n```\n.\n" - "
    \n``` c\nint x = 33;\n```\n" - "````````````````````````````````\n" -- "\n\n" -- "To start an [HTML block] with a tag that is *not* in" -- " the\n" +- "\n\nTo start an [HTML block] with a tag that is *not*" +- " in the\n" - "list of block-level tags in (6), you must put the tag by\n" - "itself on the first line (and it must be complete):\n\n" - "````````````````````````````````" @@ -1645,8 +1648,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````````````````````\n" - "\n\nIn this case, we get a raw HTML block that just includes\n" -- "the ``" -- " tag (because it ends with the following blank\n" +- "the `` tag (because it ends with the following blank\n" - "line). So the contents get interpreted as CommonMark:\n\n" - "````````````````````````````````" - " example\n" @@ -1654,18 +1656,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n
    \n" - "````````````````````````````````\n" - "\n\nFinally, in this case, the `` tags are interpreted\n" -- "as [raw HTML] *inside*" -- " the CommonMark paragraph. (Because\n" -- "the tag is not on a line by itself, we get inline HTML\n" +- "as [raw HTML] *inside* the CommonMark paragraph. (Because" +- "\nthe tag is not on a line by itself, we get inline HTML\n" - "rather than an [HTML block].)\n\n" - "````````````````````````````````" - " example\n" - "*foo*\n.\n" - "

    foo

    \n" - "````````````````````````````````\n" -- "\n\nHTML tags designed to contain literal content\n" -- "(`pre`, `script`, `style`, `textarea`), comments, processing" -- " instructions,\nand declarations are treated somewhat differently.\n" +- "\n\nHTML tags designed to contain literal content\n(`pre`, `script`, " +- "`style`, `textarea`), comments, processing instructions,\n" +- "and declarations are treated somewhat differently.\n" - "Instead of ending at the first blank line, these blocks\n" - "end at the first line containing a corresponding end tag.\n" - "As a result, these blocks can contain blank lines:\n\n" @@ -1793,9 +1794,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n\n
    \n.\n
    \n" - "
    <div>\n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "An HTML block of types 1--6 can interrupt a paragraph, and need" -- " not be\npreceded by a blank line.\n\n" +- "\n\nAn HTML block of types 1--" +- "6 can interrupt a paragraph, and need not be\n" +- "preceded by a blank line.\n\n" - "````````````````````````````````" - " example\n" - "Foo\n
    \nbar\n
    \n.\n

    Foo

    \n" @@ -1818,9 +1819,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\nThis rule differs from John Gruber's original Markdown syntax\n" - "specification, which says:\n\n" -- "> The only restrictions are that block-level HTML elements —\n" -- "> e.g. `
    `, ``, `
    `,"
    -- " `

    `" +- "> " +- "The only restrictions are that block-level HTML elements —\n> e.g. " +- "`

    `, `
    `, `
    `, `

    `" - ", etc. — must be separated from\n> " - "surrounding content by blank lines, and the start and end tags of the" - "\n> block should not be indented with spaces or tabs.\n\n" @@ -1828,8 +1829,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "here:\n\n" - "- It requires that an HTML block be preceded by a blank line.\n" - "- It does not allow the start tag to be indented.\n" -- "- It requires a matching end tag, which it also does not allow to\n" -- " be indented.\n\n" +- "- It requires a matching end tag, which it also does not allow to" +- "\n be indented.\n\n" - "Most Markdown implementations (including some of Gruber's own) do not\n" - "respect all of these restrictions.\n\n" - "There is one respect, however, in which Gruber's rule is more liberal" @@ -1855,14 +1856,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\nSome Markdown implementations have adopted a convention of\n" - "interpreting content inside tags as text if the open tag has\nthe attribute " -- "`markdown=1`" -- ". The rule given above seems a simpler and\n" +- "`markdown=1`. The rule given above seems a simpler and\n" - "more elegant way of achieving the same expressive power, which is also\n" - "much simpler to parse.\n\n" - "The main potential drawback is that one can no longer paste HTML\n" - "blocks into Markdown documents with 100% reliability. However,\n" -- "*in most cases*" -- " this will work fine, because the blank lines in\n" +- "*in most cases* this will work fine, because the blank lines in\n" - "HTML are usually followed by HTML block tags. For example:\n\n" - "````````````````````````````````" - " example\n" @@ -1871,8 +1870,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n

    \n" - "````````````````````````````````\n" - "\n\nThere are problems, however, if the inner tags are indented\n" -- "*and*" -- " separated by spaces, as then they will be interpreted as\n" +- "*and* separated by spaces, as then they will be interpreted as\n" - "an indented code block:\n\n" - "````````````````````````````````" - " example\n" @@ -1882,16 +1880,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "</td>\n
    \n \n" - "\n" - "````````````````````````````````\n" -- "\n\nFortunately, blank lines are usually not necessary and can be\ndeleted. " -- "The exception is inside `
    `"
    -- " tags, but as described\n[above][HTML blocks]"
    -- ", raw HTML blocks starting with `
    `\n*can* contain blank lines.\n\n"
    +- "\n\nFortunately, blank lines are usually not necessary and can be\n"
    +- "deleted.  The exception is inside `
    ` tags, but as described\n"
    +- "[above][HTML blocks], raw HTML blocks starting with `
    `\n"
    +- "*can* contain blank lines.\n\n"
     - "## Link reference definitions\n\n"
    -- "A [link reference definition](@)\n"
    -- "consists of a [link label], optionally preceded by up to three spaces of"
    -- "\nindentation, followed\nby a colon (`:`"
    -- "), optional spaces or tabs (including up to one\n[line ending]), a ["
    -- "link destination],\noptional spaces or tabs (including up to one\n[line ending]"
    +- "A [link reference definition](@)\nconsists of a [link label]"
    +- ", optionally preceded by up to three spaces of\nindentation, followed\n"
    +- "by a colon (`:`), optional spaces or tabs (including up to one\n"
    +- "[line ending]), a [link destination],\n"
    +- "optional spaces or tabs (including up to one\n[line ending]"
     - "), and an optional [link\ntitle]"
     - ", which if it is present must be separated\nfrom the [link destination]"
     - " by spaces or tabs.\nNo further character may occur.\n\n"
    @@ -2107,8 +2105,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "\n\n"
     - "## Paragraphs\n\n"
     - "A sequence of non-blank lines that cannot be interpreted as other\n"
    -- "kinds of blocks forms a [paragraph](@)"
    -- ".\nThe contents of the paragraph are the result of parsing the\nparagraph'"
    +- "kinds of blocks forms a [paragraph](@).\n"
    +- "The contents of the paragraph are the result of parsing the\nparagraph'"
     - "s raw content as inlines.  The paragraph's raw content\n"
     - "is formed by concatenating the lines and removing initial and final\n"
     - "spaces or tabs.\n\nA simple example with two paragraphs:\n"
    @@ -2183,14 +2181,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - " are meta-containers for [list items].\n\n"
     - "We define the syntax for container blocks recursively.  The general\n"
     - "form of the definition is:\n\n"
    -- "> If X is a sequence of blocks, then the result of\n"
    -- "> transforming X in such-and-such a way is a container of type Y"
    +- "> "
    +- "If X is a sequence of blocks, then the result of\n> "
    +- transforming X in such-and-such a way is a container of type Y
     - "\n> with these blocks as its content.\n\n"
     - "So, we explain what counts as a block quote or list item by explaining\n"
    -- how these can be *generated*
    -- " from their contents. This should suffice\n"
    -- "to define the syntax, although it does not give a recipe for *parsing"
    -- "*\nthese constructions.  (A recipe is provided below in the section entitled\n"
    +- "how these can be *generated* from their contents. This should suffice\n"
    +- "to define the syntax, although it does not give a recipe for "
    +- "*parsing*\n"
    +- "these constructions.  (A recipe is provided below in the section entitled\n"
     - "[A parsing strategy](#appendix-a-parsing-strategy).)\n\n"
     - "## Block quotes\n\n"
     - "A [block quote marker](@),\n"
    @@ -2199,27 +2198,27 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "\nindentation, or (b) a single character `>`"
     - " not followed by a space of\nindentation.\n\n"
     - "The following rules define [block quotes]:\n\n"
    -- 1.  **Basic case.
    -- "**  If a string of lines *Ls*"
    +- "1.  "
    +- "**Basic case.**  If a string of lines *Ls*"
     - " constitute a sequence\n    of blocks *Bs*"
     - ", then the result of prepending a [block quote\n    marker]"
     - " to the beginning of each line in *Ls*\n    is a "
     - "[block quote](#block-quotes) containing *Bs*.\n\n"
    -- 2.  **Laziness.
    -- "**  If a string of lines *Ls* constitute a "
    -- "[block\n    quote](#block-quotes) with contents *Bs*"
    -- ", then the result of deleting\n    the initial [block quote marker]"
    -- " from one or\n    "
    +- "2.  "
    +- "**Laziness.**  If a string of lines *Ls*"
    +- " constitute a [block\n    quote](#block-quotes) with contents "
    +- "*Bs*, then the result of deleting\n    the initial [block quote marker"
    +- "] from one or\n    "
     - more lines in which the next character other than a space or tab after the
     - "\n    [block quote marker] is [paragraph continuation\n    text]"
     - " is a block quote with *Bs* as its content.\n    "
    -- "[Paragraph continuation text](@)"
    -- " is text\n    "
    +- "[Paragraph continuation text](@) is text\n    "
     - "that will be parsed as part of the content of a paragraph, but does"
     - "\n    not occur at the beginning of the paragraph.\n\n"
    -- 3.  **Consecutiveness.
    -- "**  A document cannot contain two [block\n    quotes]"
    -- " in a row unless there is a [blank line] between them.\n\n"
    +- "3.  "
    +- "**Consecutiveness.**  A document cannot contain two [block"
    +- "\n    quotes] in a row unless there is a [blank line]"
    +- " between them.\n\n"
     - "Nothing else counts as a [block quote](#block-quotes).\n"
     - "\nHere is a simple example:\n"
     - "\n"
    @@ -2253,8 +2252,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "
    > # Foo\n> bar\n"
     - "> baz\n
    \n" - "````````````````````````````````\n" -- "\n\nThe Laziness clause allows us to omit the `>` before\n" -- "[paragraph continuation text]:\n\n" +- "\n\nThe Laziness clause allows us to omit the `>` before\n[" +- "paragraph continuation text]:\n\n" - "````````````````````````````````" - " example\n" - "> # Foo\n> bar\nbaz\n.\n
    \n" @@ -2288,9 +2287,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
  • foo
  • \n\n
    \n
      \n" - "
    • bar
    • \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "For the same reason, we can't omit the `> ` in front of" -- "\nsubsequent lines of an indented or fenced code block:\n\n" +- "\n\nFor the same reason, we can't omit the `> `" +- " in front of\nsubsequent lines of an indented or fenced code block:\n\n" - "````````````````````````````````" - " example\n" - "> foo\n bar\n.\n
    \n" @@ -2313,9 +2311,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\nTo see why, note that in\n" - "\n```markdown\n> foo\n> - bar\n```" -- "\n\n" -- "the `- bar` is indented too far to start a list, and " -- "can't\nbe an indented code block because indented code blocks cannot\n" +- "\n\nthe `- bar`" +- " is indented too far to start a list, and can't\n" +- "be an indented code block because indented code blocks cannot\n" - "interrupt paragraphs, so it is [paragraph continuation text].\n\n" - "A block quote can be empty:\n" - "\n" @@ -2340,9 +2338,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "> foo\n\n> bar\n.\n
    \n

    foo

    \n" - "
    \n
    \n

    bar

    \n
    \n" - "````````````````````````````````\n" -- "\n\n(Most current Markdown implementations, including John Gruber's\n" -- "original `Markdown.pl`" -- ", will parse this example as a single block quote\n" +- "\n\n(Most current Markdown implementations, including John Gruber's\noriginal " +- "`Markdown.pl`, will parse this example as a single block quote\n" - "with two paragraphs. But it seems better to allow the author to decide\n" - "whether two block quotes or one are wanted.)\n\n" - "Consecutiveness means that if we put these block quotes together,\n" @@ -2394,8 +2391,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n

    baz

    \n" - "````````````````````````````````\n" - "\n\nIt is a consequence of the Laziness rule that any number\n" -- "of initial `>`" -- "s may be omitted on a continuation line of a\nnested block quote:\n\n" +- "of initial `>`s may be omitted on a continuation line of a\n" +- "nested block quote:\n\n" - "````````````````````````````````" - " example\n" - "> > > foo\nbar\n.\n
    \n
    \n" @@ -2411,8 +2408,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\nWhen including an indented code block in a block quote,\n" - "remember that the [block quote marker] includes\nboth the `>`" -- " and a following space of indentation. So *five spaces*" -- " are needed\nafter the `>`:\n\n" +- " and a following space of indentation. So *five spaces* are needed\n" +- "after the `>`:\n\n" - "````````````````````````````````" - " example\n" - "> code\n\n> not code\n.\n
    \n" @@ -2421,39 +2418,37 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n\n" - "## List items\n\n" -- "A [list marker](@) is a\n" -- "[bullet list marker] or an [ordered list marker].\n\n" -- "A [bullet list marker](@)\n" -- "is a `-`, `+`, or `*` character.\n\n" -- "An [ordered list marker](@)\n" -- "is a sequence of 1--9 arabic digits (`0-9`)" -- ", followed by either a\n`.` character or a `)`" -- " character. (The reason for the length\n" +- "A [list marker](@) is a\n[bullet list marker]" +- " or an [ordered list marker].\n\n" +- "A [bullet list marker](@)\nis a `-`, `+`" +- ", or `*` character.\n\n" +- "An [ordered list marker](@)\nis a sequence of 1--" +- "9 arabic digits (`0-9`), followed by either a\n`.`" +- " character or a `)` character. (The reason for the length\n" - "limit is that with 10 digits we start seeing integer overflows\n" - "in some browsers.)\n\nThe following rules define [list items]:\n\n" -- 1. **Basic case. -- "** If a sequence of lines *Ls*" +- "1. " +- "**Basic case.** If a sequence of lines *Ls*" - " constitute a sequence of\n blocks *Bs*" - " starting with a character other than a space or tab, and *M* is" - "\n a list marker of width *W* followed by 1 ≤ " -- "*N*" -- " ≤ 4 spaces of indentation,\n then the result of prepending " -- "*M* and the following spaces to the first line\n of " -- "*Ls*, and indenting subsequent lines of *Ls* by *W" -- " + N* spaces, is a\n list item with *Bs*" -- " as its contents. The type of the list item\n " +- "*N* ≤ 4 spaces of indentation,\n " +- then the result of prepending *M* +- " and the following spaces to the first line\n of *Ls*" +- ", and indenting subsequent lines of *Ls* by " +- "*W + N* spaces, is a\n list item with " +- "*Bs* as its contents. The type of the list item\n " - "(bullet or ordered) is determined by the type of its list marker.\n " - "If the list item is ordered, then it is also assigned a start\n " - "number, based on the ordered list marker.\n\n Exceptions:\n\n " -- "1. When the first list item in a [list] interrupts\n" -- " a paragraph---that is, when it starts on a line that would" -- "\n otherwise count as [paragraph continuation text]---then (a)\n " -- "the lines *Ls* must not begin with a blank line, and (" -- "b) if\n " +- "1. When the first list item in a [list] interrupts\n " +- "a paragraph---that is, when it starts on a line that would\n " +- "otherwise count as [paragraph continuation text]---then (a)\n " +- the lines *Ls* +- " must not begin with a blank line, and (b) if\n " - "the list item is ordered, the start number must be 1.\n " -- "2. " -- "If any line is a [thematic break][thematic breaks] then" -- "\n that line is not a list item.\n\n" +- "2. If any line is a [thematic break][thematic breaks]" +- " then\n that line is not a list item.\n\n" - "For example, let *Ls* be the lines\n" - "\n" - "````````````````````````````````" @@ -2463,9 +2458,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    indented code\n
    \n
    \n" - "

    A block quote.

    \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "And let *M* be the marker `1.`, and *N*" -- " = 2. Then rule #1 says\n" +- "\n\nAnd let *M* be the marker `1.`, and " +- "*N* = 2. Then rule #1 says\n" - "that the following is an ordered list item with start number 1,\n" - "and the same contents as *Ls*:\n\n" - "````````````````````````````````" @@ -2510,8 +2504,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    one

    \n

    two

    \n\n" - "\n" - "````````````````````````````````\n" -- "\n\n" -- "It is tempting to think of this in terms of columns: the continuation\n" +- "\n\nIt is tempting to think of this in terms of columns: the continuation" +- "\n" - blocks must be indented at least to the column of the first character other than - "\n" - "a space or tab after the list marker. " @@ -2527,15 +2521,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    one

    \n

    two

    \n\n" - "\n
    \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "Here `two` occurs in the same column as the list marker `1.`,\n" -- "but is actually contained in the list item, because there is\n" +- "\n\nHere `two` occurs in the same column as the list marker " +- "`1.`,\nbut is actually contained in the list item, because there is\n" - "sufficient indentation after the last containing blockquote marker.\n\n" -- "The converse is also possible. " -- "In the following example, the word `two`" -- "\n" -- "occurs far to the right of the initial text of the list item, `" -- "one`, but\n" +- "The converse is also possible. In the following example, the word " +- "`two`\n" +- "occurs far to the right of the initial text of the list item, " +- "`one`, but\n" - "it is not considered part of the list item, because it is not indented" - "\nfar enough past the blockquote marker:\n\n" - "````````````````````````````````" @@ -2544,9 +2536,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n
      \n
    • one
    • \n
    \n" - "

    two

    \n
    \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "Note that at least one space or tab is needed between the list marker and\n" -- "any following content, so these are not list items:\n\n" +- "\n\nNote that at least one space or tab is needed between the list marker and" +- "\nany following content, so these are not list items:\n\n" - "````````````````````````````````" - " example\n" - "-one\n\n2.two\n.\n

    -one

    \n" @@ -2610,16 +2601,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " example\n" - "-1. not ok\n.\n

    -1. not ok

    \n" - "````````````````````````````````\n" -- "\n\n\n2. **Item starting with indented code." -- "** If a sequence of lines *Ls*" -- "\n constitute a sequence of blocks *Bs*" -- " starting with an indented code\n block, and *M*" -- " is a list marker of width *W*" -- " followed by\n one space of indentation, then the result of prepending " -- "*M* and the\n following space to the first line of " -- "*Ls*, and indenting subsequent lines\n of *Ls*" -- " by *W + 1* spaces, is a list item with *Bs" -- "* as its contents.\n " +- "\n\n\n2. **Item starting with indented code.**" +- " If a sequence of lines *Ls*\n " +- constitute a sequence of blocks *Bs* starting with an indented code +- "\n block, and *M* is a list marker of width " +- "*W* followed by\n " +- "one space of indentation, then the result of prepending *M* and the" +- "\n following space to the first line of *Ls*" +- ", and indenting subsequent lines\n of *Ls* by " +- "*W + 1* spaces, is a list item with *Bs*" +- " as its contents.\n " - "If a line is empty, then it need not be indented. " - "The type of the\n " - list item (bullet or ordered) is determined by the type of its list @@ -2644,9 +2635,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    bar\n
    \n\n" - "\n" - "````````````````````````````````\n" -- "\n\n" -- If the *first* block in the list item is an indented code block -- ",\nthen by rule #2, the contents must be preceded by *one*" +- "\n\nIf the *first*" +- " block in the list item is an indented code block,\n" +- "then by rule #2, the contents must be preceded by *one*" - " space of indentation\nafter the list marker:\n\n" - "````````````````````````````````" - " example\n" @@ -2706,17 +2697,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n

    bar

    \n\n" - "\n" - "````````````````````````````````\n" -- "\n\n3. **Item starting with a blank line." -- "** If a sequence of lines *Ls*" -- "\n starting with a single [blank line] constitute a (possibly empty)" -- "\n sequence of blocks *Bs*, and *M*" -- " is a list marker of width *W*" -- ",\n then the result of prepending *M*" -- " to the first line of *Ls*" -- ", and\n preceding subsequent lines of *Ls* by " -- "*W + 1*" -- " spaces of indentation, is a\n list item with *Bs*" -- " as its contents.\n " +- "\n\n3. **Item starting with a blank line.**" +- " If a sequence of lines *Ls*\n starting with a single [" +- "blank line] constitute a (possibly empty)\n sequence of blocks *Bs*" +- ", and *M* is a list marker of width *W*,\n " +- "then the result of prepending *M* to the first line of " +- "*Ls*, and\n preceding subsequent lines of *Ls* by " +- "*W + 1* spaces of indentation, is a\n " +- "list item with *Bs* as its contents.\n " - "If a line is empty, then it need not be indented. " - "The type of the\n " - list item (bullet or ordered) is determined by the type of its list @@ -2741,8 +2729,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n" - "````````````````````````````````\n" - "\n\nA list item can begin with at most one blank line.\n" -- "In the following example, `foo`" -- " is not part of the list\nitem:\n\n" +- "In the following example, `foo` is not part of the list\n" +- "item:\n\n" - "````````````````````````````````" - " example\n" - "-\n\n foo\n.\n
      \n
    • \n
    \n" @@ -2755,9 +2743,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- foo\n-\n- bar\n.\n
      \n
    • foo
    • \n" - "
    • \n
    • bar
    • \n
    \n" - "````````````````````````````````\n" -- "\n\n" -- "It does not matter whether there are spaces or tabs following the [list marker]:\n" -- "\n" +- "\n\nIt does not matter whether there are spaces or tabs following the [list marker" +- "]:\n\n" - "````````````````````````````````" - " example\n" - "- foo\n- \n- bar\n.\n
      \n" @@ -2784,9 +2771,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo\n*\n\nfoo\n1.\n.\n

      foo\n*

      \n" - "

      foo\n1.

      \n" - "````````````````````````````````\n" -- "\n\n4. **Indentation." -- "** If a sequence of lines *Ls*" -- " constitutes a list item\n " +- "\n\n4. **Indentation.** If a sequence of lines " +- "*Ls* constitutes a list item\n " - "according to rule #1, #2, or #3, then the result" - " of preceding each line\n of *Ls*" - " by up to three spaces of indentation (the same for each line) also" @@ -2833,8 +2819,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " indented code\n\n > A block quote.\n" - "
    \n" - "````````````````````````````````\n" -- "\n\n\n5. **Laziness." -- "** If a string of lines *Ls* constitute a " +- "\n\n\n5. **Laziness.**" +- " If a string of lines *Ls* constitute a " - "[list\n item](#list-items) with contents *Bs*" - ", then the result of deleting\n " - "some or all of the indentation from one or more lines in which the\n " @@ -2876,14 +2862,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "continued here.

    \n\n\n\n" - "\n" - "````````````````````````````````\n" -- "\n\n\n6. **That's all." -- "** Nothing that is not counted as a list item by rules\n #1" -- "--5 counts as a [list item](#list-items).\n\n" -- "The rules for sublists follow from the general rules\n" -- "[above][List items]. A sublist must be indented the same number" -- "\nof spaces of indentation a paragraph would need to be in order to be included" -- "\nin the list item.\n\n" -- "So, in this case we need two spaces indent:\n" +- "\n\n\n6. **That's all.**" +- " Nothing that is not counted as a list item by rules\n #1--" +- "5 counts as a [list item](#list-items).\n\n" +- "The rules for sublists follow from the general rules\n[above][List items" +- "]. A sublist must be indented the same number\n" +- "of spaces of indentation a paragraph would need to be in order to be included\n" +- "in the list item.\n\nSo, in this case we need two spaces indent:\n" - "\n" - "````````````````````````````````" - " example\n" @@ -2943,20 +2928,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "### Motivation\n\n" - "John Gruber's Markdown spec says the following about list items:\n\n" - "1. " -- "\"List markers typically start at the left margin, but may be indented\n" -- " by up to three spaces. List markers must be followed by one or more" +- "\"List markers typically start at the left margin, but may be indented" +- "\n " +- by up to three spaces. List markers must be followed by one or more - "\n spaces or a tab.\"\n\n" - "2. " -- "\"To make lists look nice, you can wrap items with hanging indents....\n" -- " But if you don't want to, you don't have to.\"\n\n" -- "3. \"List items may consist of multiple paragraphs. Each subsequent\n" -- " paragraph in a list item must be indented by either 4 spaces or" -- " one\n tab.\"\n\n" -- "4. \"It looks nice if you indent every line of the subsequent paragraphs,\n" -- " but here again, Markdown will allow you to be lazy.\"\n\n" +- "\"To make lists look nice, you can wrap items with hanging indents....\n " +- "But if you don't want to, you don't have to.\"\n\n" +- "3. " +- "\"List items may consist of multiple paragraphs. Each subsequent\n " +- paragraph in a list item must be indented by either 4 spaces or one +- "\n tab.\"\n\n" +- "4. " +- "\"It looks nice if you indent every line of the subsequent paragraphs,\n " +- "but here again, Markdown will allow you to be lazy.\"\n\n" - "5. " -- "\"To put a blockquote within a list item, the blockquote's `>`\n" -- " delimiters need to be indented.\"\n\n" +- "\"To put a blockquote within a list item, the blockquote's `>`" +- "\n delimiters need to be indented.\"\n\n" - "6. " - "\"To put a code block within a list item, the code block needs to be" - "\n indented twice — 8 spaces or two tabs.\"\n\n" @@ -2970,15 +2958,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that a block quote must be indented, but not by how much; however" - ", the\nexample given has four spaces indentation. Although nothing is said\n" - "about other kinds of block-level content, it is certainly reasonable to\n" -- infer that *all* -- " block elements under a list item, including other\n" +- "infer that *all* block elements under a list item, including other\n" - "lists, must be indented four spaces. This principle has been called the" - "\n*four-space rule*.\n\n" - "The four-space rule is clear and principled, and if the reference\n" -- "implementation `Markdown.pl`" -- " had followed it, it probably would have\n" -- "become the standard. However, `Markdown.pl`" -- " allowed paragraphs and\n" +- "implementation `Markdown.pl` had followed it, it probably would have\n" +- "become the standard. However, `Markdown.pl` allowed paragraphs and\n" - "sublists to start with only two spaces indentation, at least on the\n" - "outer level. Worse, its behavior was inconsistent: a sublist of an\n" - "outer-level list needed two spaces indentation, but a sublist of this\n" @@ -2993,8 +2978,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "is no way to give a spec for list items that will be guaranteed not\n" - "to break any existing documents. However, the spec given here should\n" - "correctly handle lists formatted with either the four-space rule or\n" -- "the more forgiving `Markdown.pl`" -- " behavior, provided they are laid out\n" +- "the more forgiving `Markdown.pl` behavior, provided they are laid out\n" - "in a way that is natural for a human to read.\n\n" - "The strategy here is to let the width and indentation of the list marker\n" - "determine the indentation necessary for blocks to fall under the list\n" @@ -3010,49 +2994,51 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "unnatural. It is quite unintuitive that\n\n" - "``` markdown\n- foo\n\n bar\n\n - baz\n```" - "\n\nshould be parsed as two lists with an intervening paragraph,\n" -- "\n``` html\n
      \n
    • foo
    • \n
    \n" +- "\n``` html\n" +- "
      \n
    • foo
    • \n
    \n" - "

    bar

    \n
      \n
    • baz
    • \n" - "
    \n```\n\n" - "as the four-space rule demands, rather than a single list,\n" -- "\n``` html\n
      \n
    • \n

      foo

      \n" -- "

      bar

      \n
        \n
      • baz
      • \n" -- "
      \n
    • \n
    \n```\n\n" +- "\n``` html\n" +- "
      \n
    • \n

      foo

      \n

      bar

      \n" +- "
        \n
      • baz
      • \n
      \n
    • \n" +- "
    \n```\n\n" - "The choice of four spaces is arbitrary. " - "It can be learned, but it is\n" - "not likely to be guessed, and it trips up beginners regularly.\n\n" -- "Would it help to adopt a two-space rule? " -- "The problem is that such\n" -- "a rule, together with the rule allowing up to three spaces of indentation for\n" -- "the initial list marker, allows text that is indented *less than* the" -- "\noriginal list marker to be included in the list item. For example,\n" +- Would it help to adopt a two-space rule? The problem is that such +- "\na rule, together with the rule allowing up to three spaces of indentation for" +- "\nthe initial list marker, allows text that is indented *less than*" +- " the\noriginal list marker to be included in the list item. For example,\n" - "`Markdown.pl` parses\n\n" - "``` markdown\n - one\n\n two\n```" - "\n\nas a single list item, with `two` a continuation paragraph:\n" -- "\n``` html\n
      \n
    • \n

      one

      \n" -- "

      two

      \n
    • \n
    \n```\n\n" -- "and similarly\n" +- "\n``` html\n" +- "
      \n
    • \n

      one

      \n

      two

      \n" +- "
    • \n
    \n```\n\nand similarly\n" - "\n``` markdown\n> - one\n>\n> two\n```" - "\n\nas\n" -- "\n``` html\n
    \n
      \n
    • \n" -- "

      one

      \n

      two

      \n
    • \n" -- "
    \n
    \n```\n\nThis is extremely unintuitive.\n" +- "\n``` html\n" +- "
    \n
      \n
    • \n

      one

      \n" +- "

      two

      \n
    • \n
    \n
    \n" +- "```\n\nThis is extremely unintuitive.\n" - "\nRather than requiring a fixed indent from the margin, we could require\n" - "a fixed indent (say, two spaces, or even one space) from the" - " list marker (which\n" - "may itself be indented). This proposal would remove the last anomaly\n" - "discussed. Unlike the spec presented above, it would count the following\n" -- "as a list item with a subparagraph, even though the paragraph `bar`" -- "\nis not indented as far as the first paragraph `foo`:\n\n" +- "as a list item with a subparagraph, even though the paragraph `bar`\n" +- "is not indented as far as the first paragraph `foo`:\n\n" - "``` markdown\n 10. foo\n\n bar \n```" -- "\n\n" -- "Arguably this text does read like a list item with `bar` as a" -- " subparagraph,\n" +- "\n\nArguably this text does read like a list item with `bar`" +- " as a subparagraph,\n" - "which may count in favor of the proposal. " - "However, on this proposal indented\n" - "code would have to be indented six spaces after the list marker. " - "And this\nwould break a lot of existing Markdown, which has the pattern:\n\n" - "``` markdown\n1. foo\n\n indented code\n```" -- "\n\nwhere the code is indented eight spaces. " +- "\n\n" +- "where the code is indented eight spaces. " - "The spec above, by contrast, will\n" - "parse this text as expected, since the code block's indentation is measured\n" - "from the beginning of `foo`.\n\n" @@ -3073,21 +3059,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "if they begin with a [list marker] of the same type.\n" - "Two list markers are of the\n" - "same type if (a) they are bullet list markers using the same character\n" -- "(`-`, `+`, or `*`) or (b) they are" -- " ordered list numbers with the same\ndelimiter (either `.` or `)`).\n\n" +- "(`-`, `+`, or `*`" +- ") or (b) they are ordered list numbers with the same\n" +- "delimiter (either `.` or `)`).\n\n" - "A list is an [ordered list](@)\n" - "if its constituent list items begin with\n[ordered list markers], and a\n" -- "[bullet list](@)" -- " if its constituent list\nitems begin with [bullet list markers].\n\n" -- "The [start number](@)\n" -- "of an [ordered list] is determined by the list number of\n" +- "[bullet list](@) if its constituent list\nitems begin with [" +- "bullet list markers].\n\n" +- "The [start number](@)\nof an [ordered list]" +- " is determined by the list number of\n" - "its initial list item. The numbers of subsequent list items are\n" - "disregarded.\n\n" - "A list is [loose](@) if any of its constituent\n" - "list items are separated by blank lines, or if any of its constituent\n" - "list items directly contain two block-level elements with a blank line\n" -- "between them. Otherwise a list is [tight](@)" -- ".\n(The difference in HTML output is that paragraphs in a loose list are\n" +- "between them. Otherwise a list is [tight](@).\n" +- "(The difference in HTML output is that paragraphs in a loose list are\n" - "wrapped in `

    ` tags, while paragraphs in a tight list are not.)\n\n" - "Changing the bullet or ordered list delimiter starts a new list:\n" - "\n" @@ -3112,38 +3099,43 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

      \n
    • bar
    • \n
    • baz
    • \n" - "
    \n" - "````````````````````````````````\n" -- "\n" -- "`Markdown.pl` does not allow this, through fear of triggering a list\n" -- "via a numeral in a hard-wrapped line:\n\n" -- "``` markdown\nThe number of windows in my house is\n14. " +- "\n`Markdown.pl` does not allow this, through fear of triggering a list" +- "\nvia a numeral in a hard-wrapped line:\n\n" +- "``` markdown\n" +- "The number of windows in my house is\n14. " - "The number of doors is 6.\n```\n\n" -- "Oddly, though, `Markdown.pl` *does* allow a blockquote" -- " to\ninterrupt a paragraph, even though the same considerations might\napply.\n\n" +- "Oddly, though, `Markdown.pl` *does*" +- " allow a blockquote to\ninterrupt a paragraph, even though the same considerations might" +- "\napply.\n\n" - "In CommonMark, we do allow lists to interrupt paragraphs, for\n" - "two reasons. First, it is natural and not uncommon for people\n" - "to start lists without blank lines:\n\n" -- "``` markdown\nI need to buy\n- new shoes\n- a coat\n" +- "``` markdown\n" +- "I need to buy\n- new shoes\n- a coat\n" - "- a plane ticket\n```\n\nSecond, we are attracted to a\n\n" -- "> [principle of uniformity](@):\n" -- "> if a chunk of text has a certain\n> " +- "> " +- "[principle of uniformity](@):\n> " +- "if a chunk of text has a certain\n> " - "meaning, it will continue to have the same meaning when put into a" - "\n> container block (such as a list item or blockquote).\n\n" -- "(Indeed, the spec for [list items] and [block quotes] " -- "presupposes\nthis principle.) This principle implies that if\n\n" -- "``` markdown\n * I need to buy\n - new shoes\n" +- "(Indeed, the spec for [list items] and [block quotes]" +- " presupposes\nthis principle.) This principle implies that if\n\n" +- "``` markdown\n" +- " * I need to buy\n - new shoes\n" - " - a coat\n - a plane ticket\n```\n\n" - "is a list item containing a paragraph followed by a nested sublist,\n" - "as all Markdown implementations agree it is (though the paragraph\n" -- "may be rendered without `

    `" -- " tags, since the list is \"tight\"),\nthen\n\n" -- "``` markdown\nI need to buy\n- new shoes\n- a coat\n" +- "may be rendered without `

    ` tags, since the list is \"tight\"),\n" +- "then\n\n" +- "``` markdown\n" +- "I need to buy\n- new shoes\n- a coat\n" - "- a plane ticket\n```\n\n" - "by itself should be a paragraph followed by a nested sublist.\n" - "\nSince it is well established Markdown practice to allow lists to\n" - "interrupt paragraphs inside list items, the [principle of\nuniformity]" - " requires us to allow this outside list items as\nwell. (" -- "[reStructuredText](https://docutils.sourceforge.net/rst.html)" -- "\ntakes a different approach, requiring blank lines before lists\n" +- "[reStructuredText](https://docutils.sourceforge.net/rst.html)\n" +- "takes a different approach, requiring blank lines before lists\n" - "even inside other list items.)\n\n" - "In order to solve the problem of unwanted lists in paragraphs with\n" - "hard-wrapped numerals, we allow only lists starting with `1` to" @@ -3202,8 +3194,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n\n\n\n" - "
    code\n
    \n" - "````````````````````````````````\n" -- "\n\nList items need not be indented to the same level. " -- "The following\nlist items will be treated as items at the same list level,\n" +- "\n\nList items need not be indented to the same level. The following" +- "\nlist items will be treated as items at the same list level,\n" - "since none is indented enough to belong to the previous list\nitem:\n\n" - "````````````````````````````````" - " example\n" @@ -3222,9 +3214,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\n" - "````````````````````````````````\n" - "\nNote, however, that list items may not be preceded by more than\n" -- "three spaces of indentation. Here `- e`" -- " is treated as a paragraph continuation\n" -- "line, because it is indented more than three spaces:\n\n" +- "three spaces of indentation. Here `- e` is treated as a paragraph continuation" +- "\nline, because it is indented more than three spaces:\n\n" - "````````````````````````````````" - " example\n" - "- a\n - b\n - c\n - d\n" @@ -3232,9 +3223,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
  • b
  • \n
  • c
  • \n
  • d\n" - "- e
  • \n\n" - "````````````````````````````````\n" -- "\n" -- "And here, `3. c` is treated as in indented code block" -- ",\nbecause it is indented four spaces and preceded by a\nblank line.\n\n" +- "\nAnd here, `3. c`" +- " is treated as in indented code block,\n" +- "because it is indented four spaces and preceded by a\nblank line.\n\n" - "````````````````````````````````" - " example\n" - "1. a\n\n 2. b\n\n 3. c\n" @@ -3360,21 +3351,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`hi`lo`\n.\n" - "

    hilo`

    \n" - "````````````````````````````````\n" -- "\n" -- "`hi` is parsed as code, leaving the backtick at the end as" -- " a literal\nbacktick.\n\n\n\n" +- "\n`hi`" +- " is parsed as code, leaving the backtick at the end as a literal\n" +- "backtick.\n\n\n\n" - "## Code spans\n\n" - "A [backtick string](@)\n" -- "is a string of one or more backtick characters (`` ` ``) that" -- " is neither\npreceded nor followed by a backtick.\n\n" +- "is a string of one or more backtick characters (`` ` ``" +- ") that is neither\npreceded nor followed by a backtick.\n\n" - "A [code span](@) begins with a backtick string and ends with" - "\n" - a backtick string of equal length. The contents of the code span are - "\nthe characters between these two backtick strings, normalized in the\n" - "following ways:\n\n" - "- First, [line endings] are converted to [spaces].\n" -- "- If the resulting string both begins *and* ends with a [space]\n" -- " character, but does not consist entirely of [space]\n " +- "- If the resulting string both begins *and* ends with a [space]" +- "\n character, but does not consist entirely of [space]\n " - "characters, a single [space] character is removed from the\n " - "front and back. This allows you to include code that begins\n " - "or ends with backtick characters, which must be separated by\n " @@ -3440,12 +3431,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`foo bar \nbaz`\n.\n" - "

    foo bar baz

    \n" - "````````````````````````````````\n" -- "\nNote that browsers will typically collapse consecutive spaces\n" -- "when rendering ``" +- "\nNote that browsers will typically collapse consecutive spaces\nwhen rendering ``" - " elements, so it is recommended that\nthe following CSS be used:\n\n " - "code{white-space: pre-wrap;}\n" -- "\n\nNote that backslash escapes do not work in code spans. " -- "All backslashes\nare treated literally:\n\n" +- "\n\nNote that backslash escapes do not work in code spans. All backslashes" +- "\nare treated literally:\n\n" - "````````````````````````````````" - " example\n" - "`foo\\`bar`\n.\n" @@ -3468,8 +3458,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\nCode span backticks have higher precedence than any other inline\n" - "constructs except HTML tags and autolinks. " - "Thus, for example, this is\n" -- "not parsed as emphasized text, since the second `*` is part of a" -- " code\nspan:\n\n" +- "not parsed as emphasized text, since the second `*`" +- " is part of a code\nspan:\n\n" - "````````````````````````````````" - " example\n" - "*foo`*`\n.\n

    *foo*

    \n" @@ -3531,66 +3521,71 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n" - "## Emphasis and strong emphasis\n\n" -- "John Gruber's original [Markdown syntax\n" -- "description](https://daringfireball.net/projects/markdown/syntax#em" -- ") says:\n\n" -- "> Markdown treats asterisks (`*`) and underscores (`_`) as indicators of" +- "John Gruber's original " +- "[Markdown syntax\ndescription" +- "](https://daringfireball.net/projects/markdown/syntax#em)" +- " says:\n\n" +- "> " +- "Markdown treats asterisks (`*`) and underscores (`_`) as indicators of" - "\n> emphasis. Text wrapped with one `*` or `_`" - " will be wrapped with an HTML\n> `` tag; double " -- "`*`'s or `_`'s will be wrapped with an HTML" -- " ``\n> tag.\n\n" +- "`*`'s or `_`'" +- "s will be wrapped with an HTML ``\n> tag.\n\n" - "This is enough for most users, but these rules leave much undecided,\n" - "especially when it comes to nested emphasis. The original\n`Markdown.pl`" - " test suite makes it clear that triple `***` and\n`___`" - " delimiters can be used for strong emphasis, and most\n" - "implementations have also allowed the following patterns:\n\n" -- "``` markdown\n***strong emph***\n***strong** in emph*\n" +- "``` markdown\n" +- "***strong emph***\n***strong** in emph*\n" - "***emph* in strong**\n**in strong *emph***\n" - "*in emph **strong***\n```\n\n" - "The following patterns are less widely supported, but the intent\n" - "is clear and they are useful (especially in contexts like bibliography\nentries):\n\n" -- "``` markdown\n*emph *with emph* in it*\n" +- "``` markdown\n" +- "*emph *with emph* in it*\n" - "**strong **with strong** in it**\n```\n\n" -- "Many implementations have also restricted intraword emphasis to\n" -- "the `*`" +- "Many implementations have also restricted intraword emphasis to\nthe `*`" - " forms, to avoid unwanted emphasis in words containing\n" - "internal underscores. (It is best practice to put these in code\n" - "spans, but users often do not.)\n\n" -- "``` markdown\ninternal emphasis: foo*bar*baz\n" +- "``` markdown\n" +- "internal emphasis: foo*bar*baz\n" - "no emphasis: foo_bar_baz\n```\n\n" - "The rules given below capture all of these patterns, while allowing\n" - "for efficient parsing strategies that do not backtrack.\n\n" - "First, some definitions. A [delimiter run](@) is either\n" -- "a sequence of one or more `*`" -- " characters that is not preceded or\n" -- "followed by a non-backslash-escaped `*` character, or a" -- " sequence\nof one or more `_`" +- "a sequence of one or more `*` characters that is not preceded or\n" +- "followed by a non-backslash-escaped `*`" +- " character, or a sequence\nof one or more `_`" - " characters that is not preceded or followed by\na non-backslash-escaped " - "`_` character.\n\n" -- "A [left-flanking delimiter run](@) is\n" -- "a [delimiter run] that is (1) not followed by [Unicode whitespace" -- "],\nand either (2a) not followed by a [Unicode punctuation character]" -- ", or\n(2b) followed by a [Unicode punctuation character] and" -- "\npreceded by [Unicode whitespace] or a [Unicode punctuation character].\n" +- "A [left-flanking delimiter run](@) is\na [delimiter run" +- "] that is (1) not followed by [Unicode whitespace],\n" +- "and either (2a) not followed by a [Unicode punctuation character], or" +- "\n(2b) followed by a [Unicode punctuation character] and\n" +- "preceded by [Unicode whitespace] or a [Unicode punctuation character].\n" - "For purposes of this definition, the beginning and the end of\n" - "the line count as Unicode whitespace.\n\n" -- "A [right-flanking delimiter run](@) is\n" -- "a [delimiter run] that is (1) not preceded by [Unicode whitespace" -- "],\nand either (2a) not preceded by a [Unicode punctuation character]" -- ", or\n(2b) preceded by a [Unicode punctuation character] and" -- "\nfollowed by [Unicode whitespace] or a [Unicode punctuation character].\n" +- "A [right-flanking delimiter run](@) is\na [delimiter run" +- "] that is (1) not preceded by [Unicode whitespace],\n" +- "and either (2a) not preceded by a [Unicode punctuation character], or" +- "\n(2b) preceded by a [Unicode punctuation character] and\n" +- "followed by [Unicode whitespace] or a [Unicode punctuation character].\n" - "For purposes of this definition, the beginning and the end of\n" - "the line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n" -- " - left-flanking but not right-flanking:\n\n ```\n" -- " ***abc\n _abc\n **\"abc\"\n " -- " _\"abc\"\n ```\n\n" -- " - right-flanking but not left-flanking:\n\n ```\n abc***\n" -- " abc_\n \"abc\"**\n \"abc\"_\n ```\n\n" -- " - Both left and right-flanking:\n\n ```\n abc***def\n" -- " \"abc\"_\"def\"\n ```\n\n" -- " - Neither left nor right-flanking:\n\n ```\n abc *** def\n" -- " a _ b\n ```\n\n" -- "(The idea of distinguishing left-flanking and right-flanking\n" +- " - left-flanking but not right-flanking:\n" +- "\n ```\n ***abc\n _abc\n **\"abc\"\n" +- " _\"abc\"\n ```\n\n" +- " - right-flanking but not left-flanking:\n" +- "\n ```\n abc***\n abc_\n \"abc\"**\n " +- "\"abc\"_\n ```\n\n" +- " - Both left and right-flanking:\n" +- "\n ```\n abc***def\n \"abc\"_\"def\"\n" +- " ```\n\n" +- " - Neither left nor right-flanking:\n" +- "\n ```\n abc *** def\n a _ b\n ```" +- "\n\n(The idea of distinguishing left-flanking and right-flanking\n" - "delimiter runs based on the character before and the character\n" - "after comes from Roopesh Chander's\n" - "[vfmd](https://web.archive.org/web/20220608143320" @@ -3600,50 +3595,57 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " and its rules for distinguishing left- and right-flanking runs\n" - "are a bit more complex than the ones given here.)\n\n" - "The following rules define emphasis and strong emphasis:\n\n" -- "1. A single `*` character [can open emphasis](@)\n" -- " iff (if and only if) it is part of a [left-" -- "flanking delimiter run].\n\n" -- "2. A single `_` character [can open emphasis] iff\n" -- " it is part of a [left-flanking delimiter run]\n " +- "1. " +- "A single `*` character [can open emphasis](@)\n " +- "iff (if and only if) it is part of a [" +- "left-flanking delimiter run].\n\n" +- "2. " +- "A single `_` character [can open emphasis] iff\n " +- "it is part of a [left-flanking delimiter run]\n " - "and either (a) not part of a [right-flanking delimiter run]" - "\n or (b) part of a [right-flanking delimiter run]" - "\n preceded by a [Unicode punctuation character].\n\n" -- "3. A single `*` character [can close emphasis](@)\n" -- " iff it is part of a [right-flanking delimiter run].\n\n" -- "4. A single `_` character [can close emphasis] iff\n" -- " it is part of a [right-flanking delimiter run]\n " +- "3. " +- "A single `*` character [can close emphasis](@)\n " +- "iff it is part of a [right-flanking delimiter run].\n\n" +- "4. " +- "A single `_` character [can close emphasis] iff\n " +- "it is part of a [right-flanking delimiter run]\n " - "and either (a) not part of a [left-flanking delimiter run]" - "\n or (b) part of a [left-flanking delimiter run]" - "\n followed by a [Unicode punctuation character].\n\n" -- "5. A double `**` [can open strong emphasis](@)\n" -- " iff it is part of a [left-flanking delimiter run].\n\n" -- "6. A double `__` [can open strong emphasis] iff\n" -- " it is part of a [left-flanking delimiter run]\n " +- "5. " +- "A double `**` [can open strong emphasis](@)\n " +- "iff it is part of a [left-flanking delimiter run].\n\n" +- "6. " +- "A double `__` [can open strong emphasis] iff\n " +- "it is part of a [left-flanking delimiter run]\n " - "and either (a) not part of a [right-flanking delimiter run]" - "\n or (b) part of a [right-flanking delimiter run]" - "\n preceded by a [Unicode punctuation character].\n\n" -- "7. A double `**` [can close strong emphasis](@)\n" -- " iff it is part of a [right-flanking delimiter run].\n\n" -- "8. A double `__` [can close strong emphasis] iff\n" -- " it is part of a [right-flanking delimiter run]\n " +- "7. " +- "A double `**` [can close strong emphasis](@)\n " +- "iff it is part of a [right-flanking delimiter run].\n\n" +- "8. " +- "A double `__` [can close strong emphasis] iff\n " +- "it is part of a [right-flanking delimiter run]\n " - "and either (a) not part of a [left-flanking delimiter run]" - "\n or (b) part of a [left-flanking delimiter run]" - "\n followed by a [Unicode punctuation character].\n\n" - "9. " -- "Emphasis begins with a delimiter that [can open emphasis] and ends\n" -- " with a delimiter that [can close emphasis], and that uses the same" -- "\n character (`_` or `*`" -- ") as the opening delimiter. The\n " -- "opening and closing delimiters must belong to separate\n [delimiter runs]" -- ". If one of the delimiters can both\n " +- "Emphasis begins with a delimiter that [can open emphasis] and ends\n " +- "with a delimiter that [can close emphasis], and that uses the same\n " +- "character (`_` or `*`) as the opening delimiter. The" +- "\n opening and closing delimiters must belong to separate\n [delimiter runs" +- "]. If one of the delimiters can both\n " - "open and close emphasis, then the sum of the lengths of the\n " - "delimiter runs containing the opening and closing delimiters\n " - "must not be a multiple of 3 unless both lengths are\n " - "multiples of 3.\n\n" -- "10. Strong emphasis begins with a delimiter that\n" -- " [can open strong emphasis] and ends with a delimiter that\n [" -- "can close strong emphasis], and that uses the same character\n (`_`" -- " or `*`" +- "10. " +- "Strong emphasis begins with a delimiter that\n [can open strong emphasis]" +- " and ends with a delimiter that\n [can close strong emphasis]" +- ", and that uses the same character\n (`_` or `*`" - ") as the opening delimiter. The\n " - "opening and closing delimiters must belong to separate\n [delimiter runs]" - ". If one of the delimiters can both open\n " @@ -3652,40 +3654,44 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "delimiters must not be a multiple of 3 unless both lengths\n " - "are multiples of 3.\n\n" - "11. " -- "A literal `*` character cannot occur at the beginning or end of\n" -- " `*`-delimited emphasis or `**`-delimited strong" -- " emphasis, unless it\n is backslash-escaped.\n\n" -- "12. A literal `_` character cannot occur at the beginning or end of\n" -- " `_`-delimited emphasis or `__`-delimited strong emphasis" -- ", unless it\n is backslash-escaped.\n\n" +- "A literal `*` character cannot occur at the beginning or end of\n " +- "`*`-delimited emphasis or `**`" +- "-delimited strong emphasis, unless it\n is backslash-escaped.\n\n" +- "12. " +- "A literal `_` character cannot occur at the beginning or end of\n " +- "`_`-delimited emphasis or `__`" +- "-delimited strong emphasis, unless it\n is backslash-escaped.\n\n" - "Where rules 1--12 above are compatible with multiple parsings,\n" - "the following principles resolve ambiguity:\n\n" -- "13. The number of nestings should be minimized. Thus, for example,\n" -- " an interpretation `...` is always preferred to\n " +- "13. " +- "The number of nestings should be minimized. Thus, for example,\n " +- "an interpretation `...` is always preferred to\n " - "`...`.\n\n" - "14. " -- "An interpretation `...` is always\n" -- " preferred to `...`.\n\n" -- "15. When two potential emphasis or strong emphasis spans overlap,\n" -- " so that the second begins before the first ends and ends after\n " +- "An interpretation `...` is always\n " +- "preferred to `...`.\n\n" +- "15. " +- "When two potential emphasis or strong emphasis spans overlap,\n " +- "so that the second begins before the first ends and ends after\n " - "the first ends, the first takes precedence. Thus, for example,\n " -- "`*foo _bar* baz_` is parsed as `foo" -- " _bar baz_` rather\n than " -- "`*foo bar* baz`.\n\n" -- "16. When there are two potential emphasis or strong emphasis spans\n" -- " with the same closing delimiter, the shorter one (the one that\n " +- "`*foo _bar* baz_` is parsed as " +- "`foo _bar baz_` rather\n " +- "than `*foo bar* baz`.\n\n" +- "16. " +- "When there are two potential emphasis or strong emphasis spans\n " +- "with the same closing delimiter, the shorter one (the one that\n " - "opens later) takes precedence. Thus, for example,\n " -- "`**foo **bar baz**` is parsed as `**foo bar baz`\n rather than " +- "`**foo **bar baz**` is parsed as " +- "`**foo bar baz`\n rather than " - "`foo **bar baz`.\n\n" - "17. " -- "Inline code spans, links, images, and HTML tags group more tightly\n" -- " than emphasis. So, when there is a choice between an interpretation" -- "\n that contains one of these elements and one that does not, the" -- "\n former always wins. Thus, for example, " +- "Inline code spans, links, images, and HTML tags group more tightly\n " +- "than emphasis. So, when there is a choice between an interpretation\n " +- "that contains one of these elements and one that does not, the\n " +- "former always wins. Thus, for example, " - "`*[foo*](bar)` is\n parsed as " -- "`*
    foo*`" -- " rather than as\n `[foo](bar)`.\n\n" +- "`*foo*` rather than as" +- "\n `[foo](bar)`.\n\n" - "These rules can be illustrated through a series of examples.\n\nRule 1:\n" - "\n" - "````````````````````````````````" @@ -3945,9 +3951,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\n**foo bar **\n.\n

    **foo bar **

    \n" - "````````````````````````````````\n" -- "\n\n" -- "(Nor can it be interpreted as an emphasized `*foo bar *`, because" -- " of\nRule 11.)\n\n" +- "\n\n(Nor can it be interpreted as an emphasized `*foo bar *`" +- ", because of\nRule 11.)\n\n" - "This is not strong emphasis, because the second `**` is\n" - "preceded by punctuation and followed by an alphanumeric:\n\n" - "````````````````````````````````" @@ -4090,8 +4095,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foobar" - "baz

    \n```\n\n\n" - "is precluded by the condition that a delimiter that\n" -- "can both open and close (like the `*` after `foo`" -- ")\ncannot form emphasis if the sum of the lengths of\n" +- "can both open and close (like the `*` after `foo`)\n" +- "cannot form emphasis if the sum of the lengths of\n" - "the delimiter runs containing the opening and\n" - "closing delimiters is a multiple of 3 unless\n" - "both lengths are multiples of 3.\n\n\n" @@ -4102,9 +4107,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo**bar*\n.\n" - "

    foo**bar

    \n" - "````````````````````````````````\n" -- "\n\nThe same condition ensures that the following\n" -- "cases are all strong emphasis nested inside\nemphasis, even when the interior whitespace is" -- "\nomitted:\n\n\n" +- "\n\nThe same condition ensures that the following\ncases are all strong emphasis nested inside" +- "\nemphasis, even when the interior whitespace is\nomitted:\n\n\n" - "````````````````````````````````" - " example\n" - "***foo** bar*\n.\n" @@ -4122,9 +4126,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo**bar***\n.\n" - "

    foobar

    \n" - "````````````````````````````````\n" -- "\n\nWhen the lengths of the interior closing and opening\n" -- delimiter runs are *both* -- " multiples of 3, though,\nthey can match to create emphasis:\n\n" +- "\n\nWhen the lengths of the interior closing and opening\ndelimiter runs are " +- "*both* multiples of 3, though,\n" +- "they can match to create emphasis:\n\n" - "````````````````````````````````" - " example\n" - "foo***bar***baz\n.\n" @@ -4291,8 +4295,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo **_**\n.\n

    foo _

    \n" - "````````````````````````````````\n" - "\n\nNote that when delimiters do not match evenly, Rule 11 determines\n" -- "that the excess literal `*`" -- " characters will appear outside of the\nemphasis, rather than inside it:\n\n" +- "that the excess literal `*` characters will appear outside of the\n" +- "emphasis, rather than inside it:\n\n" - "````````````````````````````````" - " example\n" - "**foo*\n.\n

    *foo

    \n" @@ -4356,8 +4360,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "__foo_\n.\n

    _foo

    \n" - "````````````````````````````````\n" - "\n\nNote that when delimiters do not match evenly, Rule 12 determines\n" -- "that the excess literal `_`" -- " characters will appear outside of the\nemphasis, rather than inside it:\n\n" +- "that the excess literal `_` characters will appear outside of the\n" +- "emphasis, rather than inside it:\n\n" - "````````````````````````````````" - " example\n" - "_foo__\n.\n

    foo_

    \n" @@ -4532,59 +4536,67 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " the\ndestination and title are given immediately after the link text. In\n" - "[reference links] the destination and title are defined elsewhere in\nthe document.\n\n" - "A [link text](@) consists of a sequence of zero or more\n" -- "inline elements enclosed by square brackets (`[` and `]`" -- "). The\nfollowing rules apply:\n\n" -- "- Links may not contain other links, at any level of nesting. If\n" -- " multiple otherwise valid link definitions appear nested inside each\n " +- "inline elements enclosed by square brackets (`[` and `]`). The\n" +- "following rules apply:\n\n" +- "- " +- "Links may not contain other links, at any level of nesting. If\n " +- "multiple otherwise valid link definitions appear nested inside each\n " - "other, the inner-most definition is used.\n\n" -- "- Brackets are allowed in the [link text] only if (a)" -- " they\n " +- "- " +- "Brackets are allowed in the [link text] only if (a) they" +- "\n " - are backslash-escaped or (b) they appear as a matched pair of - " brackets,\n with an open bracket `[`" - ", a sequence of zero or more inlines, and\n " - "a close bracket `]`.\n\n" -- "- Backtick [code spans], [autolinks], and raw [HTML" -- " tags] bind more tightly\n " +- "- " +- "Backtick [code spans], [autolinks], and raw [HTML tags" +- "] bind more tightly\n " - "than the brackets in link text. Thus, for example,\n " -- "`` [foo`]` `` could not be a link text, since the second" -- " `]`\n is part of a code span.\n\n" -- "- The brackets in link text bind more tightly than markers for\n" -- " [emphasis and strong emphasis]. Thus, for example, " +- "`` [foo`]` ``" +- " could not be a link text, since the second `]`\n " +- "is part of a code span.\n\n" +- "- " +- "The brackets in link text bind more tightly than markers for\n [" +- "emphasis and strong emphasis]. Thus, for example, " - "`*[foo*](url)` is a link.\n\n" - "A [link destination](@) consists of either\n\n" -- "- a sequence of zero or more characters between an opening `<` and a\n" -- " closing `>` that contains no line endings or unescaped\n `<`" +- "- " +- "a sequence of zero or more characters between an opening `<` and a\n " +- "closing `>` that contains no line endings or unescaped\n `<`" - " or `>` characters, or\n\n" -- "- a nonempty sequence of characters that does not start with `<`,\n" -- " does not include [ASCII control characters][ASCII control character]\n or [" -- "space] character, and includes parentheses only if (a) they are\n " +- "- " +- "a nonempty sequence of characters that does not start with `<`,\n " +- "does not include [ASCII control characters][ASCII control character]\n or [space" +- "] character, and includes parentheses only if (a) they are\n " - backslash-escaped or (b) they are part of a balanced pair of - "\n unescaped parentheses.\n " - "(Implementations may impose limits on parentheses nesting to\n " - "avoid performance issues, but at least three levels of nesting\n " - "should be supported.)\n\nA [link title](@) consists of either\n\n" -- "- a sequence of zero or more characters between straight double-quote\n" -- " characters (`\"`), including a `\"`" -- " character only if it is\n backslash-escaped, or\n\n" -- "- a sequence of zero or more characters between straight single-quote\n" -- " characters (`'`), including a `'`" -- " character only if it is\n backslash-escaped, or\n\n" -- "- a sequence of zero or more characters between matching parentheses\n" -- " (`(...)`), including a `(` or `)` character only if it" -- " is\n backslash-escaped.\n\n" +- "- " +- "a sequence of zero or more characters between straight double-quote\n characters (`\"`" +- "), including a `\"` character only if it is\n " +- "backslash-escaped, or\n\n" +- "- " +- "a sequence of zero or more characters between straight single-quote\n characters (" +- "`'`), including a `'` character only if it is\n " +- "backslash-escaped, or\n\n" +- "- " +- "a sequence of zero or more characters between matching parentheses\n (`(...)`" +- "), including a `(` or `)` character only if it is\n " +- "backslash-escaped.\n\n" - "Although [link titles] may span multiple lines, they may not contain\n" - "a [blank line].\n\n" - "An [inline link](@) consists of a [link text] followed immediately" -- "\nby a left parenthesis `(`" -- ", an optional [link destination], an optional\n[link title]" -- ", and a right parenthesis `)`" -- ".\n" +- "\nby a left parenthesis `(`, an optional [link destination], an optional" +- "\n[link title], and a right parenthesis `)`.\n" - "These four components may be separated by spaces, tabs, and up to one line" - "\nending.\nIf both [link destination] and [link title]" -- " are present, they *must*" -- " be\nseparated by spaces, tabs, and up to one line ending.\n\n" -- "The link's text consists of the inlines contained\n" -- "in the [link text] (excluding the enclosing square brackets).\nThe link'" +- " are present, they *must* be\n" +- "separated by spaces, tabs, and up to one line ending.\n\n" +- "The link's text consists of the inlines contained\nin the [link text" +- "] (excluding the enclosing square brackets).\nThe link'" - "s URI consists of the link destination, excluding enclosing\n`<...>`" - " if present, with backslash-escapes in effect as described\n" - "above. The link's title consists of the link title, excluding its\n" @@ -4799,15 +4811,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    link

    \n" - "````````````````````````````````\n" -- "\n\n" -- "(Note: `Markdown.pl` did allow double quotes inside a double-quoted" -- "\ntitle, and its test suite included a test demonstrating this.\n" +- "\n\n(Note: `Markdown.pl`" +- " did allow double quotes inside a double-quoted\n" +- "title, and its test suite included a test demonstrating this.\n" - "But it is hard to see a good rationale for the extra complexity this\n" - "brings, since there are already many ways---backslash escaping,\n" - "entity and numeric character references, or using a different\n" - "quote type for the enclosing title---to write titles containing\ndouble quotes. " -- "`Markdown.pl`" -- "'s handling of titles has a number\n" +- "`Markdown.pl`'s handling of titles has a number\n" - "of other strange features. For example, it allows single-quoted\n" - "titles in inline links, but not reference links. And, in\n" - "reference links but not inline links, it allows a title to begin\nwith " @@ -4937,13 +4948,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "search=](uri)

    \n" - "````````````````````````````````\n" - "\n\nThere are three kinds of [reference link](@)s:\n" -- "[full](#full-reference-link), [collapsed](#collapsed-reference-link)" -- ",\nand [shortcut](#shortcut-reference-link).\n\n" -- "A [full reference link](@)\n" -- "consists of a [link text] immediately followed by a [link label]\n" -- "that [matches] a [link reference definition] elsewhere in the document.\n\n" -- "A [link label](@) begins with a left bracket (`[`)" -- " and ends\nwith the first right bracket (`]`" +- "[full](#full-reference-link), [collapsed](#collapsed-reference-link),\n" +- "and [shortcut](#shortcut-reference-link).\n\n" +- "A [full reference link](@)\nconsists of a [link text]" +- " immediately followed by a [link label]\nthat [matches] a [" +- "link reference definition] elsewhere in the document.\n\n" +- "A [link label](@) begins with a left bracket (`[`" +- ") and ends\nwith the first right bracket (`]`" - ") that is not backslash-escaped.\n" - "Between these brackets there must be at least one character that is not a space,\n" - "tab, or line ending.\nUnescaped square bracket characters are not allowed inside the" @@ -4952,16 +4963,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "One label [matches](@)\n" - "another just in case their normalized forms are equal. To normalize a\n" - "label, strip off the opening and closing brackets,\nperform the " -- "*Unicode case fold*" -- ", strip leading and trailing\n" +- "*Unicode case fold*, strip leading and trailing\n" - "spaces, tabs, and line endings, and collapse consecutive internal\n" - "spaces, tabs, and line endings to a single space. " - "If there are multiple\n" - "matching reference link definitions, the one that comes first in the\n" - "document is used. " - "(It is desirable in such cases to emit a warning.)\n\n" -- "The link's URI and title are provided by the matching [link\n" -- "reference definition].\n\nHere is a simple example:\n" +- "The link's URI and title are provided by the matching [link\nreference definition" +- "].\n\nHere is a simple example:\n" - "\n" - "````````````````````````````````" - " example\n" @@ -4969,8 +4979,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo\n" - "````````````````````````````````\n" -- "\n\nThe rules for the [link text] are the same as with\n" -- "[inline links]. Thus:\n\n" +- "\n\nThe rules for the [link text] are the same as with\n[" +- "inline links]. Thus:\n\n" - "The link text may contain balanced brackets, but not unbalanced ones,\n" - "unless they are escaped:\n\n" - "````````````````````````````````" @@ -5079,9 +5089,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[Foo\n bar]: /url\n\n[Baz][Foo bar]\n.\n" - "

    Baz

    \n" - "````````````````````````````````\n" -- "\n\n" -- "No spaces, tabs, or line endings are allowed between the [link text]" -- " and the\n[link label]:\n\n" +- "\n\nNo spaces, tabs, or line endings are allowed between the [link text" +- "] and the\n[link label]:\n\n" - "````````````````````````````````" - " example\n" - "[foo] [bar]\n\n[bar]: /url \"title\"\n.\n" @@ -5104,11 +5113,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ". If whitespace is allowed between the\n" - "link text and the link label, then in the following we will have\n" - "a single reference link, not two shortcut reference links, as\nintended:\n\n" -- "``` markdown\n[foo]\n[bar]\n\n[foo]: /url1\n" +- "``` markdown\n" +- "[foo]\n[bar]\n\n[foo]: /url1\n" - "[bar]: /url2\n```\n\n" - "(Note that [shortcut reference links] were introduced by Gruber\n" -- "himself in a beta version of `Markdown.pl`" -- ", but never included\nin the official syntax description. Without shortcut reference\n" +- "himself in a beta version of `Markdown.pl`, but never included\n" +- "in the official syntax description. Without shortcut reference\n" - "links, it is harmless to allow space between the link text and\n" - "link label; but once shortcut references are introduced, it is\n" - "too dangerous to allow this, as it frequently leads to\n" @@ -5164,9 +5174,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[bar\\\\]: /uri\n\n[bar\\\\]\n.\n" - "

    bar\\

    \n" - "````````````````````````````````\n" -- "\n\n" -- "A [link label] must contain at least one character that is not a space" -- ", tab, or\nline ending:\n\n" +- "\n\nA [link label]" +- " must contain at least one character that is not a space, tab, or\n" +- "line ending:\n\n" - "````````````````````````````````" - " example\n" - "[]\n\n[]: /uri\n.\n

    []

    \n" @@ -5178,11 +5188,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[\n ]\n\n[\n ]: /uri\n.\n

    [\n]

    \n

    [\n" - "]: /uri

    \n" - "````````````````````````````````\n" -- "\n\nA [collapsed reference link](@)\n" -- "consists of a [link label] that [matches] a\n[" -- "link reference definition] elsewhere in the\ndocument, followed by the string " -- "`[]`" -- ".\nThe contents of the link label are parsed as inlines,\n" +- "\n\nA [collapsed reference link](@)\nconsists of a [link label" +- "] that [matches] a\n[link reference definition] elsewhere in the\n" +- "document, followed by the string `[]`.\n" +- "The contents of the link label are parsed as inlines,\n" - "which are used as the link's text. The link'" - "s URI and title are\n" - "provided by the matching reference link definition. Thus,\n`[foo][]`" @@ -5209,20 +5218,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    Foo\n" - "````````````````````````````````\n" -- "\n\n\n" -- "As with full reference links, spaces, tabs, or line endings are not\n" -- "allowed between the two sets of brackets:\n\n" +- "\n\n\nAs with full reference links, spaces, tabs, or line endings are not" +- "\nallowed between the two sets of brackets:\n\n" - "````````````````````````````````" - " example\n" - "[foo] \n[]\n\n[foo]: /url \"title\"\n.\n" - "

    foo\n" - "[]

    \n" - "````````````````````````````````\n" -- "\n\nA [shortcut reference link](@)\n" -- "consists of a [link label] that [matches] a\n[" -- "link reference definition] elsewhere in the\ndocument and is not followed by " -- "`[]`" -- " or a link label.\n" +- "\n\nA [shortcut reference link](@)\nconsists of a [link label" +- "] that [matches] a\n[link reference definition] elsewhere in the\n" +- "document and is not followed by `[]` or a link label.\n" - "The contents of the link label are parsed as inlines,\n" - "which are used as the link's text. The link's URI and title" - "\nare provided by the matching link reference definition.\nThus, `[foo]`" @@ -5269,17 +5275,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[foo] bar\n\n[foo]: /url\n.\n" - "

    foo bar

    \n" - "````````````````````````````````\n" -- "\n\n" -- "If you just want bracketed text, you can backslash-escape the\n" -- "opening bracket to avoid links:\n\n" +- "\n\nIf you just want bracketed text, you can backslash-escape the" +- "\nopening bracket to avoid links:\n\n" - "````````````````````````````````" - " example\n" - "\\[foo]\n\n[foo]: /url \"title\"\n.\n" - "

    [foo]

    \n" - "````````````````````````````````\n" -- "\n\n" -- "Note that this is a link, because a link label ends with the first\n" -- "following closing bracket:\n\n" +- "\n\nNote that this is a link, because a link label ends with the first" +- "\nfollowing closing bracket:\n\n" - "````````````````````````````````" - " example\n" - "[foo*]: /url\n\n*[foo*]\n.\n" @@ -5313,17 +5317,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo(not a link)\n" - "````````````````````````````````\n" -- "\n" -- "In the following case `[bar][baz]` is parsed as a reference,\n" -- "`[foo]` as normal text:\n\n" +- "\nIn the following case `[bar][baz]`" +- " is parsed as a reference,\n`[foo]` as normal text:\n\n" - "````````````````````````````````" - " example\n" - "[foo][bar][baz]\n\n[baz]: /url\n.\n" - "

    [foo]bar

    \n" - "````````````````````````````````\n" -- "\n\n" -- "Here, though, `[foo][bar]` is parsed as a reference," -- " since\n`[bar]` is defined:\n\n" +- "\n\nHere, though, `[foo][bar]`" +- " is parsed as a reference, since\n`[bar]` is defined:\n\n" - "````````````````````````````````" - " example\n" - "[foo][bar][baz]\n\n[baz]: /url1\n" @@ -5331,10 +5333,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foobaz

    \n" - "````````````````````````````````\n" -- "\n\n" -- "Here `[foo]` is not parsed as a shortcut reference, because it\n" -- "is followed by a link label (even though `[bar]` is not defined" -- "):\n\n" +- "\n\nHere `[foo]` is not parsed as a shortcut reference, because it" +- "\nis followed by a link label (even though `[bar]`" +- " is not defined):\n\n" - "````````````````````````````````" - " example\n" - "[foo][bar][baz]\n\n[baz]: /url1\n" @@ -5343,13 +5344,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n\n" - "## Images\n\n" -- "Syntax for images is like the syntax for links, with one\ndifference. " -- "Instead of [link text], we have an\n[image description](@)" -- ". The rules for this are the\nsame as for [link text]" -- ", except that (a) an\nimage description starts with `![`" -- " rather than `[`" -- ", and\n(b) an image description may contain links.\n" -- "An image description has inline elements\n" +- "Syntax for images is like the syntax for links, with one\n" +- "difference. Instead of [link text], we have an\n" +- "[image description](@). The rules for this are the\n" +- "same as for [link text], except that (a) an\n" +- "image description starts with `![` rather than `[`, and\n" +- "(b) an image description may contain links.\nAn image description has inline elements\n" - "as its contents. When an image is rendered to HTML,\n" - "this is standardly used as the image's `alt` attribute.\n\n" - "````````````````````````````````" @@ -5381,8 +5381,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n\nThough this spec is concerned with parsing, not rendering, it is\n" - "recommended that in rendering to HTML, only the plain string content\nof the [" - "image description] be used. Note that in\n" -- "the above example, the alt attribute's value is `foo bar`, not `" -- "foo\n[bar](/url)` or " +- "the above example, the alt attribute's value is `foo bar`, not " +- "`foo\n[bar](/url)` or " - "`foo bar`" - ". Only the plain string\ncontent is rendered, without formatting.\n\n" - "````````````````````````````````" @@ -5504,16 +5504,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    \"Foo\"\n" - "````````````````````````````````\n" -- "\n\nIf you just want a literal `!" -- "` followed by bracketed text, you can\n" -- "backslash-escape the opening `[`:\n\n" +- "\n\nIf you just want a literal `!`" +- " followed by bracketed text, you can\nbackslash-escape the opening " +- "`[`:\n\n" - "````````````````````````````````" - " example\n" - "!\\[foo]\n\n[foo]: /url \"title\"\n.\n

    ![" - "foo]

    \n" - "````````````````````````````````\n" -- "\n\nIf you want a link after a literal `!" -- "`, backslash-escape the\n`!`:\n\n" +- "\n\nIf you want a link after a literal `!`" +- ", backslash-escape the\n`!`:\n\n" - "````````````````````````````````" - " example\n" - "\\![foo]\n\n[foo]: /url \"title\"\n.\n

    !" @@ -5526,14 +5526,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ". They are parsed as links, with the URL or email address\n" - "as the link label.\n\n" - "A [URI autolink](@) consists of `<`, followed by an" -- "\n[absolute URI] followed by `>`" -- ". It is parsed as\n" +- "\n[absolute URI] followed by `>`. It is parsed as\n" - "a link to the URI, with the URI as the link's label.\n\n" -- "An [absolute URI](@),\n" -- "for these purposes, consists of a [scheme] followed by a colon (`:`" -- ")\nfollowed by zero or more characters other than [ASCII control\ncharacters][" -- "ASCII control character], [space], `<`, and `>`" -- ".\nIf the URI includes these characters, they must be percent-encoded\n" +- "An [absolute URI](@),\nfor these purposes, consists of a [scheme" +- "] followed by a colon (`:`)\n" +- "followed by zero or more characters other than [ASCII control\ncharacters][" +- "ASCII control character], [space], `<`, and `>`.\n" +- "If the URI includes these characters, they must be percent-encoded\n" - "(e.g. `%20` for a space).\n\n" - "For purposes of this spec, a [scheme](@) is any sequence\n" - "of 2--32 characters beginning with an ASCII letter and followed\n" @@ -5617,16 +5616,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    https://example.com/\\[\\

    \n" - "````````````````````````````````\n" -- "\n\nAn [email autolink](@)\n" -- "consists of `<`, followed by an [email address],\nfollowed by " -- "`>`" +- "\n\nAn [email autolink](@)\nconsists of `<`" +- ", followed by an [email address],\nfollowed by `>`" - ". The link's label is the email address,\nand the URL is " - "`mailto:` followed by the email address.\n\n" - "An [email address](@),\nfor these purposes, is anything that matches\n" - "the " -- "[non-normative regex from the HTML5\n" -- "spec](https://html.spec.whatwg.org/multipage/forms.html#e" -- "-mail-state-(type=email)):\n\n " +- "[non-normative regex from the HTML5\nspec" +- "](https://html.spec.whatwg.org/multipage/forms.html#e-mail" +- "-state-(type=email)):\n\n " - "/^[a-zA-Z0-9.!#$%&'*+/=?" - "^_`{|}~-]+@[a-zA-Z0-9](?:" - "[a-zA-Z0-9-]{0,61}[a-zA-Z0" @@ -5691,60 +5689,60 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````\n" - "\n\n" - "## Raw HTML\n\n" -- "Text between `<` and `>` that looks like an HTML tag is parsed as" -- " a\nraw HTML tag and will be rendered in HTML without escaping.\n" +- "Text between `<` and `>`" +- " that looks like an HTML tag is parsed as a\n" +- "raw HTML tag and will be rendered in HTML without escaping.\n" - "Tag and attribute names are not limited to current HTML tags,\n" - "so custom tags (and even, say, DocBook tags) may be used" - ".\n\nHere is the grammar for tags:\n" - "\nA [tag name](@) consists of an ASCII letter\n" - "followed by zero or more ASCII letters, digits, or\n" - "hyphens (`-`).\n\n" -- "An [attribute](@) consists of spaces, tabs, and up to one" -- " line ending,\nan [attribute name], and an optional\n[attribute value specification" -- "].\n\n" -- "An [attribute name](@)\n" -- "consists of an ASCII letter, `_`, or `:`, followed by zero" -- " or more ASCII\nletters, digits, `_`, `.`, `:`" -- ", or `-`" +- "An [attribute](@)" +- " consists of spaces, tabs, and up to one line ending,\nan [" +- "attribute name], and an optional\n[attribute value specification].\n\n" +- "An [attribute name](@)\nconsists of an ASCII letter, `_`" +- ", or `:`, followed by zero or more ASCII\n" +- "letters, digits, `_`, `.`, `:`, or `-`" - ". (Note: This is the XML\n" - "specification restricted to ASCII. HTML5 is laxer.)\n\n" - "An [attribute value specification](@)\n" - "consists of optional spaces, tabs, and up to one line ending,\n" -- "a `=` character, optional spaces, tabs, and up to one line ending" -- ",\nand an [attribute value].\n\n" -- "An [attribute value](@)\n" -- "consists of an [unquoted attribute value],\na [" -- "single-quoted attribute value], or a [double-quoted attribute value].\n\n" +- "a `=`" +- " character, optional spaces, tabs, and up to one line ending,\n" +- "and an [attribute value].\n\n" +- "An [attribute value](@)\nconsists of an [unquoted attribute value" +- "],\na [single-quoted attribute value], or a [" +- "double-quoted attribute value].\n\n" - "An [unquoted attribute value](@)\n" - "is a nonempty string of characters not\n" -- "including spaces, tabs, line endings, `\"`, `'`, `=`, `<" -- "`, `>`, or `` ` ``.\n\n" -- "A [single-quoted attribute value](@)\n" -- "consists of `'`, zero or more\ncharacters not including `'`" -- ", and a final `'`.\n\n" -- "A [double-quoted attribute value](@)\n" -- "consists of `\"`, zero or more\ncharacters not including `\"`" -- ", and a final `\"`.\n\n" +- "including spaces, tabs, line endings, `\"`, `'`, `=`, " +- "`<`, `>`, or `` ` ``.\n\n" +- "A [single-quoted attribute value](@)\nconsists of `'`" +- ", zero or more\ncharacters not including `'`, and a final `'`.\n\n" +- "A [double-quoted attribute value](@)\nconsists of `\"`" +- ", zero or more\ncharacters not including `\"`, and a final `\"`.\n\n" - "An [open tag](@) consists of a `<` character, a [" - "tag name],\nzero or more [attributes]" - ", optional spaces, tabs, and up to one line ending,\nan optional " - "`/` character, and a `>` character.\n\n" - "A [closing tag](@) consists of the string ``.\n\n" -- "An [HTML comment](@) consists of ``, `" -- "`, or ``, and `-->` (see the\n" +- "[tag name]" +- ", optional spaces, tabs, and up to one line ending, and the character" +- "\n`>`.\n\n" +- "An [HTML comment](@) consists of ``, " +- "``, or ``, and `-->` (see the\n" - "[HTML spec](https://html.spec.whatwg.org/multipage/" - "parsing.html#markup-declaration-open-state)).\n\n" -- "A [processing instruction](@)\nconsists of the string ``" -- ", and the string\n`?>`.\n\n" -- "A [declaration](@) consists of the string ``" +- "A [processing instruction](@)\nconsists of the string ``, and the string" +- "\n`?>`.\n\n" +- "A [declaration](@) consists of the string ``" - ", and the character `>`.\n\n" -- "A [CDATA section](@) consists of\nthe string ``" +- "A [CDATA section](@) consists of\nthe string ``" - ", and the string `]]>`.\n\n" - "An [HTML tag](@) consists of an [open tag], a [" - "closing tag],\nan [HTML comment], a [processing instruction], a [declaration" @@ -5896,13 +5894,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Hard line breaks\n\n" - A line ending (not in a code span or HTML tag) that is preceded - "\nby two or more spaces and does not occur at the end of a block" -- "\nis parsed as a [hard line break](@)" -- " (rendered\nin HTML as a `
    ` tag):\n\n" +- "\nis parsed as a [hard line break](@) (rendered\n" +- "in HTML as a `
    ` tag):\n\n" - "````````````````````````````````" - " example\nfoo \nbaz\n.\n

    foo
    \nbaz

    \n" - "````````````````````````````````\n" -- "\n\nFor a more visible alternative, a backslash before the\n" -- "[line ending] may be used instead of two or more spaces:\n\n" +- "\n\nFor a more visible alternative, a backslash before the\n[line ending]" +- " may be used instead of two or more spaces:\n\n" - "````````````````````````````````" - " example\nfoo\\\nbaz\n.\n

    foo
    \nbaz

    \n" - "````````````````````````````````\n" @@ -5997,9 +5995,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - " example\nfoo \n baz\n.\n

    foo\nbaz

    \n" - "````````````````````````````````\n" -- "\n\n" -- "A conforming parser may render a soft line break in HTML either as a\n" -- "line ending or as a space.\n\n" +- "\n\nA conforming parser may render a soft line break in HTML either as a" +- "\nline ending or as a space.\n\n" - "A renderer may also provide an option to render soft line breaks\n" - "as hard line breaks.\n\n" - "## Textual content\n\n" @@ -6025,27 +6022,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "In this appendix we describe some features of the parsing strategy\n" - "used in the CommonMark reference implementations.\n\n" - "## Overview\n\nParsing has two phases:\n\n" -- "1. In the first phase, lines of input are consumed and the block\n" +- "1. " +- "In the first phase, lines of input are consumed and the block\n" - "structure of the document---its division into paragraphs, block quotes,\n" - "list items, and so on---is constructed. Text is assigned to these" - "\nblocks but not parsed. Link reference definitions are parsed and a\n" - "map of links is constructed.\n\n" -- "2. In the second phase, the raw text contents of paragraphs and headings\n" +- "2. " +- "In the second phase, the raw text contents of paragraphs and headings\n" - "are parsed into sequences of Markdown inline elements (strings,\n" - "code spans, links, emphasis, and so on), using the map of link" - "\nreferences constructed in phase 1.\n\n" - "At each point in processing, the document is represented as a tree of\n" - "**blocks**. The root of the tree is a `document`" - " block. The `document`\nmay have any number of other blocks as " -- "**children**" -- ". These children\n" +- "**children**. These children\n" - "may, in turn, have other blocks as children. " - "The last child of a block\nis normally considered **open**" - ", meaning that subsequent lines of input\n" - can alter its contents. (Blocks that are not open are **closed** - ".)\nHere, for example, is a possible document tree, with the open blocks" - "\nmarked by arrows:\n\n" -- "``` tree\n-> document\n -> block_quote\n paragraph\n" +- "``` tree\n" +- "-> document\n -> block_quote\n paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n" - " -> list (type=bullet tight=true bullet_char=-)\n" - " list_item\n paragraph\n" @@ -6053,20 +6052,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " -> list_item\n -> paragraph\n \"aliquando id\"\n" - "```\n\n" - "## Phase 1: block structure\n\n" -- "Each line that is processed has an effect on this tree. " -- "The line is\n" -- "analyzed and, depending on its contents, the document may be altered\n" +- Each line that is processed has an effect on this tree. The line is +- "\nanalyzed and, depending on its contents, the document may be altered\n" - "in one or more of the following ways:\n\n" -- "1. One or more open blocks may be closed.\n2. " -- "One or more new blocks may be created as children of the\n " +- "1. One or more open blocks may be closed.\n" +- "2. One or more new blocks may be created as children of the\n " - "last open block.\n" -- "3. " -- "Text may be added to the last (deepest) open block remaining\n " -- "on the tree.\n\n" +- 3. Text may be added to the last (deepest) open block remaining +- "\n on the tree.\n\n" - "Once a line has been incorporated into the tree in this way,\n" - "it can be discarded, so input can be read in a stream.\n\n" - "For each line, we follow this procedure:\n\n" -- "1. First we iterate through the open blocks, starting with the\n" +- "1. " +- "First we iterate through the open blocks, starting with the\n" - "root document, and descending through last children down to the last\n" - "open block. Each block imposes a condition that the line must satisfy\n" - "if the block is to remain open. " @@ -6075,15 +6073,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "In this phase we may match all or just some of the open\n" - "blocks. But we cannot close unmatched blocks yet, because we may have a" - "\n[lazy continuation line].\n\n" -- "2. Next, after consuming the continuation markers for existing\n" -- "blocks, we look for new block starts (e.g. `>` for a" -- " block quote).\nIf we encounter a new block start, we close any blocks unmatched" -- "\nin step 1 before creating the new block as a child of the last" -- "\nmatched container block.\n\n" +- "2. " +- "Next, after consuming the continuation markers for existing\n" +- "blocks, we look for new block starts (e.g. `>`" +- " for a block quote).\n" +- "If we encounter a new block start, we close any blocks unmatched\n" +- "in step 1 before creating the new block as a child of the last\n" +- "matched container block.\n\n" - "3. " - "Finally, we look at the remainder of the line (after block\n" -- "markers like `>`" -- ", list markers, and indentation have been consumed).\n" +- "markers like `>`, list markers, and indentation have been consumed).\n" - "This is text that can be incorporated into the last open\n" - "block (a paragraph, code block, heading, or raw HTML).\n\n" - "Setext headings are formed when we see a line of a paragraph\n" @@ -6093,46 +6092,49 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "one or more reference link definitions. Any remainder becomes a\nnormal paragraph.\n\n" - "We can see how this works by considering how the tree above is\n" - "generated by four lines of Markdown:\n\n" -- "``` markdown\n> Lorem ipsum dolor\nsit amet.\n" +- "``` markdown\n" +- "> Lorem ipsum dolor\nsit amet.\n" - "> - Qui *quodsi iracundia*\n" - "> - aliquando id\n```\n\n" - "At the outset, our document model is just\n" - "\n``` tree\n-> document\n```\n\nThe first line of our text,\n" - "\n``` markdown\n> Lorem ipsum dolor\n```" -- "\n\n" -- "causes a `block_quote` block to be created as a child of our" -- "\nopen `document` block, and a `paragraph`" -- " block as a child of\nthe `block_quote`" -- ". Then the text is added to the last open\nblock, the " -- "`paragraph`:\n\n" -- "``` tree\n-> document\n -> block_quote\n -> paragraph\n" +- "\n\ncauses a `block_quote`" +- " block to be created as a child of our\nopen `document`" +- " block, and a `paragraph` block as a child of\nthe " +- "`block_quote`. Then the text is added to the last open\n" +- "block, the `paragraph`:\n\n" +- "``` tree\n" +- "-> document\n -> block_quote\n -> paragraph\n" - " \"Lorem ipsum dolor\"\n```\n\nThe next line,\n" - "\n``` markdown\nsit amet.\n```" -- "\n\n" -- "is a \"lazy continuation\" of the open `paragraph`, so it gets added" -- "\nto the paragraph's text:\n\n" -- "``` tree\n-> document\n -> block_quote\n -> paragraph\n" +- "\n\nis a \"lazy continuation\" of the open `paragraph`" +- ", so it gets added\nto the paragraph's text:\n\n" +- "``` tree\n" +- "-> document\n -> block_quote\n -> paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n```\n\nThe third line,\n" - "\n``` markdown\n> - Qui *quodsi iracundia*\n" - "```\n\n" -- "causes the `paragraph` block to be closed, and a new `list" -- "` block\nopened as a child of the `block_quote`. A " -- "`list_item` is also\nadded as a child of the `list`" -- ", and a `paragraph` as a child of\nthe `list_item`" +- "causes the `paragraph` block to be closed, and a new " +- "`list` block\nopened as a child of the `block_quote`" +- ". A `list_item` is also\n" +- "added as a child of the `list`, and a `paragraph`" +- " as a child of\nthe `list_item`" - ". The text is then added to the new `paragraph`:\n\n" -- "``` tree\n-> document\n -> block_quote\n paragraph\n" +- "``` tree\n" +- "-> document\n -> block_quote\n paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n" - " -> list (type=bullet tight=true bullet_char=-)\n" - " -> list_item\n -> paragraph\n" - " \"Qui *quodsi iracundia*\"\n```\n\n" - "The fourth line,\n\n``` markdown\n> - aliquando id\n```" -- "\n\n" -- "causes the `list_item` (and its child the `paragraph`) to" -- " be closed,\nand a new `list_item`" -- " opened up as child of the `list`. A `paragraph`" -- "\nis added as a child of the new `list_item`" +- "\n\ncauses the `list_item` (and its child the `paragraph`" +- ") to be closed,\nand a new `list_item`" +- " opened up as child of the `list`. A `paragraph`\n" +- "is added as a child of the new `list_item`" - ", to contain the text.\nWe thus obtain the final tree:\n\n" -- "``` tree\n-> document\n -> block_quote\n paragraph\n" +- "``` tree\n" +- "-> document\n -> block_quote\n paragraph\n" - " \"Lorem ipsum dolor\\nsit amet.\"\n" - " -> list (type=bullet tight=true bullet_char=-)\n" - " list_item\n paragraph\n" @@ -6145,16 +6147,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "string contents of paragraphs and headings as inlines. At this\n" - "point we have seen all the link reference definitions, so we can\n" - "resolve reference links as we go.\n\n" -- "``` tree\ndocument\n block_quote\n paragraph\n" -- " str \"Lorem ipsum dolor\"\n softbreak\n" -- " str \"sit amet.\"\n" +- "``` tree\n" +- "document\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"\n" +- " softbreak\n str \"sit amet.\"\n" - " list (type=bullet tight=true bullet_char=-)\n list_item\n" - " paragraph\n str \"Qui \"\n emph\n" - " str \"quodsi iracundia\"\n list_item\n" - " paragraph\n str \"aliquando id\"\n```\n\n" - "Notice how the [line ending] in the first paragraph has\n" -- "been parsed as a `softbreak`, and the asterisks in the first list" -- " item\nhave become an `emph`.\n\n" +- "been parsed as a `softbreak`" +- ", and the asterisks in the first list item\nhave become an " +- "`emph`.\n\n" - "### An algorithm for parsing nested emphasis and links\n\n" - "By far the trickiest part of inline parsing is handling emphasis,\n" - "strong emphasis, links, and images. This is done using the following\n" @@ -6168,11 +6171,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- the type of delimiter (`[`, `![`, `*`, `_`)\n" - "- the number of delimiters,\n" - "- whether the delimiter is \"active\" (all are active to start), and" -- "\n- whether the delimiter is a potential opener, a potential closer,\n" -- " or both (which depends on what sort of characters precede\n " +- "\n- whether the delimiter is a potential opener, a potential closer,\n " +- "or both (which depends on what sort of characters precede\n " - "and follow the delimiters).\n\n" -- "When we hit a `]` character, we call the *look for link" -- " or image*\nprocedure (see below).\n\n" +- "When we hit a `]` character, we call the " +- "*look for link or image*\nprocedure (see below).\n\n" - "When we hit the end of the input, we call the *process emphasis*\n" - "procedure (see below), with `stack_bottom` = NULL.\n\n" - "#### *look for link or image*\n\n" @@ -6180,21 +6183,27 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "through the stack for an opening `[` or `![` delimiter.\n\n" - "- If we don't find one, we return a literal text node `]" - "`.\n\n" -- "- If we do find one, but it's not *active*, we remove" -- " the inactive\n delimiter from the stack, and return a literal text node " -- "`]`.\n\n" -- "- If we find one and it's active, then we parse ahead to see" -- " if\n we have an inline link/image, reference link/image, collapsed reference" +- "- " +- "If we do find one, but it's not *active*" +- ", we remove the inactive\n " +- "delimiter from the stack, and return a literal text node `]`.\n\n" +- "- " +- "If we find one and it's active, then we parse ahead to see if" +- "\n we have an inline link/image, reference link/image, collapsed reference" - "\n link/image, or shortcut reference link/image.\n\n " -- "+ If we don't, then we remove the opening delimiter from the\n" -- " delimiter stack and return a literal text node `]`.\n\n " -- "+ If we do, then\n\n" -- " * We return a link or image node whose children are the inlines\n" -- " after the text node pointed to by the opening delimiter.\n\n " -- "* We run *process emphasis* on these inlines, with the `[`" -- " opener\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n" -- " * If we have a link (and not an image), we also set" -- " all\n `[` delimiters before the opening delimiter to *inactive*" +- "+ " +- "If we don't, then we remove the opening delimiter from the\n " +- "delimiter stack and return a literal text node `]`.\n\n " +- "+ If we do, then\n\n " +- "* " +- "We return a link or image node whose children are the inlines\n " +- "after the text node pointed to by the opening delimiter.\n\n " +- "* " +- "We run *process emphasis* on these inlines, with the `[` opener" +- "\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n" +- " * " +- "If we have a link (and not an image), we also set all" +- "\n `[` delimiters before the opening delimiter to *inactive*" - ". (This\n will prevent us from getting links within links.)\n\n" - "#### *process emphasis*\n\n" - "Parameter `stack_bottom` sets a lower bound to how far we\n" @@ -6202,42 +6211,49 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\ngo all the way to the bottom. Otherwise, we stop before\n" - "visiting `stack_bottom`.\n\n" - "Let `current_position` point to the element on the [delimiter stack]\n" -- "just above `stack_bottom` (or the first element if `stack_bottom`" -- "\nis NULL).\n\n" -- "We keep track of the `openers_bottom` for each delimiter\n" -- "type (`*`, `_`), indexed to the length of the closing delimiter run" -- "\n(modulo 3) and to whether the closing delimiter can also be an" -- "\nopener. Initialize this to `stack_bottom`.\n\n" +- "just above `stack_bottom` (or the first element if `stack_bottom`\n" +- "is NULL).\n\n" +- "We keep track of the `openers_bottom` for each delimiter\ntype (" +- "`*`, `_`), indexed to the length of the closing delimiter run\n" +- "(modulo 3) and to whether the closing delimiter can also be an\n" +- "opener. Initialize this to `stack_bottom`.\n\n" - "Then we repeat the following until we run out of potential\nclosers:\n\n" -- "- Move `current_position` forward in the delimiter stack (if needed)\n" -- " until we find the first potential closer with delimiter `*` or `_`" -- ".\n (This will be the potential closer closest\n " -- "to the beginning of the input -- the first one in parse order.)\n\n" -- "- Now, look back in the stack (staying above `stack_bottom`" -- " and\n the `openers_bottom`" -- " for this delimiter type) for the\n first matching potential opener (\"matching\"" -- " means same delimiter).\n\n- If one is found:\n\n " -- "+ Figure out whether we have emphasis or strong emphasis:\n" -- " if both closer and opener spans have length >= 2, we have" -- "\n strong, otherwise regular.\n\n " -- "+ Insert an emph or strong emph node accordingly, after\n" -- " the text node corresponding to the opener.\n\n " -- "+ Remove any delimiters between the opener and closer from\n" -- " the delimiter stack.\n\n " -- + Remove 1 (for regular emph) or 2 (for strong emph -- ") delimiters\n " +- "- " +- "Move `current_position` forward in the delimiter stack (if needed)\n " +- "until we find the first potential closer with delimiter `*` or `_`.\n " +- "(This will be the potential closer closest\n to the beginning of the input --" +- " the first one in parse order.)\n\n" +- "- " +- "Now, look back in the stack (staying above `stack_bottom` and" +- "\n the `openers_bottom` for this delimiter type) for the" +- "\n first matching potential opener (\"matching\" means same delimiter).\n\n" +- "- If one is found:\n\n " +- "+ " +- "Figure out whether we have emphasis or strong emphasis:\n " +- "if both closer and opener spans have length >= 2, we have\n " +- "strong, otherwise regular.\n\n " +- "+ " +- "Insert an emph or strong emph node accordingly, after\n " +- "the text node corresponding to the opener.\n\n " +- "+ " +- "Remove any delimiters between the opener and closer from\n the delimiter stack.\n" +- "\n + " +- Remove 1 (for regular emph) or 2 (for strong emph) +- " delimiters\n " - "from the opening and closing text nodes. If they become empty\n " - "as a result, remove them and remove the corresponding element\n " - "of the delimiter stack. If the closing node is removed, reset\n " - "`current_position` to the next element in the stack.\n\n" -- "- If none is found:\n\n" -- " + Set `openers_bottom` to the element before `current_position`.\n" -- " (We know that there are no openers for this kind of closer up" -- " to and\n " +- "- If none is found:\n\n " +- "+ " +- "Set `openers_bottom` to the element before `current_position`.\n " +- (We know that there are no openers for this kind of closer up to +- " and\n " - "including this point, so this puts a lower bound on future searches.)\n\n " -- "+ If the closer at `current_position` is not a potential opener,\n" -- " remove it from the delimiter stack (since we know it can't\n " +- "+ " +- "If the closer at `current_position` is not a potential opener,\n " +- "remove it from the delimiter stack (since we know it can't\n " - "be a closer either).\n\n " - "+ Advance `current_position` to the next element in the stack.\n\n" -- "After we're done, we remove all delimiters above `stack_bottom` from" -- " the\ndelimiter stack.\n" +- "After we're done, we remove all delimiters above `stack_bottom`" +- " from the\ndelimiter stack.\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md-2.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md-2.snap index f4b2597..0f785ea 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md-2.snap @@ -6,7 +6,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------\n\n" - "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n\n" - "# Lists\n\n" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. " +- "```\n" +- "1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. " - "Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n" - "⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n" - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n\n" @@ -18,10 +19,12 @@ input_file: tests/inputs/markdown/github_flavored.md - "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------\n\n" - "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```" - "\n\n```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```" -- "\n\n```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\n" +- "\n\n```php\n" +- "require_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\n" - "echo URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------\n\n" - "# Tables\n\n" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n" +- "```\n" +- "Colons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n" - "| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n" - "\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------\n\n" - "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------\n\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md.snap index 0d0d1be..6c0e192 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown@github_flavored.md.snap @@ -4,7 +4,8 @@ expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- - "# Headers\n\n" -- "```\n# h1 Heading 8-)\n## h2 Heading\n" +- "```\n" +- "# h1 Heading 8-)\n## h2 Heading\n" - "### h3 Heading\n#### h4 Heading\n##### h5 Heading\n" - "###### h6 Heading\n\n" - "Alternatively, for H1 and H2, an underline-ish style:\n\n" @@ -31,12 +32,13 @@ input_file: tests/inputs/markdown/github_flavored.md - "_underscores_.\n\n" - "Strong emphasis, aka bold, with **asterisks** or __underscores__." - "\n\nCombined emphasis with **asterisks and _underscores_**.\n" -- "\nStrikethrough uses two tildes. ~~Scratch this." -- "~~\n\n**This is bold text**\n\n__This is bold text__\n" -- "\n*This is italic text*\n\n_This is italic text_\n" -- "\n~~Strikethrough~~\n\n------\n\n" +- "\nStrikethrough uses two tildes. " +- "~~Scratch this.~~\n\n**This is bold text**\n" +- "\n__This is bold text__\n\n*This is italic text*\n" +- "\n_This is italic text_\n\n~~Strikethrough~~\n\n------\n\n" - "# Lists\n\n" -- "```\n1. First ordered list item\n2. Another item\n" +- "```\n" +- "1. First ordered list item\n2. Another item\n" - "⋅⋅* Unordered sub-list.\n1. " - "Actual numbers don't matter, just that it's a number\n" - "⋅⋅1. Ordered sub-list\n4. " @@ -84,19 +86,21 @@ input_file: tests/inputs/markdown/github_flavored.md - " GFM line break behaviour, where trailing spaces are not required.)\n\n" - "* Unordered list can use asterisks\n- Or minuses\n" - "+ Or pluses\n\n" -- "1. Make my changes\n 1. Fix bug\n" +- 1. Make my changes +- "\n 1. Fix bug\n" - " 2. Improve formatting\n - Make the headings bigger\n" - "2. Push my commits to GitHub\n3. Open a pull request\n " -- " * Describe my changes\n * Mention all the members of my team\n" -- " * Ask for feedback\n\n" +- " * Describe my changes\n" +- " * Mention all the members of my team\n * Ask for feedback\n\n" - "+ Create a list by starting a line with `+`, `-`, or `" - "*`\n+ Sub-lists are made by indenting 2 spaces:\n " -- "- Marker character change forces new list start:\n" -- " * Ac tristique libero volutpat at\n " +- "- Marker character change forces new list start:" +- "\n * Ac tristique libero volutpat at\n " - "+ Facilisis in pretium nisl aliquet\n " - "- Nulla volutpat aliquam velit\n+ Very easy!\n\n------\n\n" - "# Task lists\n\n" -- "```\n- [x] Finish my changes\n" +- "```\n" +- "- [x] Finish my changes\n" - "- [ ] Push my commits to GitHub\n" - "- [ ] Open a pull request\n" - "- [x] @mentions, #refs, [links](), **" @@ -107,8 +111,9 @@ input_file: tests/inputs/markdown/github_flavored.md - "- [x] Finish my changes\n" - "- [ ] Push my commits to GitHub\n" - "- [ ] Open a pull request\n" -- "- [x] @mentions, #refs, [links](), **" -- "formatting**, and tags supported\n" +- "- " +- "[x] @mentions, #refs, [links](), **formatting**" +- ", and tags supported\n" - "- [x] list syntax required (any unordered or ordered list supported)\n" - "- [ ] this is a complete item\n" - "- [ ] this is an incomplete item\n\n------\n\n" @@ -121,7 +126,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "Let's rename \\*our-new-project\\* to \\*our-old-project" - "\\*.\n\n------\n\n" - "# Links\n\n" -- "```\n[I'm an inline-style link](https://www.google.com)\n\n" +- "```\n" +- "[I'm an inline-style link](https://www.google.com)\n\n" - "[I'm an inline-style link with title](https://www.google.com \"" - "Google's Homepage\")\n\n" - "[I'm a reference-style link][Arbitrary case-insensitive reference text]\n\n" @@ -144,15 +150,15 @@ input_file: tests/inputs/markdown/github_flavored.md - "\n[You can use numbers for reference-style link definitions][1]\n" - "\nOr leave it empty and use the [link text itself].\n" - "\nURLs and URLs in angle brackets will automatically get turned into links.\n" -- "http://www.example.com or " -- " and sometimes\nexample.com (but not on Github, for example).\n\n" +- "http://www.example.com or and sometimes\n" +- "example.com (but not on Github, for example).\n\n" - "Some text to show that the reference links can follow later.\n" - "\n[arbitrary case-insensitive reference text]: https://www.mozilla.org\n" - "[1]: http://slashdot.org\n" - "[link text itself]: http://www.reddit.com\n\n------\n\n" - "# Images\n\n" -- "```\nHere's our logo (hover to see the title text):\n\n" -- "Inline-style:\n![" +- "```\n" +- "Here's our logo (hover to see the title text):\n\nInline-style:\n![" - "alt text](https://github.com/adam-p/markdown-here/raw/master" - "/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:\n" - "![alt text][logo]\n\n" @@ -167,27 +173,31 @@ input_file: tests/inputs/markdown/github_flavored.md - "[id]: https://octodex.github.com/images/dojocat.jpg " - "\"The Dojocat\"\n```\n\n" - "Here's our logo (hover to see the title text):\n" -- "\nInline-style:\n![" -- "alt text](https://github.com/adam-p/markdown-here/raw/master" -- "/src/common/images/icon48.png \"Logo Title Text 1\")\n\n" +- "\nInline-style:\n" +- "![" +- alt text +- "](https://github.com/adam-p/markdown-here/raw/master/src/common" +- "/images/icon48.png \"Logo Title Text 1\")\n\n" - "Reference-style:\n![alt text][logo]\n" - "\n" - "[logo]: https://github.com/adam-p/markdown-here/raw/master" - "/src/common/images/icon48.png \"Logo Title Text 2\"\n\n" - "![Minion](https://octodex.github.com/images/minion.png)\n" - "![" -- "Stormtroopocat](https://octodex.github.com/images/" -- "stormtroopocat.jpg \"The Stormtroopocat\")\n\n" +- Stormtroopocat +- "](https://octodex.github.com/images/stormtroopocat.jpg" +- " \"The Stormtroopocat\")\n\n" - "Like links, Images also have a footnote style syntax\n" - "\n![Alt text][id]\n" - "\nWith a reference later in the document defining the URL location:\n" - "\n" - "[id]: https://octodex.github.com/images/dojocat.jpg " - "\"The Dojocat\"\n\n------\n\n" -- "# [Footnotes](https://github.com/markdown-it/markdown-it-" -- "footnote)\n\n" -- "```\nFootnote 1 link[^first].\n\n" -- "Footnote 2 link[^second].\n\n" +- "# " +- "[Footnotes](https://github.com/markdown-it/markdown-it-footnote" +- ")\n\n" +- "```\n" +- "Footnote 1 link[^first].\n\nFootnote 2 link[^second].\n\n" - "Inline footnote^[Text of inline footnote] definition.\n\n" - "Duplicated footnote reference[^second].\n\n" - "[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n" @@ -200,21 +210,24 @@ input_file: tests/inputs/markdown/github_flavored.md - "# Code and Syntax Highlighting\n\n" - "```\nInline `code` has `back-ticks around` it.\n```" - "\n\nInline `code` has `back-ticks around` it.\n" -- "\n```c#\nusing System.IO.Compression;\n\n" -- "#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n" -- " [Obsolete(\"...\")]\n class Program : IInterface\n {\n" +- "\n```c#\n" +- "using System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\n" +- "namespace MyApplication\n{\n [Obsolete(\"...\")]\n" +- " class Program : IInterface\n {\n" - " public static List JustDoIt(int count)\n {\n" - " Console.WriteLine($\"Hello {Name}!\");\n" - " return new List(new int[] { 1, 2," - " 3 })\n }\n }\n}\n```\n\n" -- "```css\n@font-face {\n" +- "```css\n" +- "@font-face {\n" - " font-family: Chunkfive; src: url('Chunkfive.otf');\n" - "}\n\nbody, .usertext {\n" - " color: #F0F0F0; background: #600;\n" - " font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n" - "@media print {\n a[href^=http]::after {\n" - " content: attr(href)\n }\n}\n```\n\n" -- "```javascript\nfunction $initHighlight(block, cls) {\n try {\n" +- "```javascript\n" +- "function $initHighlight(block, cls) {\n try {\n" - " if (cls.search(/\\bno\\-highlight\\b/) != -1)\n" - " return process(block, true, 0x0F) +\n" - " ` class=\"${cls}\"`;\n } catch (e) {\n" @@ -223,8 +236,9 @@ input_file: tests/inputs/markdown/github_flavored.md - "; i++) {\n if (checkCondition(classes[i]) === undefined)\n" - " console.log('undefined');\n }\n}\n\nexport $initHighlight;\n" - "```\n\n" -- "```php\nrequire_once 'Zend/Uri/Http.php';\n\n" -- "namespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\n" +- "```php\n" +- "require_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\n" +- "interface Factory\n{\n static function _factory();\n}\n\n" - "abstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n" - " public static $st1 = 1;\n" - " const ME = \"Yo\";\n var $list = NULL;\n" @@ -247,7 +261,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "__halt_compiler () ; datahere\ndatahere\ndatahere */\n" - "datahere\n```\n\n------\n\n" - "# Tables\n\n" -- "```\nColons can be used to align columns.\n\n" +- "```\n" +- "Colons can be used to align columns.\n\n" - "| Tables | Are | Cool |\n" - "| ------------- |:-------------:| -----:|\n" - "| col 3 is | right-aligned | $1600 |\n" @@ -274,14 +289,14 @@ input_file: tests/inputs/markdown/github_flavored.md - "| Name | Character |\n| --- | --- |\n" - "| Backtick | ` |\n| Pipe | \\| |\n```\n\n" - "Colons can be used to align columns.\n\n" -- "| Tables | Are | Cool |\n" -- "| ------------- |:-------------:| -----:|\n" +- "| Tables | Are | Cool " +- "|\n| ------------- |:-------------:| -----:|\n" - "| col 3 is | right-aligned | $1600 |\n" - "| col 2 is | centered | $12 |\n" - "| zebra stripes | are neat | $1 |\n" - "\nThere must be at least 3 dashes separating each header cell.\n" -- "The outer pipes (|) are optional, and you don't need to make" -- " the\n" +- "The outer pipes (|) are optional, and you don'" +- "t need to make the\n" - "raw Markdown line up prettily. You can also use inline Markdown.\n\n" - "Markdown | Less | Pretty\n--- | --- | ---\n" - "*Still* | `renders` | **nicely**\n" @@ -294,16 +309,17 @@ input_file: tests/inputs/markdown/github_flavored.md - "| git diff | Show file differences that haven't been staged |\n\n" - "| Command | Description |\n| --- | --- |\n" - "| `git status` | List all *new or modified* files |\n" -- "| `git diff` | Show file differences that **haven't been** staged" -- " |\n\n" -- "| Left-aligned | Center-aligned | Right-aligned |\n" -- "| :--- | :---: | ---: |\n" +- "| `git diff` |" +- " Show file differences that **haven't been** staged |\n\n" +- "| Left-aligned | Center-aligned | Right-aligned " +- "|\n| :--- | :---: | ---: |\n" - "| git status | git status | git status |\n" - "| git diff | git diff | git diff |\n\n" - "| Name | Character |\n| --- | --- |\n" - "| Backtick | ` |\n| Pipe | \\| |\n\n------\n\n" - "# Blockquotes\n\n" -- "```\n> Blockquotes are very handy in email to emulate reply text.\n" +- "```\n" +- "> Blockquotes are very handy in email to emulate reply text.\n" - "> This line is part of the same quote.\n\nQuote break.\n\n" - "> This is a very long line that will still be quoted properly when it wraps" - ". " @@ -313,18 +329,20 @@ input_file: tests/inputs/markdown/github_flavored.md - "> Blockquotes can also be nested...\n" - ">> ...by using additional greater-than signs right next to each other...\n" - "> > > ...or with spaces between arrows.\n```\n\n" -- "> Blockquotes are very handy in email to emulate reply text.\n" -- "> This line is part of the same quote.\n\nQuote break.\n\n" -- "> This is a very long line that will still be quoted properly when it wraps" -- ". Oh boy let'" +- "> " +- "Blockquotes are very handy in email to emulate reply text.\n> " +- "This line is part of the same quote.\n\nQuote break.\n\n" +- "> " +- This is a very long line that will still be quoted properly when it wraps. +- " Oh boy let'" - s keep writing to make sure this is long enough to actually wrap for everyone. - " Oh, you can *put* **Markdown** into a blockquote.\n\n" -- "> Blockquotes can also be nested...\n" -- ">" +- "> Blockquotes can also be nested...\n>" - "> ...by using additional greater-than signs right next to each other...\n" - "> > > ...or with spaces between arrows.\n\n------\n\n" - "# Inline HTML\n\n" -- "```\n
    \n
    Definition list
    \n" +- "```\n" +- "
    \n
    Definition list
    \n" - "
    Is something people use sometimes.
    \n\n" - "
    Markdown in HTML
    \n" - "
    Does *not* work **very** well. " @@ -335,11 +353,13 @@ input_file: tests/inputs/markdown/github_flavored.md - "
    Does *not* work **very** well. " - "Use HTML tags.
    \n
    \n\n------\n\n" - "# Horizontal Rules\n\n" -- "```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n" -- "___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n" -- "***\n\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n" +- "```\n" +- "Three or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___\n\n" +- "Underscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***\n" +- "\nAsterisks\n\n___\n\nUnderscores\n\n------\n\n" - "# YouTube Videos\n\n" -- "```\n\n" - "\"IMAGE\n\n" -- "\n```\n[![" +- "\n```\n" +- "[![" - "IMAGE ALT TEXT HERE](http://img.youtube.com/vi/" - "YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com" - "/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```\n\n" - "[![" -- "IMAGE ALT TEXT HERE](https://upload.wikimedia.org/wikipedia/commons/thumb/e" -- /ef/YouTube_logo_2015.svg/1200px- -- "YouTube_logo_2015.svg.png)](https://www.youtube.com/watch?" +- IMAGE ALT TEXT HERE +- "](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/" +- YouTube_logo_2015.svg/1200px-YouTube_logo_2015 +- ".svg.png)](https://www.youtube.com/watch?" - "v=ciawICBvQoE)\n" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md-2.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md-2.snap index dfb8d71..fc5a101 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md-2.snap @@ -10,10 +10,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "The point can be illustrated by comparing a sample of\n[AsciiDoc](https://asciidoc.org/) with\nan equivalent sample of Markdown. Here is a sample of\nAsciiDoc from the AsciiDoc manual:\n\n```\n1. List item one.\n+\nList item one continued with a second paragraph followed by an\nIndented block.\n+\n.................\n$ ls *.sh\n$ mv *.sh ~/tmp\n.................\n+\nList item continued with a third paragraph.\n\n2. List item two continued with an open block.\n+\n--\nThis paragraph is part of the preceding list item.\n\na. This list is nested and does not require explicit item\ncontinuation.\n+\nThis paragraph is part of the preceding list item.\n\nb. List item b.\n\nThis paragraph belongs to item two of the outer list.\n--\n```\n\nAnd here is the equivalent in Markdown:" - "```\n1. List item one.\n\n List item one continued with a second paragraph followed by an\n Indented block.\n\n $ ls *.sh\n $ mv *.sh ~/tmp\n\n List item continued with a third paragraph.\n\n2. List item two continued with an open block.\n\n This paragraph is part of the preceding list item.\n\n 1. This list is nested and does not require explicit item continuation.\n\n This paragraph is part of the preceding list item.\n\n 2. List item b.\n\n This paragraph belongs to item two of the outer list.\n```\n\nThe AsciiDoc version is, arguably, easier to write. You don't need\nto worry about indentation. But the Markdown version is much easier\nto read. The nesting of list items is apparent to the eye in the\nsource, not just in the processed document." - "## Why is a spec needed?\n\nJohn Gruber's [canonical description of Markdown's\nsyntax](https://daringfireball.net/projects/markdown/syntax)\ndoes not specify the syntax unambiguously. Here are some examples of\nquestions it does not answer:" -- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See [this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)\n\n2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank" -- " lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```\n\n4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)" -- "5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```\n\n7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```" -- "9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```\n\n11. Can list items include section headings? (`Markdown.pl` does not\n allow this, but does allow blockquotes to include headings.)\n\n ``` markdown\n - # Heading\n ```\n\n12. Can list items be empty?\n\n ``` markdown\n * a\n *\n * b\n ```\n\n13. Can link references be defined inside block quotes or list items?\n\n ``` markdown\n > Blockquote [foo].\n >\n > [foo]: /url\n ```\n\n14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n ``` markdown\n [foo]: /url1\n [foo]: /url2\n\n [foo][]\n ```" +- "1. How much indentation is needed for a sublist? The spec says that\n continuation paragraphs need to be indented four spaces, but is\n not fully explicit about sublists. It is natural to think that\n they, too, must be indented four spaces, but `Markdown.pl` does\n not require that. This is hardly a \"corner case,\" and divergences\n between implementations on this issue often lead to surprises for\n users in real documents. (See [this comment by John\n Gruber](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/1997).)" +- "2. Is a blank line needed before a block quote or heading?\n Most implementations do not require the blank line. However,\n this can lead to unexpected results in hard-wrapped text, and\n also to ambiguities in parsing (note that some implementations\n put the heading inside the blockquote, while others do not).\n (John Gruber has also spoken [in favor of requiring the blank\n lines](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2146).)\n\n3. Is a blank line needed before an indented code block?\n (`Markdown.pl` requires it, but this is not mentioned in the\n documentation, and some implementations do not require it.)\n\n ``` markdown\n paragraph\n code?\n ```" +- "4. What is the exact rule for determining when list items get\n wrapped in `

    ` tags? Can a list be partially \"loose\" and partially\n \"tight\"? What should we do with a list like this?\n\n ``` markdown\n 1. one\n\n 2. two\n 3. three\n ```\n\n Or this?\n\n ``` markdown\n 1. one\n - a\n\n - b\n 2. two\n ```\n\n (There are some relevant comments by John Gruber\n [here](https://web.archive.org/web/20170611172104/http://article.gmane.org/gmane.text.markdown.general/2554).)\n\n5. Can list markers be indented? Can ordered list markers be right-aligned?\n\n ``` markdown\n 8. item 1\n 9. item 2\n 10. item 2a\n ```\n\n6. Is this one list with a thematic break in its second item,\n or two lists separated by a thematic break?\n\n ``` markdown\n * a\n * * * * *\n * b\n ```" +- "7. When list markers change from numbers to bullets, do we have\n two lists or one? (The Markdown syntax description suggests two,\n but the perl scripts and many other implementations produce one.)\n\n ``` markdown\n 1. fee\n 2. fie\n - foe\n - fum\n ```\n\n8. What are the precedence rules for the markers of inline structure?\n For example, is the following a valid link, or does the code span\n take precedence ?\n\n ``` markdown\n [a backtick (`)](/url) and [another backtick (`)](/url).\n ```\n\n9. What are the precedence rules for markers of emphasis and strong\n emphasis? For example, how should the following be parsed?\n\n ``` markdown\n *foo *bar* baz*\n ```\n\n10. What are the precedence rules between block-level and inline-level\n structure? For example, how should the following be parsed?\n\n ``` markdown\n - `a long code span can contain a hyphen like this\n - and it can screw things up`\n ```" +- "11. Can list items include section headings? (`Markdown.pl` does not\n allow this, but does allow blockquotes to include headings.)\n\n ``` markdown\n - # Heading\n ```\n\n12. Can list items be empty?\n\n ``` markdown\n * a\n *\n * b\n ```\n\n13. Can link references be defined inside block quotes or list items?\n\n ``` markdown\n > Blockquote [foo].\n >\n > [foo]: /url\n ```\n\n14. If there are multiple definitions for the same reference, which takes\n precedence?\n\n ``` markdown\n [foo]: /url1\n [foo]: /url2\n\n [foo][]\n ```" - "In the absence of a spec, early implementers consulted `Markdown.pl`\nto resolve these ambiguities. But `Markdown.pl` was quite buggy, and\ngave manifestly bad results in many cases, so it was not a\nsatisfactory replacement for a spec.\n\nBecause there is no unambiguous spec, implementations have diverged\nconsiderably. As a result, users are often surprised to find that\na document that renders one way on one system (say, a GitHub wiki)\nrenders differently on another (say, converting to docbook using\npandoc). To make matters worse, because nothing in Markdown counts\nas a \"syntax error,\" the divergence often isn't discovered right away." - "## About this document\n\nThis document attempts to specify Markdown syntax unambiguously.\nIt contains many examples with side-by-side Markdown and\nHTML. These are intended to double as conformance tests. An\naccompanying script `spec_tests.py` can be used to run the tests\nagainst any Markdown program:\n\n python test/spec_tests.py --spec spec.txt --program PROGRAM\n\nSince this document describes how Markdown is to be parsed into\nan abstract syntax tree, it would have made sense to use an abstract\nrepresentation of the syntax tree instead of HTML. But HTML is capable\nof representing the structural distinctions we need to make, and the\nchoice of HTML for the tests makes it possible to run the tests against\nan implementation without writing an abstract syntax tree renderer." - "Note that not every feature of the HTML samples is mandated by\nthe spec. For example, the spec says what counts as a link\ndestination, but it doesn't mandate that non-ASCII characters in\nthe URL be percent-encoded. To use the automatic tests,\nimplementers will need to provide a renderer that conforms to\nthe expectations of the spec examples (percent-encoding\nnon-ASCII characters in URLs). But a conforming implementation\ncan use a different renderer and may choose not to\npercent-encode non-ASCII characters in URLs.\n\nThis document is generated from a text file, `spec.txt`, written\nin Markdown with a small extension for the side-by-side tests.\nThe script `tools/makespec.py` can be used to convert `spec.txt` into\nHTML or CommonMark (which can then be converted into other formats).\n\nIn the examples, the `→` character is used to represent tabs." @@ -85,9 +86,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n````;\n````\n.\n

    \n````````````````````````````````\n\n\n[Info strings] for backtick code blocks cannot contain backticks:\n\n```````````````````````````````` example\n``` aa ```\nfoo\n.\n

    aa\nfoo

    \n````````````````````````````````\n\n\n[Info strings] for tilde code blocks can contain backticks and tildes:\n\n```````````````````````````````` example\n~~~ aa ``` ~~~\nfoo\n~~~\n.\n
    foo\n
    \n````````````````````````````````\n\n\nClosing code fences cannot have [info strings]:" - "```````````````````````````````` example\n```\n``` aaa\n```\n.\n
    ``` aaa\n
    \n````````````````````````````````" - "## HTML blocks\n\nAn [HTML block](@) is a group of lines that is treated\nas raw HTML (and will not be escaped in HTML output).\n\nThere are seven kinds of [HTML block], which can be defined by their\nstart and end conditions. The block begins with a line that meets a\n[start condition](@) (after up to three optional spaces of indentation).\nIt ends with the first subsequent line that meets a matching\n[end condition](@), or the last line of the document, or the last line of\nthe [container block](#container-blocks) containing the current HTML\nblock, if no line is encountered that meets the [end condition]. If\nthe first line meets both the [start condition] and the [end\ncondition], the block will contain just that line." -- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n5. **Start condition:** line begins with the string\n``.\n\n6. **Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:** line is followed by a [blank line]." -- "7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line]." +- "1. **Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:** line contains an end tag\n`
    `, ``, ``, or `` (case-insensitive; it\nneed not match the start tag).\n\n2. **Start condition:** line begins with the string ``.\n\n3. **Start condition:** line begins with the string ``.\n\n4. **Start condition:** line begins with the string ``.\n\n5. **Start condition:** line begins with the string\n``." +- "6." +- "**Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:**" +- " line is followed by a [blank line].\n\n7. **Start condition:** line begins with a complete [open tag]\n(with any [tag name] other than `pre`, `script`,\n`style`, or `textarea`) or a complete [closing tag],\nfollowed by zero or more spaces and tabs, followed by the end of the line.\\\n**End condition:** line is followed by a [blank line]." - "HTML blocks continue until they are closed by their appropriate\n[end condition], or the last line of the document or other [container\nblock](#container-blocks). This means any HTML **within an HTML\nblock** that might otherwise be recognised as a start condition will\nbe ignored by the parser and passed through as-is, without changing\nthe parser's state.\n\nFor instance, `
    ` within an HTML block started by `` will not affect\nthe parser state; as the HTML block was started in by start condition 6, it\nwill end at any blank line. This can be surprising:\n\n```````````````````````````````` example\n
    \n
    \n**Hello**,\n\n_world_.\n
    \n
    \n.\n
    \n
    \n**Hello**,\n

    world.\n

    \n
    \n````````````````````````````````" - "In this case, the HTML block is terminated by the blank line — the `**Hello**`\ntext remains verbatim — and regular parsing resumes, with a paragraph,\nemphasised `world` and inline and block HTML following.\n\nAll types of [HTML blocks] except type 7 may interrupt\na paragraph. Blocks of type 7 may not interrupt a paragraph.\n(This restriction is intended to prevent unwanted interpretation\nof long tags inside a wrapped paragraph as starting HTML blocks.)\n\nSome simple examples follow. Here are some basic HTML blocks\nof type 6:\n\n```````````````````````````````` example\n\n \n \n \n
    \n hi\n
    \n\nokay.\n.\n\n \n \n \n
    \n hi\n
    \n

    okay.

    \n````````````````````````````````" - "```````````````````````````````` example\n \n*foo*\n````````````````````````````````\n\n\nHere we have two HTML blocks with a Markdown paragraph between them:\n\n```````````````````````````````` example\n
    \n\n*Markdown*\n\n
    \n.\n
    \n

    Markdown

    \n
    \n````````````````````````````````\n\n\nThe tag on the first line can be partial, as long\nas it is split where there would be whitespace:" @@ -136,8 +138,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "```````````````````````````````` example\n> > > foo\nbar\n.\n
    \n
    \n
    \n

    foo\nbar

    \n
    \n
    \n
    \n````````````````````````````````\n\n\n```````````````````````````````` example\n>>> foo\n> bar\n>>baz\n.\n
    \n
    \n
    \n

    foo\nbar\nbaz

    \n
    \n
    \n
    \n````````````````````````````````\n\n\nWhen including an indented code block in a block quote,\nremember that the [block quote marker] includes\nboth the `>` and a following space of indentation. So *five spaces* are needed\nafter the `>`:" - "```````````````````````````````` example\n> code\n\n> not code\n.\n
    \n
    code\n
    \n
    \n
    \n

    not code

    \n
    \n````````````````````````````````" - "## List items\n\nA [list marker](@) is a\n[bullet list marker] or an [ordered list marker].\n\nA [bullet list marker](@)\nis a `-`, `+`, or `*` character.\n\nAn [ordered list marker](@)\nis a sequence of 1--9 arabic digits (`0-9`), followed by either a\n`.` character or a `)` character. (The reason for the length\nlimit is that with 10 digits we start seeing integer overflows\nin some browsers.)\n\nThe following rules define [list items]:" -- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:\n\n 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if" -- " the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item." +- "1. **Basic case.** If a sequence of lines *Ls* constitute a sequence of\n blocks *Bs* starting with a character other than a space or tab, and *M* is\n a list marker of width *W* followed by 1 ≤ *N* ≤ 4 spaces of indentation,\n then the result of prepending *M* and the following spaces to the first line\n of *Ls*, and indenting subsequent lines of *Ls* by *W + N* spaces, is a\n list item with *Bs* as its contents. The type of the list item\n (bullet or ordered) is determined by the type of its list marker.\n If the list item is ordered, then it is also assigned a start\n number, based on the ordered list marker.\n\n Exceptions:" +- " 1. When the first list item in a [list] interrupts\n a paragraph---that is, when it starts on a line that would\n otherwise count as [paragraph continuation text]---then (a)\n the lines *Ls* must not begin with a blank line, and (b) if\n the list item is ordered, the start number must be 1.\n 2. If any line is a [thematic break][thematic breaks] then\n that line is not a list item." - "For example, let *Ls* be the lines\n\n```````````````````````````````` example\nA paragraph\nwith two lines.\n\n indented code\n\n> A block quote.\n.\n

    A paragraph\nwith two lines.

    \n
    indented code\n
    \n
    \n

    A block quote.

    \n
    \n````````````````````````````````\n\n\nAnd let *M* be the marker `1.`, and *N* = 2. Then rule #1 says\nthat the following is an ordered list item with start number 1,\nand the same contents as *Ls*:\n\n```````````````````````````````` example\n1. A paragraph\n with two lines.\n\n indented code\n\n > A block quote.\n.\n
      \n
    1. \n

      A paragraph\nwith two lines.

      \n
      indented code\n
      \n
      \n

      A block quote.

      \n
      \n
    2. \n
    \n````````````````````````````````" - "The most important thing to notice is that the position of\nthe text after the list marker determines how much indentation\nis needed in subsequent blocks in the list item. If the list\nmarker takes up two spaces of indentation, and there are three spaces between\nthe list marker and the next character other than a space or tab, then blocks\nmust be indented five spaces in order to fall under the list\nitem.\n\nHere are some examples showing how far content must be indented to be\nput under the list item:\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • one
    • \n
    \n

    two

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n- one\n\n two\n.\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n````````````````````````````````" - "```````````````````````````````` example\n - one\n\n two\n.\n
      \n
    • one
    • \n
    \n
     two\n
    \n````````````````````````````````\n\n\n```````````````````````````````` example\n - one\n\n two\n.\n
      \n
    • \n

      one

      \n

      two

      \n
    • \n
    \n````````````````````````````````\n\n\nIt is tempting to think of this in terms of columns: the continuation\nblocks must be indented at least to the column of the first character other than\na space or tab after the list marker. However, that is not quite right.\nThe spaces of indentation after the list marker determine how much relative\nindentation is needed. Which column this indentation reaches will depend on\nhow the list item is embedded in other constructions, as shown by\nthis example:" @@ -200,25 +202,25 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "First, some definitions. A [delimiter run](@) is either\na sequence of one or more `*` characters that is not preceded or\nfollowed by a non-backslash-escaped `*` character, or a sequence\nof one or more `_` characters that is not preceded or followed by\na non-backslash-escaped `_` character.\n\nA [left-flanking delimiter run](@) is\na [delimiter run] that is (1) not followed by [Unicode whitespace],\nand either (2a) not followed by a [Unicode punctuation character], or\n(2b) followed by a [Unicode punctuation character] and\npreceded by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace." - "A [right-flanking delimiter run](@) is\na [delimiter run] that is (1) not preceded by [Unicode whitespace],\nand either (2a) not preceded by a [Unicode punctuation character], or\n(2b) preceded by a [Unicode punctuation character] and\nfollowed by [Unicode whitespace] or a [Unicode punctuation character].\nFor purposes of this definition, the beginning and the end of\nthe line count as Unicode whitespace.\n\nHere are some examples of delimiter runs.\n\n - left-flanking but not right-flanking:\n\n ```\n ***abc\n _abc\n **\"abc\"\n _\"abc\"\n ```\n\n - right-flanking but not left-flanking:\n\n ```\n abc***\n abc_\n \"abc\"**\n \"abc\"_\n ```\n\n - Both left and right-flanking:\n\n ```\n abc***def\n \"abc\"_\"def\"\n ```\n\n - Neither left nor right-flanking:\n\n ```\n abc *** def\n a _ b\n ```" - "(The idea of distinguishing left-flanking and right-flanking\ndelimiter runs based on the character before and the character\nafter comes from Roopesh Chander's\n[vfmd](https://web.archive.org/web/20220608143320/http://www.vfmd.org/vfmd-spec/specification/#procedure-for-identifying-emphasis-tags).\nvfmd uses the terminology \"emphasis indicator string\" instead of \"delimiter\nrun,\" and its rules for distinguishing left- and right-flanking runs\nare a bit more complex than the ones given here.)\n\nThe following rules define emphasis and strong emphasis:" -- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run].\n\n6. A double `__` [can open strong emphasis] iff" -- " it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." +- "1. A single `*` character [can open emphasis](@)\n iff (if and only if) it is part of a [left-flanking delimiter run].\n\n2. A single `_` character [can open emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n3. A single `*` character [can close emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n4. A single `_` character [can close emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character].\n\n5. A double `**` [can open strong emphasis](@)\n iff it is part of a [left-flanking delimiter run]." +- "6. A double `__` [can open strong emphasis] iff\n it is part of a [left-flanking delimiter run]\n and either (a) not part of a [right-flanking delimiter run]\n or (b) part of a [right-flanking delimiter run]\n preceded by a [Unicode punctuation character].\n\n7. A double `**` [can close strong emphasis](@)\n iff it is part of a [right-flanking delimiter run].\n\n8. A double `__` [can close strong emphasis] iff\n it is part of a [right-flanking delimiter run]\n and either (a) not part of a [left-flanking delimiter run]\n or (b) part of a [left-flanking delimiter run]\n followed by a [Unicode punctuation character]." - "9. Emphasis begins with a delimiter that [can open emphasis] and ends\n with a delimiter that [can close emphasis], and that uses the same\n character (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both\n open and close emphasis, then the sum of the lengths of the\n delimiter runs containing the opening and closing delimiters\n must not be a multiple of 3 unless both lengths are\n multiples of 3.\n\n10. Strong emphasis begins with a delimiter that\n [can open strong emphasis] and ends with a delimiter that\n [can close strong emphasis], and that uses the same character\n (`_` or `*`) as the opening delimiter. The\n opening and closing delimiters must belong to separate\n [delimiter runs]. If one of the delimiters can both open\n and close strong emphasis, then the sum of the lengths of\n the delimiter runs containing the opening and closing\n delimiters must not be a multiple of 3 unless both lengths\n are multiples of 3." - "11. A literal `*` character cannot occur at the beginning or end of\n `*`-delimited emphasis or `**`-delimited strong emphasis, unless it\n is backslash-escaped.\n\n12. A literal `_` character cannot occur at the beginning or end of\n `_`-delimited emphasis or `__`-delimited strong emphasis, unless it\n is backslash-escaped.\n\nWhere rules 1--12 above are compatible with multiple parsings,\nthe following principles resolve ambiguity:" -- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`.\n\n17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis." -- "So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*
    foo*` rather than as\n `[foo](bar)`.\n\nThese rules can be illustrated through a series of examples.\n\nRule 1:\n\n```````````````````````````````` example\n*foo bar*\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na * foo bar*\n.\n

    a * foo bar*

    \n````````````````````````````````" -- "This is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na*\"foo\"*\n.\n

    a*"foo"*

    \n````````````````````````````````\n\n\nUnicode nonbreaking spaces count as whitespace, too:\n\n```````````````````````````````` example\n* a *\n.\n

    * a *

    \n````````````````````````````````\n\n\nUnicode symbols count as punctuation, too:\n\n```````````````````````````````` example\n*$*alpha.\n\n*£*bravo.\n\n*€*charlie.\n.\n

    *$*alpha.

    \n

    *£*bravo.

    \n

    *€*charlie.

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is permitted:" -- "```````````````````````````````` example\nfoo*bar*\n.\n

    foobar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5*6*78\n.\n

    5678

    \n````````````````````````````````\n\n\nRule 2:\n\n```````````````````````````````` example\n_foo bar_\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is followed by\nwhitespace:\n\n```````````````````````````````` example\n_ foo bar_\n.\n

    _ foo bar_

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is preceded\nby an alphanumeric and followed by punctuation:" -- "```````````````````````````````` example\na_\"foo\"_\n.\n

    a_"foo"_

    \n````````````````````````````````\n\n\nEmphasis with `_` is not allowed inside words:\n\n```````````````````````````````` example\nfoo_bar_\n.\n

    foo_bar_

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5_6_78\n.\n

    5_6_78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням_стремятся_\n.\n

    пристаням_стремятся_

    \n````````````````````````````````\n\n\nHere `_` does not generate emphasis, because the first delimiter run\nis right-flanking and the second left-flanking:" -- "```````````````````````````````` example\naa_\"bb\"_cc\n.\n

    aa_"bb"_cc

    \n````````````````````````````````\n\n\nThis is emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:\n\n```````````````````````````````` example\nfoo-_(bar)_\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\nRule 3:\n\nThis is not emphasis, because the closing delimiter does\nnot match the opening delimiter:\n\n```````````````````````````````` example\n_foo*\n.\n

    _foo*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the closing `*` is preceded by\nwhitespace:" -- "```````````````````````````````` example\n*foo bar *\n.\n

    *foo bar *

    \n````````````````````````````````\n\n\nA line ending also counts as whitespace:\n\n```````````````````````````````` example\n*foo bar\n*\n.\n

    *foo bar\n*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the second `*` is\npreceded by punctuation and followed by an alphanumeric\n(hence it is not part of a [right-flanking delimiter run]:\n\n```````````````````````````````` example\n*(*foo)\n.\n

    *(*foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith this example:" -- "```````````````````````````````` example\n*(*foo*)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is allowed:\n\n```````````````````````````````` example\n*foo*bar\n.\n

    foobar

    \n````````````````````````````````\n\n\n\nRule 4:\n\nThis is not emphasis, because the closing `_` is preceded by\nwhitespace:\n\n```````````````````````````````` example\n_foo bar _\n.\n

    _foo bar _

    \n````````````````````````````````\n\n\nThis is not emphasis, because the second `_` is\npreceded by punctuation and followed by an alphanumeric:" -- "```````````````````````````````` example\n_(_foo)\n.\n

    _(_foo)

    \n````````````````````````````````\n\n\nThis is emphasis within emphasis:\n\n```````````````````````````````` example\n_(_foo_)_\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis is disallowed for `_`:\n\n```````````````````````````````` example\n_foo_bar\n.\n

    _foo_bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n_пристаням_стремятся\n.\n

    _пристаням_стремятся

    \n````````````````````````````````" -- "```````````````````````````````` example\n_foo_bar_baz_\n.\n

    foo_bar_baz

    \n````````````````````````````````\n\n\nThis is emphasis, even though the closing delimiter is\nboth left- and right-flanking, because it is followed by\npunctuation:\n\n```````````````````````````````` example\n_(bar)_.\n.\n

    (bar).

    \n````````````````````````````````\n\n\nRule 5:\n\n```````````````````````````````` example\n**foo bar**\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n\n```````````````````````````````` example\n** foo bar**\n.\n

    ** foo bar**

    \n````````````````````````````````" -- "This is not strong emphasis, because the opening `**` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na**\"foo\"**\n.\n

    a**"foo"**

    \n````````````````````````````````\n\n\nIntraword strong emphasis with `**` is permitted:\n\n```````````````````````````````` example\nfoo**bar**\n.\n

    foobar

    \n````````````````````````````````\n\n\nRule 6:\n\n```````````````````````````````` example\n__foo bar__\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:" -- "```````````````````````````````` example\n__ foo bar__\n.\n

    __ foo bar__

    \n````````````````````````````````\n\n\nA line ending counts as whitespace:\n```````````````````````````````` example\n__\nfoo bar__\n.\n

    __\nfoo bar__

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening `__` is preceded\nby an alphanumeric and followed by punctuation:\n\n```````````````````````````````` example\na__\"foo\"__\n.\n

    a__"foo"__

    \n````````````````````````````````\n\n\nIntraword strong emphasis is forbidden with `__`:\n\n```````````````````````````````` example\nfoo__bar__\n.\n

    foo__bar__

    \n````````````````````````````````" -- "```````````````````````````````` example\n5__6__78\n.\n

    5__6__78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням__стремятся__\n.\n

    пристаням__стремятся__

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__foo, __bar__, baz__\n.\n

    foo, bar, baz

    \n````````````````````````````````\n\n\nThis is strong emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:" -- "```````````````````````````````` example\nfoo-__(bar)__\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\n\nRule 7:\n\nThis is not strong emphasis, because the closing delimiter is preceded\nby whitespace:\n\n```````````````````````````````` example\n**foo bar **\n.\n

    **foo bar **

    \n````````````````````````````````\n\n\n(Nor can it be interpreted as an emphasized `*foo bar *`, because of\nRule 11.)\n\nThis is not strong emphasis, because the second `**` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n**(**foo)\n.\n

    **(**foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith these examples:" -- "```````````````````````````````` example\n*(**foo**)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**Gomphocarpus (*Gomphocarpus physocarpus*, syn.\n*Asclepias physocarpa*)**\n.\n

    Gomphocarpus (Gomphocarpus physocarpus, syn.\nAsclepias physocarpa)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**foo \"*bar*\" foo**\n.\n

    foo "bar" foo

    \n````````````````````````````````\n\n\nIntraword emphasis:" +- "13. The number of nestings should be minimized. Thus, for example,\n an interpretation `...` is always preferred to\n `...`.\n\n14. An interpretation `...` is always\n preferred to `...`.\n\n15. When two potential emphasis or strong emphasis spans overlap,\n so that the second begins before the first ends and ends after\n the first ends, the first takes precedence. Thus, for example,\n `*foo _bar* baz_` is parsed as `foo _bar baz_` rather\n than `*foo bar* baz`.\n\n16. When there are two potential emphasis or strong emphasis spans\n with the same closing delimiter, the shorter one (the one that\n opens later) takes precedence. Thus, for example,\n `**foo **bar baz**` is parsed as `**foo bar baz`\n rather than `foo **bar baz`." +- "17. Inline code spans, links, images, and HTML tags group more tightly\n than emphasis. So, when there is a choice between an interpretation\n that contains one of these elements and one that does not, the\n former always wins. Thus, for example, `*[foo*](bar)` is\n parsed as `*foo*` rather than as\n `[foo](bar)`." +- "These rules can be illustrated through a series of examples.\n\nRule 1:\n\n```````````````````````````````` example\n*foo bar*\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is followed by\nwhitespace, and hence not part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na * foo bar*\n.\n

    a * foo bar*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `*` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na*\"foo\"*\n.\n

    a*"foo"*

    \n````````````````````````````````\n\n\nUnicode nonbreaking spaces count as whitespace, too:" +- "```````````````````````````````` example\n* a *\n.\n

    * a *

    \n````````````````````````````````\n\n\nUnicode symbols count as punctuation, too:\n\n```````````````````````````````` example\n*$*alpha.\n\n*£*bravo.\n\n*€*charlie.\n.\n

    *$*alpha.

    \n

    *£*bravo.

    \n

    *€*charlie.

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is permitted:\n\n```````````````````````````````` example\nfoo*bar*\n.\n

    foobar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5*6*78\n.\n

    5678

    \n````````````````````````````````" +- "Rule 2:\n\n```````````````````````````````` example\n_foo bar_\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is followed by\nwhitespace:\n\n```````````````````````````````` example\n_ foo bar_\n.\n

    _ foo bar_

    \n````````````````````````````````\n\n\nThis is not emphasis, because the opening `_` is preceded\nby an alphanumeric and followed by punctuation:\n\n```````````````````````````````` example\na_\"foo\"_\n.\n

    a_"foo"_

    \n````````````````````````````````\n\n\nEmphasis with `_` is not allowed inside words:\n\n```````````````````````````````` example\nfoo_bar_\n.\n

    foo_bar_

    \n````````````````````````````````" +- "```````````````````````````````` example\n5_6_78\n.\n

    5_6_78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням_стремятся_\n.\n

    пристаням_стремятся_

    \n````````````````````````````````\n\n\nHere `_` does not generate emphasis, because the first delimiter run\nis right-flanking and the second left-flanking:\n\n```````````````````````````````` example\naa_\"bb\"_cc\n.\n

    aa_"bb"_cc

    \n````````````````````````````````\n\n\nThis is emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:" +- "```````````````````````````````` example\nfoo-_(bar)_\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\nRule 3:\n\nThis is not emphasis, because the closing delimiter does\nnot match the opening delimiter:\n\n```````````````````````````````` example\n_foo*\n.\n

    _foo*

    \n````````````````````````````````\n\n\nThis is not emphasis, because the closing `*` is preceded by\nwhitespace:\n\n```````````````````````````````` example\n*foo bar *\n.\n

    *foo bar *

    \n````````````````````````````````\n\n\nA line ending also counts as whitespace:\n\n```````````````````````````````` example\n*foo bar\n*\n.\n

    *foo bar\n*

    \n````````````````````````````````" +- "This is not emphasis, because the second `*` is\npreceded by punctuation and followed by an alphanumeric\n(hence it is not part of a [right-flanking delimiter run]:\n\n```````````````````````````````` example\n*(*foo)\n.\n

    *(*foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith this example:\n\n```````````````````````````````` example\n*(*foo*)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis with `*` is allowed:\n\n```````````````````````````````` example\n*foo*bar\n.\n

    foobar

    \n````````````````````````````````\n\n\n\nRule 4:\n\nThis is not emphasis, because the closing `_` is preceded by\nwhitespace:" +- "```````````````````````````````` example\n_foo bar _\n.\n

    _foo bar _

    \n````````````````````````````````\n\n\nThis is not emphasis, because the second `_` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n_(_foo)\n.\n

    _(_foo)

    \n````````````````````````````````\n\n\nThis is emphasis within emphasis:\n\n```````````````````````````````` example\n_(_foo_)_\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword emphasis is disallowed for `_`:\n\n```````````````````````````````` example\n_foo_bar\n.\n

    _foo_bar

    \n````````````````````````````````" +- "```````````````````````````````` example\n_пристаням_стремятся\n.\n

    _пристаням_стремятся

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n_foo_bar_baz_\n.\n

    foo_bar_baz

    \n````````````````````````````````\n\n\nThis is emphasis, even though the closing delimiter is\nboth left- and right-flanking, because it is followed by\npunctuation:\n\n```````````````````````````````` example\n_(bar)_.\n.\n

    (bar).

    \n````````````````````````````````\n\n\nRule 5:\n\n```````````````````````````````` example\n**foo bar**\n.\n

    foo bar

    \n````````````````````````````````" +- "This is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n\n```````````````````````````````` example\n** foo bar**\n.\n

    ** foo bar**

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening `**` is preceded\nby an alphanumeric and followed by punctuation, and hence\nnot part of a [left-flanking delimiter run]:\n\n```````````````````````````````` example\na**\"foo\"**\n.\n

    a**"foo"**

    \n````````````````````````````````\n\n\nIntraword strong emphasis with `**` is permitted:\n\n```````````````````````````````` example\nfoo**bar**\n.\n

    foobar

    \n````````````````````````````````\n\n\nRule 6:" +- "```````````````````````````````` example\n__foo bar__\n.\n

    foo bar

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening delimiter is\nfollowed by whitespace:\n\n```````````````````````````````` example\n__ foo bar__\n.\n

    __ foo bar__

    \n````````````````````````````````\n\n\nA line ending counts as whitespace:\n```````````````````````````````` example\n__\nfoo bar__\n.\n

    __\nfoo bar__

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the opening `__` is preceded\nby an alphanumeric and followed by punctuation:\n\n```````````````````````````````` example\na__\"foo\"__\n.\n

    a__"foo"__

    \n````````````````````````````````" +- "Intraword strong emphasis is forbidden with `__`:\n\n```````````````````````````````` example\nfoo__bar__\n.\n

    foo__bar__

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n5__6__78\n.\n

    5__6__78

    \n````````````````````````````````\n\n\n```````````````````````````````` example\nпристаням__стремятся__\n.\n

    пристаням__стремятся__

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__foo, __bar__, baz__\n.\n

    foo, bar, baz

    \n````````````````````````````````" +- "This is strong emphasis, even though the opening delimiter is\nboth left- and right-flanking, because it is preceded by\npunctuation:\n\n```````````````````````````````` example\nfoo-__(bar)__\n.\n

    foo-(bar)

    \n````````````````````````````````\n\n\n\nRule 7:\n\nThis is not strong emphasis, because the closing delimiter is preceded\nby whitespace:\n\n```````````````````````````````` example\n**foo bar **\n.\n

    **foo bar **

    \n````````````````````````````````\n\n\n(Nor can it be interpreted as an emphasized `*foo bar *`, because of\nRule 11.)\n\nThis is not strong emphasis, because the second `**` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n**(**foo)\n.\n

    **(**foo)

    \n````````````````````````````````" +- "The point of this restriction is more easily appreciated\nwith these examples:\n\n```````````````````````````````` example\n*(**foo**)*\n.\n

    (foo)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**Gomphocarpus (*Gomphocarpus physocarpus*, syn.\n*Asclepias physocarpa*)**\n.\n

    Gomphocarpus (Gomphocarpus physocarpus, syn.\nAsclepias physocarpa)

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n**foo \"*bar*\" foo**\n.\n

    foo "bar" foo

    \n````````````````````````````````\n\n\nIntraword emphasis:" - "```````````````````````````````` example\n**foo**bar\n.\n

    foobar

    \n````````````````````````````````\n\n\nRule 8:\n\nThis is not strong emphasis, because the closing delimiter is\npreceded by whitespace:\n\n```````````````````````````````` example\n__foo bar __\n.\n

    __foo bar __

    \n````````````````````````````````\n\n\nThis is not strong emphasis, because the second `__` is\npreceded by punctuation and followed by an alphanumeric:\n\n```````````````````````````````` example\n__(__foo)\n.\n

    __(__foo)

    \n````````````````````````````````\n\n\nThe point of this restriction is more easily appreciated\nwith this example:" - "```````````````````````````````` example\n_(__foo__)_\n.\n

    (foo)

    \n````````````````````````````````\n\n\nIntraword strong emphasis is forbidden with `__`:\n\n```````````````````````````````` example\n__foo__bar\n.\n

    __foo__bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__пристаням__стремятся\n.\n

    __пристаням__стремятся

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n__foo__bar__baz__\n.\n

    foo__bar__baz

    \n````````````````````````````````" - "This is strong emphasis, even though the closing delimiter is\nboth left- and right-flanking, because it is followed by\npunctuation:\n\n```````````````````````````````` example\n__(bar)__.\n.\n

    (bar).

    \n````````````````````````````````\n\n\nRule 9:\n\nAny nonempty sequence of inline elements can be the contents of an\nemphasized span.\n\n```````````````````````````````` example\n*foo [bar](/url)*\n.\n

    foo bar

    \n````````````````````````````````\n\n\n```````````````````````````````` example\n*foo\nbar*\n.\n

    foo\nbar

    \n````````````````````````````````\n\n\nIn particular, emphasis and strong emphasis can be nested\ninside emphasis:" @@ -328,5 +330,6 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "#### *look for link or image*\n\nStarting at the top of the delimiter stack, we look backwards\nthrough the stack for an opening `[` or `![` delimiter." - "- If we don't find one, we return a literal text node `]`.\n\n- If we do find one, but it's not *active*, we remove the inactive\n delimiter from the stack, and return a literal text node `]`.\n\n- If we find one and it's active, then we parse ahead to see if\n we have an inline link/image, reference link/image, collapsed reference\n link/image, or shortcut reference link/image.\n\n + If we don't, then we remove the opening delimiter from the\n delimiter stack and return a literal text node `]`.\n\n + If we do, then\n\n * We return a link or image node whose children are the inlines\n after the text node pointed to by the opening delimiter.\n\n * We run *process emphasis* on these inlines, with the `[` opener\n as `stack_bottom`.\n\n * We remove the opening delimiter.\n\n * If we have a link (and not an image), we also set all\n `[` delimiters before the opening delimiter to *inactive*. (This\n will prevent us from getting links within links.)" - "#### *process emphasis*\n\nParameter `stack_bottom` sets a lower bound to how far we\ndescend in the [delimiter stack]. If it is NULL, we can\ngo all the way to the bottom. Otherwise, we stop before\nvisiting `stack_bottom`.\n\nLet `current_position` point to the element on the [delimiter stack]\njust above `stack_bottom` (or the first element if `stack_bottom`\nis NULL).\n\nWe keep track of the `openers_bottom` for each delimiter\ntype (`*`, `_`), indexed to the length of the closing delimiter run\n(modulo 3) and to whether the closing delimiter can also be an\nopener. Initialize this to `stack_bottom`.\n\nThen we repeat the following until we run out of potential\nclosers:" -- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter).\n\n- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. If the closing node is removed, reset" -- " `current_position` to the next element in the stack.\n\n- If none is found:\n\n + Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack." +- "- Move `current_position` forward in the delimiter stack (if needed)\n until we find the first potential closer with delimiter `*` or `_`.\n (This will be the potential closer closest\n to the beginning of the input -- the first one in parse order.)\n\n- Now, look back in the stack (staying above `stack_bottom` and\n the `openers_bottom` for this delimiter type) for the\n first matching potential opener (\"matching\" means same delimiter)." +- "- If one is found:\n\n + Figure out whether we have emphasis or strong emphasis:\n if both closer and opener spans have length >= 2, we have\n strong, otherwise regular.\n\n + Insert an emph or strong emph node accordingly, after\n the text node corresponding to the opener.\n\n + Remove any delimiters between the opener and closer from\n the delimiter stack.\n\n + Remove 1 (for regular emph) or 2 (for strong emph) delimiters\n from the opening and closing text nodes. If they become empty\n as a result, remove them and remove the corresponding element\n of the delimiter stack. If the closing node is removed, reset\n `current_position` to the next element in the stack.\n\n- If none is found:" +- "+ Set `openers_bottom` to the element before `current_position`.\n (We know that there are no openers for this kind of closer up to and\n including this point, so this puts a lower bound on future searches.)\n\n + If the closer at `current_position` is not a potential opener,\n remove it from the delimiter stack (since we know it can't\n be a closer either).\n\n + Advance `current_position` to the next element in the stack.\n\nAfter we're done, we remove all delimiters above `stack_bottom` from the\ndelimiter stack." diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md.snap index ce42f0d..58a3bd8 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@commonmark_spec.md.snap @@ -3,7 +3,8 @@ source: tests/text_splitter_snapshots.rs expression: chunks input_file: tests/inputs/markdown/commonmark_spec.md --- -- "---\ntitle: CommonMark Spec\nauthor: John MacFarlane" +- "---" +- "title: CommonMark Spec\nauthor: John MacFarlane" - "version: '0.31.2'" - "date: '2024-01-28'" - "license: '[CC-BY-SA 4.0](https://creativecommons.org" @@ -16,8 +17,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - help from Aaron Swartz) and released in 2004 in the form of - a - "[syntax description](https://daringfireball.net/projects/markdown/syntax" -- ")\nand a Perl script (`Markdown.pl`" -- ) for converting Markdown to +- ")\nand a Perl script (`Markdown.pl`) for converting Markdown to" - "HTML. In the next decade, dozens of implementations were" - developed in many languages. Some extended the original - "Markdown syntax with conventions for footnotes, tables, and" @@ -29,17 +29,19 @@ input_file: tests/inputs/markdown/commonmark_spec.md - What distinguishes Markdown from many other lightweight markup - "syntaxes, which are often easier to write, is its readability." - "As Gruber writes:" -- "> The overriding design goal for Markdown's formatting syntax is" -- "> to make it as readable as possible. The idea is that a\n>" +- ">" +- "The overriding design goal for Markdown's formatting syntax is\n>" +- "to make it as readable as possible. The idea is that a\n>" - "Markdown-formatted document should be publishable as-is, as\n>" - "plain text, without looking like it's been marked up with tags\n>" - "or formatting instructions.\n> (" - ")" - The point can be illustrated by comparing a sample of -- "[AsciiDoc](https://asciidoc.org/)" -- " with\nan equivalent sample of Markdown. Here is a sample of" +- "[AsciiDoc](https://asciidoc.org/) with" +- an equivalent sample of Markdown. Here is a sample of - "AsciiDoc from the AsciiDoc manual:" -- "```\n1. List item one.\n+" +- "```" +- "1. List item one.\n+" - "List item one continued with a second paragraph followed by an\nIndented block.\n+" - ".................\n$ ls *.sh\n$ mv *.sh ~/tmp\n................." - "+\nList item continued with a third paragraph.\n\n2." @@ -49,7 +51,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This paragraph is part of the preceding list item.\n\nb. List item b." - "This paragraph belongs to item two of the outer list.\n--\n```" - "And here is the equivalent in Markdown:" -- "```\n1. List item one." +- "```" +- 1. List item one. - List item one continued with a second paragraph followed by an - " Indented block.\n\n $ ls *.sh" - $ mv *.sh ~/tmp @@ -60,102 +63,110 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " This paragraph is part of the preceding list item.\n\n 2." - "List item b.\n\n This paragraph belongs to item two of the outer list." - "```" -- "The AsciiDoc version is, arguably, easier to write." -- "You don't need" -- to worry about indentation. But the Markdown version is much easier +- "The AsciiDoc version is, arguably, easier to write. You don'" +- "t need\nto worry about indentation. But the Markdown version is much easier" - to read. The nesting of list items is apparent to the eye in the - "source, not just in the processed document." - "## Why is a spec needed?" -- "John Gruber's [canonical description of Markdown's" -- "syntax](https://daringfireball.net/projects/markdown/syntax)" +- "John Gruber's" +- "[canonical description of Markdown's\nsyntax" +- "](https://daringfireball.net/projects/markdown/syntax)" - does not specify the syntax unambiguously. Here are some examples of - "questions it does not answer:" -- 1. How much indentation is needed for a sublist? -- The spec says that +- "1." +- How much indentation is needed for a sublist? The spec says that - "continuation paragraphs need to be indented four spaces, but is" - not fully explicit about sublists. It is natural to think that - "they, too, must be indented four spaces, but `Markdown.pl`" -- "does\n not require that. This is hardly a \"corner case,\"" +- " does\n not require that. This is hardly a \"corner case,\"" - " and divergences\n between implementations on this issue often lead to surprises for" - users in real documents. (See -- "[this comment by John" -- "Gruber](https://web.archive.org/web/20170611172104/http" -- "://article.gmane.org/gmane.text.markdown.general/1997" -- ).) -- 2. Is a blank line needed before a block quote or heading? +- "[this comment by John\n Gruber" +- "](https://web.archive.org/web/20170611172104/http://" +- article.gmane.org/gmane.text.markdown.general/1997) +- ".)" +- "2." +- Is a blank line needed before a block quote or heading? - "Most implementations do not require the blank line. However," - "this can lead to unexpected results in hard-wrapped text, and" - also to ambiguities in parsing (note that some implementations - "put the heading inside the blockquote, while others do not)." - (John Gruber has also spoken -- "[in favor of requiring the blank" -- "lines](https://web.archive.org/web/20170611172104/http://" -- article.gmane.org/gmane.text.markdown.general/2146). -- ) -- 3. Is a blank line needed before an indented code block? -- "(`Markdown.pl`" -- "requires it, but this is not mentioned in the" +- "[in favor of requiring the blank\n lines" +- "](https://web.archive.org/web/20170611172104/http://" +- article.gmane.org/gmane.text.markdown.general/2146) +- ".)" +- "3." +- "Is a blank line needed before an indented code block?\n (" +- "`Markdown.pl` requires it, but this is not mentioned in the" - "documentation, and some implementations do not require it.)" - "``` markdown\n paragraph\n code?\n ```" -- 4. What is the exact rule for determining when list items get -- "wrapped in `

    `" -- "tags? Can a list be partially \"loose\" and partially" -- "\"tight\"? What should we do with a list like this?" +- "4." +- "What is the exact rule for determining when list items get\n wrapped in" +- "`

    ` tags? Can a list be partially \"loose\"" +- " and partially\n \"tight\"" +- "? What should we do with a list like this?" - "``` markdown\n 1. one\n\n 2. two" -- " 3. three\n ```\n\n Or this?" +- "3. three\n ```\n\n Or this?" - " ``` markdown\n 1. one\n - a" -- " - b\n 2. two\n ```" +- " - b\n 2. two\n ```" - (There are some relevant comments by John Gruber - "[here](https://web.archive.org/web/20170611172104/http" - "://article.gmane.org/gmane.text.markdown.general/2554" - ).) -- 5. Can list markers be indented? -- Can ordered list markers be right-aligned? -- "``` markdown\n 8. item 1" -- " 9. item 2\n 10. item 2a" +- "5." +- Can list markers be indented? Can ordered list markers be right-aligned? +- " ``` markdown\n 8. item 1" +- " 9. item 2\n 10. item 2a" - "```" -- "6. Is this one list with a thematic break in its second item," +- "6." +- "Is this one list with a thematic break in its second item," - or two lists separated by a thematic break? - "``` markdown\n * a\n * * * * *\n * b" - "```" -- "7. When list markers change from numbers to bullets, do we have" +- "7." +- "When list markers change from numbers to bullets, do we have" - "two lists or one? (The Markdown syntax description suggests two," - but the perl scripts and many other implementations produce one.) - "``` markdown\n 1. fee\n 2. fie" -- " - foe\n - fum\n ```" -- 8. What are the precedence rules for the markers of inline structure? +- "- foe\n - fum\n ```" +- "8." +- What are the precedence rules for the markers of inline structure? - "For example, is the following a valid link, or does the code span" - take precedence ? - "``` markdown" - "[a backtick (`)](/url) and [another backtick (`)](/" - "url).\n ```" -- 9. What are the precedence rules for markers of emphasis and strong +- "9." +- What are the precedence rules for markers of emphasis and strong - "emphasis? For example, how should the following be parsed?" - "``` markdown\n *foo *bar* baz*\n ```" -- 10. What are the precedence rules between block-level and inline-level +- "10." +- What are the precedence rules between block-level and inline-level - "structure? For example, how should the following be parsed?" - "``` markdown" - "- `a long code span can contain a hyphen like this" - " - and it can screw things up`\n ```" -- "11. Can list items include section headings? (`Markdown.pl` does not" +- "11." +- "Can list items include section headings? (`Markdown.pl` does not" - "allow this, but does allow blockquotes to include headings.)" - "``` markdown\n - # Heading\n ```" -- "12. Can list items be empty?\n\n ``` markdown\n * a" -- " *\n * b\n ```" +- 12. Can list items be empty? +- " ``` markdown\n * a\n *\n * b\n ```" - 13. Can link references be defined inside block quotes or list items? - " ``` markdown\n > Blockquote [foo].\n >" -- " > [foo]: /url\n ```" -- "14. If there are multiple definitions for the same reference, which takes" -- precedence? -- "``` markdown\n [foo]: /url1" -- " [foo]: /url2\n\n [foo][]\n ```" +- "> [foo]: /url\n ```" +- "14." +- "If there are multiple definitions for the same reference, which takes\n precedence?" +- " ``` markdown\n [foo]: /url1" +- "[foo]: /url2\n\n [foo][]\n ```" - "In the absence of a spec, early implementers consulted `Markdown.pl`" - "to resolve these ambiguities. But `Markdown.pl`" - "was quite buggy, and" - "gave manifestly bad results in many cases, so it was not a" - satisfactory replacement for a spec. -- "Because there is no unambiguous spec, implementations have diverged\nconsiderably." -- "As a result, users are often surprised to find that" +- "Because there is no unambiguous spec, implementations have diverged" +- "considerably. As a result, users are often surprised to find that" - "a document that renders one way on one system (say, a GitHub wiki)" - "renders differently on another (say, converting to docbook using" - "pandoc). To make matters worse, because nothing in Markdown counts" @@ -164,8 +175,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - This document attempts to specify Markdown syntax unambiguously. - It contains many examples with side-by-side Markdown and - HTML. These are intended to double as conformance tests. An -- "accompanying script `spec_tests.py`" -- " can be used to run the tests\nagainst any Markdown program:" +- "accompanying script `spec_tests.py` can be used to run the tests" +- "against any Markdown program:" - python test/spec_tests.py --spec spec.txt --program PROGRAM - Since this document describes how Markdown is to be parsed into - "an abstract syntax tree, it would have made sense to use an abstract" @@ -173,8 +184,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "of representing the structural distinctions we need to make, and the" - choice of HTML for the tests makes it possible to run the tests against - an implementation without writing an abstract syntax tree renderer. -- "Note that not every feature of the HTML samples is mandated by\nthe spec." -- "For example, the spec says what counts as a link" +- Note that not every feature of the HTML samples is mandated by +- "the spec. For example, the spec says what counts as a link" - "destination, but it doesn't mandate that non-ASCII characters in" - "the URL be percent-encoded. To use the automatic tests," - implementers will need to provide a renderer that conforms to @@ -185,7 +196,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "This document is generated from a text file, `spec.txt`, written" - "in Markdown with a small extension for the side-by-side tests.\nThe script" - "`tools/makespec.py` can be used to convert `spec.txt`" -- "into\nHTML or CommonMark (which can then be converted into other formats)." +- " into\nHTML or CommonMark (which can then be converted into other formats)." - "In the examples, the `→` character is used to represent tabs." - "# Preliminaries" - "## Characters and lines" @@ -194,43 +205,46 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "code points (for example, combining accents) do not correspond to" - "characters in an intuitive sense, all code points count as characters" - for purposes of this spec. -- This spec does not specify an encoding; it thinks of lines as composed -- "of [characters] rather than bytes. A conforming parser may be limited" +- "This spec does not specify an encoding; it thinks of lines as composed\nof" +- "[characters] rather than bytes. A conforming parser may be limited" - to a certain encoding. - "A [line](@) is a sequence of zero or more [characters]" -- "other than line feed (`U+000A`) or carriage return (`U+" -- "000D`),\nfollowed by a [line ending]" +- "other than line feed (`U+000A`) or carriage return (" +- "`U+000D`),\nfollowed by a [line ending]" - or by the end of file. -- "A [line ending](@) is a line feed (`U+000A" -- "`), a carriage return\n(`U+000D`" +- "A [line ending](@) is a line feed (" +- "`U+000A`), a carriage return\n(" +- "`U+000D`" - ") not followed by a line feed, or a carriage return and a" - following line feed. -- "A line containing no characters, or a line containing only spaces" -- "(`U+0020`) or tabs (`U+0009`), is" -- "called a [blank line](@)." +- "A line containing no characters, or a line containing only spaces\n(" +- "`U+0020`) or tabs (`U+0009`" +- "), is called a [blank line](@)." - "The following definitions of character classes will be used in this spec:" -- "A [Unicode whitespace character](@) is a character in the Unicode `" -- "Zs` general\ncategory, or a tab (`U+0009`" -- "), line feed (`U+000A`), form feed (`U+" -- "000C`), or\ncarriage return (`U+000D`)." -- "[Unicode whitespace](@) is a sequence of one or more" -- "[Unicode whitespace characters]." +- "A [Unicode whitespace character](@) is a character in the Unicode" +- "`Zs` general\ncategory, or a tab (" +- "`U+0009`), line feed (`U+000A`" +- "), form feed (`U+000C`), or\ncarriage return (" +- "`U+000D`)." +- "[Unicode whitespace](@) is a sequence of one or more\n[" +- "Unicode whitespace characters]." - "A [tab](@) is `U+0009`." - "A [space](@) is `U+0020`." -- "An [ASCII control character](@) is a character between `U+" -- "0000–1F` (both\nincluding) or" +- "An [ASCII control character](@) is a character between" +- "`U+0000–1F` (both\nincluding) or" - "`U+007F`." -- "An [ASCII punctuation character](@)\nis `!" -- "`, `\"`, `#`, `$`, `%`, `&`, `'`, `(" -- "`, `)`,\n`*`, `+`, `,`, `-`, `.`," -- "`/` (U+0021–2F), \n`:`," -- "`;`, `<`, `=`, `>`, `?`, `@`" +- "An [ASCII punctuation character](@)\nis `!`, `\"`," +- "`#`, `$`, `%`, `&`, `'`, `(`," +- "`)`,\n`*`, `+`, `,`, `-`, `.`, `/`" +- " (U+0021–2F), \n`:`, `;`," +- "`<`, `=`, `>`, `?`, `@`" - " (U+003A–0040),\n`[`, `\\`," -- "`]`, `^`, `_`, `` ` `` (U+005B–" -- "0060), \n`{`, `|`, `}`, or `~`" -- (U+007B–007E). -- "A [Unicode punctuation character](@) is a character in the Unicode `P" -- "`\n(puncuation) or `S` (symbol) general categories." +- "`]`, `^`, `_`, `` ` ``" +- " (U+005B–0060), \n`{`, `|`," +- "`}`, or `~` (U+007B–007E)." +- "A [Unicode punctuation character](@) is a character in the Unicode" +- "`P`\n(puncuation) or `S`" +- (symbol) general categories. - "## Tabs" - "Tabs in lines are not expanded to [spaces]. However," - "in contexts where spaces help to define block structure," @@ -274,8 +288,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "Normally the `>` that begins a block quote may be followed" - "optionally by a space, which is not considered part of the" -- "content. In the following case `>`" -- "is followed by a tab," +- "content. In the following case `>` is followed by a tab," - which is treated as if it were expanded into three spaces. - "Since one of these spaces is considered part of the\ndelimiter, `foo`" - is considered to be indented six spaces @@ -401,13 +414,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Entity and numeric character references" - Valid HTML entity references and numeric character references - "can be used in place of the corresponding Unicode character,\nwith the following exceptions:" -- "- Entity and character references are not recognized in code" -- blocks and code spans. -- "- Entity and character references cannot stand in place of" +- "-" +- "Entity and character references are not recognized in code\n blocks and code spans." +- "-" +- Entity and character references cannot stand in place of - special characters that define structural elements in -- "CommonMark. For example, although `*`" -- " can be used\n in place of a literal `*` character," -- "`*` cannot replace\n `*`" +- "CommonMark. For example, although `*` can be used" +- "in place of a literal `*` character, `*`" +- " cannot replace\n `*`" - " in emphasis delimiters, bullet list markers, or thematic\n breaks." - Conforming CommonMark parsers need not store information about - whether a particular character was represented in the source @@ -426,23 +440,22 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

      & © Æ Ď" - "¾ ℋ ⅆ\n∲ ≧̸

    " - "````````````````````````````````" -- "[Decimal numeric character\nreferences](@)" -- "consist of `&#` + a string of 1--7 arabic" -- "digits + `;`" -- ". A\nnumeric character reference is parsed as the corresponding" +- "[Decimal numeric character\nreferences](@)\nconsist of `&#`" +- "+ a string of 1--7 arabic digits + `;`. A" +- numeric character reference is parsed as the corresponding - Unicode character. Invalid Unicode code points will be replaced by -- "the REPLACEMENT CHARACTER (`U+FFFD`" -- "). For security reasons,\nthe code point `U+0000`" -- "will also be replaced by `U+FFFD`." +- "the REPLACEMENT CHARACTER (`U+FFFD`). For security reasons," +- "the code point `U+0000` will also be replaced by" +- "`U+FFFD`." - "````````````````````````````````" - example - "# Ӓ Ϡ �\n." - "

    # Ӓ Ϡ �

    " - "````````````````````````````````" - "[Hexadecimal numeric character\nreferences](@) consist of `&#` +" -- "either `X` or `x` + a string of 1-6" -- "hexadecimal digits + `;`" -- ".\nThey too are parsed as the corresponding Unicode character (this" +- "either `X` or `x`" +- "+ a string of 1-6 hexadecimal digits + `;`." +- They too are parsed as the corresponding Unicode character (this - time specified with a hexadecimal numeral instead of decimal). - "````````````````````````````````" - example @@ -460,8 +473,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "&ThisIsNotDefined; &hi?;

    " - "````````````````````````````````" - Although HTML5 does accept some entity references -- "without a trailing semicolon (such as `©`" -- "), these are not\nrecognized here, because it makes the grammar too ambiguous:" +- "without a trailing semicolon (such as `©`), these are not" +- "recognized here, because it makes the grammar too ambiguous:" - "````````````````````````````````" - "example\n©\n.\n

    &copy

    " - "````````````````````````````````" @@ -540,18 +553,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    [a](url "tit")

    " - "````````````````````````````````" - "# Blocks and inlines" -- We can think of a document as a sequence of -- "[blocks](@)" +- "We can think of a document as a sequence of\n[blocks](@)" - "---structural elements like paragraphs, block" - "quotations, lists, headings, rules, and code blocks." - Some blocks (like - block quotes and list items) contain other blocks; others (like -- "headings and paragraphs) contain [inline](@)" -- "content---text," +- "headings and paragraphs) contain [inline](@) content---text," - "links, emphasized text, images, code spans, and so on." - "## Precedence" -- "Indicators of block structure always take precedence over indicators\nof inline structure." -- "So, for example, the following is a list with" +- Indicators of block structure always take precedence over indicators +- "of inline structure. So, for example, the following is a list with" - "two items, not a list with one item containing a code span:" - "````````````````````````````````" - example @@ -575,8 +586,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Markdown document. - "## Thematic breaks" - "A line consisting of optionally up to three spaces of indentation, followed by a" -- "sequence of three or more matching `-`, `_`, or `*` characters," -- "each followed\noptionally by any number of spaces or tabs, forms a" +- "sequence of three or more matching `-`, `_`, or `*`" +- "characters, each followed" +- "optionally by any number of spaces or tabs, forms a" - "[thematic break](@)." - "````````````````````````````````" - "example\n***\n---\n___\n.\n
    \n
    \n
    " @@ -672,9 +684,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## ATX headings" - "An [ATX heading](@)" - "consists of a string of characters, parsed as inline content, between an" -- "opening sequence of 1--6 unescaped `#`" -- " characters and an optional\nclosing sequence of any number of unescaped `#`" -- " characters.\nThe opening sequence of `#`" +- "opening sequence of 1--6 unescaped `#` characters and an optional" +- "closing sequence of any number of unescaped `#` characters." +- "The opening sequence of `#`" - "characters must be followed by spaces or tabs, or" - "by the end of line. The optional closing sequence of `#`" - s must be preceded by @@ -804,9 +816,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "interpretable as a [code fence], [ATX heading][ATX headings" - "],\n[block quote][block quotes], [thematic break][thematic breaks]," - "[list item][list items], or [HTML block][HTML blocks]." -- "A [setext heading underline](@) is a sequence of" -- "`=` characters or a sequence of `-` characters, with no more than" -- "3\nspaces of indentation and any number of trailing spaces or tabs." +- "A [setext heading underline](@) is a sequence of\n`=`" +- "characters or a sequence of `-` characters, with no more than 3" +- spaces of indentation and any number of trailing spaces or tabs. - "The heading is a level 1 heading if `=` characters are used in" - "the [setext heading underline], and a level 2 heading if `-`" - characters are used. The contents of the heading are the result @@ -888,8 +900,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    <a title="a lot

    " - "

    of dashes"/>

    " - "````````````````````````````````" -- "The setext heading underline cannot be a [lazy continuation" -- "line] in a list item or block quote:" +- "The setext heading underline cannot be a [lazy continuation\nline]" +- "in a list item or block quote:" - "````````````````````````````````" - example - "> Foo\n---\n.\n
    \n

    Foo

    " @@ -945,8 +957,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "> foo\n-----\n.\n
    \n

    foo

    " - "
    \n
    " - "````````````````````````````````" -- "If you want a heading with `> foo` as its literal text, you" -- "can\nuse backslash escapes:" +- "If you want a heading with `> foo`" +- " as its literal text, you can\nuse backslash escapes:" - "````````````````````````````````" - example - "\\> foo\n------\n.\n

    > foo

    " @@ -975,8 +987,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Foo\nbar\n\n---\n\nbaz\n.\n

    Foo\nbar

    " - "
    \n

    baz

    " - "````````````````````````````````" -- "or use a thematic break that cannot count as a [setext heading" -- "underline], such as" +- "or use a thematic break that cannot count as a [setext heading\nunderline" +- "], such as" - "````````````````````````````````" - example - "Foo\nbar\n* * *\nbaz\n.\n

    Foo" @@ -991,8 +1003,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Indented code blocks" - "An [indented code block](@) is composed of one or more" - "[indented chunks] separated by blank lines.\nAn" -- "[indented chunk](@)" -- "is a sequence of non-blank lines," +- "[indented chunk](@) is a sequence of non-blank lines," - each preceded by four or more spaces of indentation. The contents of the code - "block are the literal contents of the lines, including trailing\n[line endings]" - ", minus four spaces of indentation.\nAn indented code block has no [" @@ -1083,21 +1094,21 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "## Fenced code blocks" - "A [code fence](@) is a sequence" -- "of at least three consecutive backtick characters (`` ` ``" -- ") or\ntildes (`~`" +- "of at least three consecutive backtick characters (`` ` ``) or" +- "tildes (`~`" - "). (Tildes and backticks cannot be mixed.)\nA" - "[fenced code block](@)" - "begins with a code fence, preceded by up to three spaces of indentation." - The line with the opening code fence may optionally contain some text - following the code fence; this is trimmed of leading and trailing -- "spaces or tabs and called the [info string](@)" -- ". If the [info string] comes" +- "spaces or tabs and called the [info string](@). If the [" +- "info string] comes" - "after a backtick fence, it may not contain any backtick" - characters. (The reason for this restriction is that otherwise - some inline code would be incorrectly interpreted as the - beginning of a fenced code block.) -- "The content of the code block consists of all subsequent lines, until" -- "a closing [code fence] of the same type as the code block" +- "The content of the code block consists of all subsequent lines, until\na closing" +- "[code fence] of the same type as the code block" - "began with (backticks or tildes), and with at least as" - many backticks - or tildes as the opening code fence. @@ -1166,9 +1177,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "~~~~\naaa\n~~~\n~~~~\n.\n

    aaa"
     - "~~~\n
    " - "````````````````````````````````" -- Unclosed code blocks are closed by the end of the document -- "(or the enclosing [block quote][block quotes] or [list item][list" -- "items]):" +- "Unclosed code blocks are closed by the end of the document\n(or the enclosing" +- "[block quote][block quotes] or [list item][list items]):" - "````````````````````````````````" - "example\n```\n.\n
    " - "````````````````````````````````" @@ -1288,8 +1298,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "``` aa ```\nfoo\n.\n

    aa" - foo

    - "````````````````````````````````" -- "[Info strings] for tilde code blocks can contain backticks and" -- "tildes:" +- "[Info strings]" +- "for tilde code blocks can contain backticks and tildes:" - "````````````````````````````````" - example - "~~~ aa ``` ~~~\nfoo\n~~~\n." @@ -1306,30 +1316,35 @@ input_file: tests/inputs/markdown/commonmark_spec.md - as raw HTML (and will not be escaped in HTML output). - "There are seven kinds of [HTML block], which can be defined by their" - start and end conditions. The block begins with a line that meets a -- "[start condition](@)" -- (after up to three optional spaces of indentation). +- "[start condition](@) (after up to three optional spaces of indentation)." - It ends with the first subsequent line that meets a matching -- "[end condition](@), or the last line of the document, or the last" -- "line of\nthe [container block](#container-blocks)" -- " containing the current HTML\nblock, if no line is encountered that meets the [" -- "end condition]. If\nthe first line meets both the [start condition]" -- " and the [end\ncondition], the block will contain just that line." -- "1. **Start condition:** line begins with the string ``" -- ", or the end of the line.\\\n**End condition:**" +- "[end condition](@)" +- ", or the last line of the document, or the last line of\nthe" +- "[container block](#container-blocks) containing the current HTML" +- "block, if no line is encountered that meets the [end condition]. If" +- "the first line meets both the [start condition] and the [end\ncondition" +- "], the block will contain just that line." +- "1." +- "**Start condition:** line begins with the string ``, or the end of the line.\\\n**End condition:**" - " line contains an end tag\n`
    `, ``" -- ", ``, or `` (case-insensitive;" -- "it\nneed not match the start tag)." -- "2. **Start condition:** line begins with the string ``." -- "3. **Start condition:** line begins with the string ``." -- "4. **Start condition:** line begins with the string ``." -- "5. **Start condition:** line begins with the string\n``." +- "5." +- "**Start condition:** line begins with the string\n``." - "6." - "**Start condition:** line begins with the string `<` or ``, or\nthe string `/>`.\\\n**End condition:**" +- "`title`, `tr`, `track`, `ul`, followed" +- "by a space, a tab, the end of the line, the string" +- "`>`, or\nthe string `/>`.\\\n**End condition:**" - "line is followed by a [blank line]." - "7." -- "**Start condition:** line begins with a complete [open tag]" -- "(with any [tag name] other than `pre`, `script`," -- "`style`, or `textarea`" -- ") or a complete [closing tag]," +- "**Start condition:** line begins with a complete [open tag]\n(with any" +- "[tag name] other than `pre`, `script`,\n`style`, or" +- "`textarea`) or a complete [closing tag]," - "followed by zero or more spaces and tabs, followed by the end of the" - "line.\\\n**End condition:** line is followed by a [blank line]." -- HTML blocks continue until they are closed by their appropriate -- "[end condition], or the last line of the document or other" +- "HTML blocks continue until they are closed by their appropriate\n[end condition]" +- ", or the last line of the document or other" - "[container\nblock](#container-blocks). This means any HTML" - "**within an HTML\nblock**" - that might otherwise be recognised as a start condition will - "be ignored by the parser and passed through as-is, without changing\nthe parser" - "'s state." -- "For instance, `
    ` within an HTML block started by `` will"
    -- not affect
    +- "For instance, `
    ` within an HTML block started by `
    `" +- will not affect - the parser state; as the HTML block was started in by start condition 6 - ", it\nwill end at any blank line. This can be surprising:" - "````````````````````````````````" @@ -1376,8 +1389,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    world.\n

    " - "
    " - "````````````````````````````````" -- "In this case, the HTML block is terminated by the blank line — the `" -- "**Hello**`" +- "In this case, the HTML block is terminated by the blank line — the" +- "`**Hello**`" - "text remains verbatim — and regular parsing resumes, with a paragraph," - "emphasised `world` and inline and block HTML following." - "All types of [HTML blocks] except type 7 may interrupt" @@ -1466,8 +1479,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n``` c\nint x = 33;\n```\n." - "
    \n``` c\nint x = 33;\n```" - "````````````````````````````````" -- "To start an [HTML block] with a tag that is *not* in" -- the +- "To start an [HTML block] with a tag that is *not*" +- in the - "list of block-level tags in (6), you must put the tag by" - "itself on the first line (and it must be complete):" - "````````````````````````````````" @@ -1500,18 +1513,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n*foo*\n\n.\n\n*foo*" - "" - "````````````````````````````````" -- "In this case, we get a raw HTML block that just includes" -- "the ``" -- tag (because it ends with the following blank +- "In this case, we get a raw HTML block that just includes\nthe" +- "`` tag (because it ends with the following blank" - "line). So the contents get interpreted as CommonMark:" - "````````````````````````````````" - example - "\n\n*foo*\n\n\n.\n" - "

    foo

    \n
    " - "````````````````````````````````" -- "Finally, in this case, the `` tags are interpreted" -- "as [raw HTML] *inside*" -- the CommonMark paragraph. (Because +- "Finally, in this case, the `` tags are interpreted\nas [" +- "raw HTML] *inside* the CommonMark paragraph. (Because" - "the tag is not on a line by itself, we get inline HTML" - "rather than an [HTML block].)" - "````````````````````````````````" @@ -1519,9 +1530,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo*\n." - "

    foo

    " - "````````````````````````````````" -- HTML tags designed to contain literal content -- "(`pre`, `script`, `style`, `textarea`), comments, processing" -- "instructions,\nand declarations are treated somewhat differently." +- "HTML tags designed to contain literal content\n(`pre`, `script`," +- "`style`, `textarea`), comments, processing instructions," +- and declarations are treated somewhat differently. - "Instead of ending at the first blank line, these blocks" - end at the first line containing a corresponding end tag. - "As a result, these blocks can contain blank lines:" @@ -1635,8 +1646,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n\n
    \n.\n
    " - "
    <div>\n
    " - "````````````````````````````````" -- "An HTML block of types 1--6 can interrupt a paragraph, and need" -- "not be\npreceded by a blank line." +- An HTML block of types 1-- +- "6 can interrupt a paragraph, and need not be" +- preceded by a blank line. - "````````````````````````````````" - example - "Foo\n
    \nbar\n
    \n.\n

    Foo

    " @@ -1658,9 +1670,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "This rule differs from John Gruber's original Markdown syntax" - "specification, which says:" -- "> The only restrictions are that block-level HTML elements —" -- "> e.g. `
    `, ``, `
    `,"
    -- "`

    `" +- ">" +- "The only restrictions are that block-level HTML elements —\n> e.g." +- "`

    `, `
    `, `
    `, `

    `" - ", etc. — must be separated from\n>" - "surrounding content by blank lines, and the start and end tags of the" - "> block should not be indented with spaces or tabs." @@ -1693,14 +1705,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - Some Markdown implementations have adopted a convention of - "interpreting content inside tags as text if the open tag has\nthe attribute" -- "`markdown=1`" -- ". The rule given above seems a simpler and" +- "`markdown=1`. The rule given above seems a simpler and" - "more elegant way of achieving the same expressive power, which is also" - much simpler to parse. - The main potential drawback is that one can no longer paste HTML - "blocks into Markdown documents with 100% reliability. However," -- "*in most cases*" -- "this will work fine, because the blank lines in" +- "*in most cases* this will work fine, because the blank lines in" - "HTML are usually followed by HTML block tags. For example:" - "````````````````````````````````" - example @@ -1709,8 +1719,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "\n

    " - "````````````````````````````````" - "There are problems, however, if the inner tags are indented" -- "*and*" -- "separated by spaces, as then they will be interpreted as" +- "*and* separated by spaces, as then they will be interpreted as" - "an indented code block:" - "````````````````````````````````" - example @@ -1720,19 +1729,18 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "</td>\n
    \n " - "" - "````````````````````````````````" -- "Fortunately, blank lines are usually not necessary and can be\ndeleted." -- "The exception is inside `
    `"
    -- " tags, but as described\n[above][HTML blocks]"
    -- ", raw HTML blocks starting with `
    `\n*can* contain blank lines."
    +- "Fortunately, blank lines are usually not necessary and can be"
    +- "deleted.  The exception is inside `
    ` tags, but as described"
    +- "[above][HTML blocks], raw HTML blocks starting with `
    `"
    +- "*can* contain blank lines."
     - "## Link reference definitions"
    -- "A [link reference definition](@)"
    -- "consists of a [link label], optionally preceded by up to three spaces of"
    -- "indentation, followed\nby a colon (`:`"
    -- "), optional spaces or tabs (including up to one\n[line ending]), a ["
    -- "link destination],\noptional spaces or tabs (including up to one\n[line ending]"
    -- "), and an optional [link\ntitle]"
    -- ", which if it is present must be separated\nfrom the [link destination]"
    -- " by spaces or tabs.\nNo further character may occur."
    +- "A [link reference definition](@)\nconsists of a [link label]"
    +- ", optionally preceded by up to three spaces of\nindentation, followed"
    +- "by a colon (`:`), optional spaces or tabs (including up to one"
    +- "[line ending]), a [link destination],"
    +- "optional spaces or tabs (including up to one\n[line ending]), and an optional"
    +- "[link\ntitle], which if it is present must be separated\nfrom the"
    +- "[link destination] by spaces or tabs.\nNo further character may occur."
     - "A [link reference definition]"
     - "does not correspond to a structural element of a document.  Instead, it"
     - "defines a label which can be used in [reference links]\nand reference-style ["
    @@ -1926,8 +1934,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "````````````````````````````````"
     - "## Paragraphs"
     - A sequence of non-blank lines that cannot be interpreted as other
    -- "kinds of blocks forms a [paragraph](@)"
    -- ".\nThe contents of the paragraph are the result of parsing the\nparagraph'"
    +- "kinds of blocks forms a [paragraph](@)."
    +- "The contents of the paragraph are the result of parsing the\nparagraph'"
     - "s raw content as inlines.  The paragraph's raw content"
     - is formed by concatenating the lines and removing initial and final
     - "spaces or tabs.\n\nA simple example with two paragraphs:"
    @@ -1993,43 +2001,43 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "[list items]."
     - We define the syntax for container blocks recursively.  The general
     - "form of the definition is:"
    -- "> If X is a sequence of blocks, then the result of"
    -- "> transforming X in such-and-such a way is a container of type Y"
    +- ">"
    +- "If X is a sequence of blocks, then the result of\n>"
    +- transforming X in such-and-such a way is a container of type Y
     - "> with these blocks as its content."
     - "So, we explain what counts as a block quote or list item by explaining"
    -- how these can be *generated*
    -- from their contents. This should suffice
    -- "to define the syntax, although it does not give a recipe for *parsing"
    -- "*\nthese constructions.  (A recipe is provided below in the section entitled"
    +- how these can be *generated* from their contents. This should suffice
    +- "to define the syntax, although it does not give a recipe for"
    +- "*parsing*"
    +- these constructions.  (A recipe is provided below in the section entitled
     - "[A parsing strategy](#appendix-a-parsing-strategy).)"
     - "## Block quotes"
     - "A [block quote marker](@),"
     - "optionally preceded by up to three spaces of indentation,"
    -- "consists of (a) the character `>`"
    -- together with a following space of
    -- "indentation, or (b) a single character `>` not followed by a"
    -- "space of\nindentation.\n\nThe following rules define [block quotes]:"
    -- 1.  **Basic case.
    -- "**  If a string of lines *Ls*"
    +- "consists of (a) the character `>` together with a following space of"
    +- "indentation, or (b) a single character `>`"
    +- " not followed by a space of\nindentation."
    +- "The following rules define [block quotes]:"
    +- "1."
    +- "**Basic case.**  If a string of lines *Ls*"
     - " constitute a sequence\n    of blocks *Bs*"
     - ", then the result of prepending a [block quote\n    marker]"
     - " to the beginning of each line in *Ls*\n    is a"
     - "[block quote](#block-quotes) containing *Bs*."
    -- 2.  **Laziness.
    -- "**  If a string of lines *Ls* constitute a"
    -- "[block\n    quote](#block-quotes) with contents *Bs*"
    -- ", then the result of deleting\n    the initial [block quote marker]"
    -- from one or
    +- "2."
    +- "**Laziness.**  If a string of lines *Ls*"
    +- " constitute a [block\n    quote](#block-quotes) with contents"
    +- "*Bs*, then the result of deleting\n    the initial [block quote marker"
    +- "] from one or"
     - more lines in which the next character other than a space or tab after the
     - "[block quote marker] is [paragraph continuation\n    text]"
     - is a block quote with *Bs* as its content.
    -- "[Paragraph continuation text](@)"
    -- is text
    +- "[Paragraph continuation text](@) is text"
     - "that will be parsed as part of the content of a paragraph, but does"
     - not occur at the beginning of the paragraph.
    -- 3.  **Consecutiveness.
    -- "**  A document cannot contain two [block\n    quotes]"
    -- "in a row unless there is a [blank line] between them."
    +- "3."
    +- "**Consecutiveness.**  A document cannot contain two [block"
    +- "quotes] in a row unless there is a [blank line] between them."
     - "Nothing else counts as a [block quote](#block-quotes)."
     - "Here is a simple example:"
     - "````````````````````````````````"
    @@ -2059,8 +2067,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md
     - "
    > # Foo\n> bar\n> baz"
     - "
    " - "````````````````````````````````" -- "The Laziness clause allows us to omit the `>` before" -- "[paragraph continuation text]:" +- "The Laziness clause allows us to omit the `>` before\n[" +- "paragraph continuation text]:" - "````````````````````````````````" - example - "> # Foo\n> bar\nbaz\n.\n
    " @@ -2106,8 +2114,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n
    " - "

    foo

    \n
    " - "````````````````````````````````" -- "Note that in the following case, we have a [lazy" -- "continuation line]:" +- "Note that in the following case, we have a [lazy\ncontinuation line" +- "]:" - "````````````````````````````````" - example - "> foo\n - bar\n.\n
    \n

    foo" @@ -2115,8 +2123,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "To see why, note that in" - "```markdown\n> foo\n> - bar\n```" -- "the `- bar` is indented too far to start a list, and" -- "can't\nbe an indented code block because indented code blocks cannot" +- "the `- bar` is indented too far to start a list, and can" +- "'t\nbe an indented code block because indented code blocks cannot" - "interrupt paragraphs, so it is [paragraph continuation text]." - "A block quote can be empty:" - "````````````````````````````````" @@ -2137,9 +2145,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "> foo\n\n> bar\n.\n

    \n

    foo

    " - "
    \n
    \n

    bar

    \n
    " - "````````````````````````````````" -- "(Most current Markdown implementations, including John Gruber's" -- "original `Markdown.pl`" -- ", will parse this example as a single block quote" +- "(Most current Markdown implementations, including John Gruber's\noriginal" +- "`Markdown.pl`, will parse this example as a single block quote" - with two paragraphs. But it seems better to allow the author to decide - whether two block quotes or one are wanted.) - "Consecutiveness means that if we put these block quotes together," @@ -2185,9 +2192,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "> bar\n>\nbaz\n.\n
    \n

    bar

    " - "
    \n

    baz

    " - "````````````````````````````````" -- It is a consequence of the Laziness rule that any number -- "of initial `>`" -- "s may be omitted on a continuation line of a\nnested block quote:" +- "It is a consequence of the Laziness rule that any number\nof initial" +- "`>`s may be omitted on a continuation line of a" +- "nested block quote:" - "````````````````````````````````" - example - "> > > foo\nbar\n.\n
    \n
    " @@ -2200,10 +2207,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n

    foo\nbar\nbaz

    \n
    " - "
    \n
    " - "````````````````````````````````" -- "When including an indented code block in a block quote," -- "remember that the [block quote marker] includes\nboth the `>`" -- and a following space of indentation. So *five spaces* -- " are needed\nafter the `>`:" +- "When including an indented code block in a block quote,\nremember that the [" +- "block quote marker] includes\nboth the `>`" +- and a following space of indentation. So *five spaces* are needed +- "after the `>`:" - "````````````````````````````````" - example - "> code\n\n> not code\n.\n
    " @@ -2211,38 +2218,35 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    \n

    not code

    \n
    " - "````````````````````````````````" - "## List items" -- "A [list marker](@) is a" -- "[bullet list marker] or an [ordered list marker]." -- "A [bullet list marker](@)" -- "is a `-`, `+`, or `*` character." -- "An [ordered list marker](@)" -- "is a sequence of 1--9 arabic digits (`0-9`)" -- ", followed by either a\n`.` character or a `)`" -- character. (The reason for the length +- "A [list marker](@) is a\n[bullet list marker]" +- "or an [ordered list marker]." +- "A [bullet list marker](@)\nis a `-`, `+`, or" +- "`*` character." +- "An [ordered list marker](@)\nis a sequence of 1--" +- "9 arabic digits (`0-9`), followed by either a\n`.`" +- "character or a `)` character. (The reason for the length" - limit is that with 10 digits we start seeing integer overflows - "in some browsers.)\n\nThe following rules define [list items]:" -- 1. **Basic case. -- "** If a sequence of lines *Ls*" +- "1." +- "**Basic case.** If a sequence of lines *Ls*" - " constitute a sequence of\n blocks *Bs*" -- "starting with a character other than a space or tab, and *M*" -- " is\n a list marker of width *W* followed by 1 ≤" -- "*N* ≤ 4 spaces of indentation,\n then the result of prepending" +- "starting with a character other than a space or tab, and *M* is" +- a list marker of width *W* followed by 1 ≤ *N* +- " ≤ 4 spaces of indentation,\n then the result of prepending" - "*M* and the following spaces to the first line\n of" -- "*Ls*, and indenting subsequent lines of *Ls* by *W" -- "+ N* spaces, is a\n list item with *Bs*" -- as its contents. The type of the list item +- "*Ls*, and indenting subsequent lines of *Ls* by" +- "*W + N* spaces, is a\n list item with" +- "*Bs* as its contents. The type of the list item" - (bullet or ordered) is determined by the type of its list marker. - "If the list item is ordered, then it is also assigned a start" - "number, based on the ordered list marker.\n\n Exceptions:" - "1. When the first list item in a [list] interrupts" - "a paragraph---that is, when it starts on a line that would" - "otherwise count as [paragraph continuation text]---then (a)\n the lines" -- "*Ls*" -- "must not begin with a blank line, and (b) if" +- "*Ls* must not begin with a blank line, and (b) if" - "the list item is ordered, the start number must be 1." -- "2." -- "If any line is a [thematic break][thematic breaks] then" -- that line is not a list item. +- "2. If any line is a [thematic break][thematic breaks]" +- " then\n that line is not a list item." - "For example, let *Ls* be the lines" - "````````````````````````````````" - example @@ -2308,14 +2312,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    one

    \n

    two

    \n" - "\n
    \n
    " - "````````````````````````````````" -- "Here `two` occurs in the same column as the list marker `1." -- "`,\nbut is actually contained in the list item, because there is" +- "Here `two` occurs in the same column as the list marker `1.`" +- ",\nbut is actually contained in the list item, because there is" - sufficient indentation after the last containing blockquote marker. -- The converse is also possible. -- "In the following example, the word `two`" +- "The converse is also possible. In the following example, the word" +- "`two`" - "occurs far to the right of the initial text of the list item," -- "`one`" -- ", but" +- "`one`, but" - "it is not considered part of the list item, because it is not indented" - "far enough past the blockquote marker:" - "````````````````````````````````" @@ -2383,16 +2386,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - example - "-1. not ok\n.\n

    -1. not ok

    " - "````````````````````````````````" -- 2. **Item starting with indented code. -- "** If a sequence of lines *Ls*" -- constitute a sequence of blocks *Bs* -- " starting with an indented code\n block, and *M*" -- is a list marker of width *W* -- " followed by\n one space of indentation, then the result of prepending" -- "*M* and the\n following space to the first line of" -- "*Ls*, and indenting subsequent lines\n of *Ls* by" -- "*W + 1* spaces, is a list item with *Bs*" -- as its contents. +- 2. **Item starting with indented code.** +- " If a sequence of lines *Ls*\n constitute a sequence of blocks" +- "*Bs* starting with an indented code\n block, and" +- "*M* is a list marker of width *W* followed by" +- "one space of indentation, then the result of prepending *M* and the" +- "following space to the first line of *Ls*, and indenting subsequent lines" +- of *Ls* by *W + 1* +- "spaces, is a list item with *Bs* as its contents." - "If a line is empty, then it need not be indented." - The type of the - list item (bullet or ordered) is determined by the type of its list @@ -2415,8 +2416,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    bar\n
    \n" - "" - "````````````````````````````````" -- If the *first* block in the list item is an indented code block -- ",\nthen by rule #2, the contents must be preceded by *one*" +- If the *first* +- "block in the list item is an indented code block," +- "then by rule #2, the contents must be preceded by *one*" - " space of indentation\nafter the list marker:" - "````````````````````````````````" - example @@ -2471,15 +2473,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo

    \n

    bar

    \n" - "" - "````````````````````````````````" -- 3. **Item starting with a blank line. -- "** If a sequence of lines *Ls*" -- "starting with a single [blank line] constitute a (possibly empty)" -- "sequence of blocks *Bs*, and *M* is a list marker of width" -- "*W*,\n then the result of prepending *M*" -- " to the first line of *Ls*, and\n preceding subsequent lines of" -- "*Ls* by *W + 1*" -- " spaces of indentation, is a\n list item with *Bs*" -- as its contents. +- 3. **Item starting with a blank line.** +- " If a sequence of lines *Ls*\n starting with a single [" +- "blank line] constitute a (possibly empty)\n sequence of blocks *Bs*" +- ", and *M* is a list marker of width *W*," +- then the result of prepending *M* to the first line of +- "*Ls*, and\n preceding subsequent lines of *Ls* by" +- "*W + 1* spaces of indentation, is a" +- list item with *Bs* as its contents. - "If a line is empty, then it need not be indented." - The type of the - list item (bullet or ordered) is determined by the type of its list @@ -2501,8 +2502,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "" - "````````````````````````````````" - A list item can begin with at most one blank line. -- "In the following example, `foo`" -- " is not part of the list\nitem:" +- "In the following example, `foo` is not part of the list" +- "item:" - "````````````````````````````````" - example - "-\n\n foo\n.\n
      \n
    • \n
    " @@ -2538,9 +2539,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo\n*\n\nfoo\n1.\n.\n

    foo\n*

    " - "

    foo\n1.

    " - "````````````````````````````````" -- 4. **Indentation. -- "** If a sequence of lines *Ls*" -- constitutes a list item +- 4. **Indentation.** If a sequence of lines +- "*Ls* constitutes a list item" - "according to rule #1, #2, or #3, then the result" - "of preceding each line\n of *Ls*" - by up to three spaces of indentation (the same for each line) also @@ -2582,10 +2582,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " indented code\n\n > A block quote." - "
    " - "````````````````````````````````" -- 5. **Laziness. -- "** If a string of lines *Ls* constitute a" -- "[list\n item](#list-items) with contents *Bs*" -- ", then the result of deleting" +- 5. **Laziness.** If a string of lines +- "*Ls* constitute a [list\n item](#list-items)" +- "with contents *Bs*, then the result of deleting" - some or all of the indentation from one or more lines in which the - "next character other than a space or tab after the indentation is\n [" - "paragraph continuation text] is a" @@ -2621,11 +2620,11 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "continued here.

    \n\n\n" - "" - "````````````````````````````````" -- "6. **That's all." -- "** Nothing that is not counted as a list item by rules\n #1" -- "--5 counts as a [list item](#list-items)." -- The rules for sublists follow from the general rules -- "[above][List items]. A sublist must be indented the same number" +- "6. **That's all.**" +- " Nothing that is not counted as a list item by rules\n #1--" +- "5 counts as a [list item](#list-items)." +- "The rules for sublists follow from the general rules\n[above][List items" +- "]. A sublist must be indented the same number" - of spaces of indentation a paragraph would need to be in order to be included - "in the list item.\n\nSo, in this case we need two spaces indent:" - "````````````````````````````````" @@ -2685,10 +2684,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "2." - "\"To make lists look nice, you can wrap items with hanging indents...." - "But if you don't want to, you don't have to.\"" -- "3. \"List items may consist of multiple paragraphs. Each subsequent" +- "3." +- "\"List items may consist of multiple paragraphs. Each subsequent" - paragraph in a list item must be indented by either 4 spaces or one - "tab.\"" -- "4. \"It looks nice if you indent every line of the subsequent paragraphs," +- "4." +- "\"It looks nice if you indent every line of the subsequent paragraphs," - "but here again, Markdown will allow you to be lazy.\"" - "5." - "\"To put a blockquote within a list item, the blockquote's `>`" @@ -2704,15 +2705,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "that a block quote must be indented, but not by how much; however" - ", the\nexample given has four spaces indentation. Although nothing is said" - "about other kinds of block-level content, it is certainly reasonable to\ninfer that" -- "*all*" -- "block elements under a list item, including other" +- "*all* block elements under a list item, including other" - "lists, must be indented four spaces. This principle has been called the" - "*four-space rule*." -- "The four-space rule is clear and principled, and if the reference" -- "implementation `Markdown.pl`" -- "had followed it, it probably would have" -- "become the standard. However, `Markdown.pl`" -- allowed paragraphs and +- "The four-space rule is clear and principled, and if the reference\nimplementation" +- "`Markdown.pl` had followed it, it probably would have" +- "become the standard. However, `Markdown.pl` allowed paragraphs and" - "sublists to start with only two spaces indentation, at least on the" - "outer level. Worse, its behavior was inconsistent: a sublist of an" - "outer-level list needed two spaces indentation, but a sublist of this" @@ -2727,8 +2725,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - is no way to give a spec for list items that will be guaranteed not - "to break any existing documents. However, the spec given here should" - "correctly handle lists formatted with either the four-space rule or\nthe more forgiving" -- "`Markdown.pl`" -- "behavior, provided they are laid out" +- "`Markdown.pl` behavior, provided they are laid out" - in a way that is natural for a human to read. - The strategy here is to let the width and indentation of the list marker - determine the indentation necessary for blocks to fall under the list @@ -2743,27 +2740,31 @@ input_file: tests/inputs/markdown/commonmark_spec.md - unnatural. It is quite unintuitive that - "``` markdown\n- foo\n\n bar\n\n - baz\n```" - "should be parsed as two lists with an intervening paragraph," -- "``` html\n
      \n
    • foo
    • \n
    " +- "``` html" +- "
      \n
    • foo
    • \n
    " - "

    bar

    \n
      \n
    • baz
    • " - "
    \n```" - "as the four-space rule demands, rather than a single list," -- "``` html\n
      \n
    • \n

      foo

      " -- "

      bar

      \n
        \n
      • baz
      • " -- "
      \n
    • \n
    \n```" +- "``` html" +- "
      \n
    • \n

      foo

      \n

      bar

      " +- "
        \n
      • baz
      • \n
      \n
    • " +- "
    \n```" - The choice of four spaces is arbitrary. - "It can be learned, but it is" - "not likely to be guessed, and it trips up beginners regularly." - Would it help to adopt a two-space rule? The problem is that such - "a rule, together with the rule allowing up to three spaces of indentation for" -- "the initial list marker, allows text that is indented *less than*" -- " the\noriginal list marker to be included in the list item. For example," +- "the initial list marker, allows text that is indented *less than* the" +- "original list marker to be included in the list item. For example," - "`Markdown.pl` parses" - "``` markdown\n - one\n\n two\n```" - "as a single list item, with `two` a continuation paragraph:" -- "``` html\n
      \n
    • \n

      one

      " -- "

      two

      \n
    • \n
    \n```\n\nand similarly" +- "``` html" +- "
      \n
    • \n

      one

      \n

      two

      " +- "
    • \n
    \n```\n\nand similarly" - "``` markdown\n> - one\n>\n> two\n```\n\nas" -- "``` html\n
    \n
      \n
    • \n

      one

      " +- "``` html" +- "
      \n
        \n
      • \n

        one

        " - "

        two

        \n
      • \n
      \n
      \n```" - This is extremely unintuitive. - "Rather than requiring a fixed indent from the margin, we could require" @@ -2774,8 +2775,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "as a list item with a subparagraph, even though the paragraph `bar`" - "is not indented as far as the first paragraph `foo`:" - "``` markdown\n 10. foo\n\n bar \n```" -- "Arguably this text does read like a list item with `bar` as a" -- "subparagraph," +- "Arguably this text does read like a list item with `bar`" +- "as a subparagraph," - which may count in favor of the proposal. - "However, on this proposal indented" - code would have to be indented six spaces after the list marker. @@ -2795,29 +2796,30 @@ input_file: tests/inputs/markdown/commonmark_spec.md - four-space rule in cases where the list marker plus its initial indentation - "takes four spaces (a common case), but diverge in other cases." - "## Lists" -- "A [list](@) is a sequence of one or more" -- "list items [of the same type]. The list items" +- "A [list](@) is a sequence of one or more\nlist items" +- "[of the same type]. The list items" - may be separated by any number of blank lines. - "Two list items are [of the same type](@)" - "if they begin with a [list marker] of the same type." - Two list markers are of the - same type if (a) they are bullet list markers using the same character -- "(`-`, `+`, or `*`) or (b) they are" -- "ordered list numbers with the same\ndelimiter (either `.` or `)`)." +- "(`-`, `+`, or `*`" +- ) or (b) they are ordered list numbers with the same +- "delimiter (either `.` or `)`)." - "A list is an [ordered list](@)" - "if its constituent list items begin with\n[ordered list markers], and a" -- "[bullet list](@)" -- " if its constituent list\nitems begin with [bullet list markers]." -- "The [start number](@)" -- "of an [ordered list] is determined by the list number of" +- "[bullet list](@) if its constituent list\nitems begin with [" +- "bullet list markers]." +- "The [start number](@)\nof an [ordered list]" +- is determined by the list number of - its initial list item. The numbers of subsequent list items are - disregarded. - "A list is [loose](@) if any of its constituent" - "list items are separated by blank lines, or if any of its constituent" - list items directly contain two block-level elements with a blank line -- "between them. Otherwise a list is [tight](@)" -- ".\n(The difference in HTML output is that paragraphs in a loose list are" -- "wrapped in `

      ` tags, while paragraphs in a tight list are not.)" +- "between them. Otherwise a list is [tight](@)." +- "(The difference in HTML output is that paragraphs in a loose list are\nwrapped in" +- "`

      ` tags, while paragraphs in a tight list are not.)" - "Changing the bullet or ordered list delimiter starts a new list:" - "````````````````````````````````" - example @@ -2841,27 +2843,33 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "`Markdown.pl` does not allow this, through fear of triggering a list" - "via a numeral in a hard-wrapped line:" -- "``` markdown\nThe number of windows in my house is\n14." +- "``` markdown" +- "The number of windows in my house is\n14." - "The number of doors is 6.\n```" -- "Oddly, though, `Markdown.pl` *does* allow a blockquote" -- "to\ninterrupt a paragraph, even though the same considerations might\napply." +- "Oddly, though, `Markdown.pl` *does*" +- " allow a blockquote to\ninterrupt a paragraph, even though the same considerations might" +- apply. - "In CommonMark, we do allow lists to interrupt paragraphs, for" - "two reasons. First, it is natural and not uncommon for people" - "to start lists without blank lines:" -- "``` markdown\nI need to buy\n- new shoes\n- a coat" +- "``` markdown" +- "I need to buy\n- new shoes\n- a coat" - "- a plane ticket\n```\n\nSecond, we are attracted to a" -- "> [principle of uniformity](@):" -- "> if a chunk of text has a certain\n>" +- ">" +- "[principle of uniformity](@):\n>" +- "if a chunk of text has a certain\n>" - "meaning, it will continue to have the same meaning when put into a\n>" - container block (such as a list item or blockquote). - "(Indeed, the spec for [list items] and [block quotes]" -- "presupposes\nthis principle.) This principle implies that if" -- "``` markdown\n * I need to buy\n - new shoes" -- " - a coat\n - a plane ticket\n```" +- " presupposes\nthis principle.) This principle implies that if" +- "``` markdown" +- " * I need to buy\n - new shoes\n - a coat" +- " - a plane ticket\n```" - "is a list item containing a paragraph followed by a nested sublist," - "as all Markdown implementations agree it is (though the paragraph\nmay be rendered without" - "`

      ` tags, since the list is \"tight\"),\nthen" -- "``` markdown\nI need to buy\n- new shoes\n- a coat" +- "``` markdown" +- "I need to buy\n- new shoes\n- a coat" - "- a plane ticket\n```" - by itself should be a paragraph followed by a nested sublist. - Since it is well established Markdown practice to allow lists to @@ -2871,8 +2879,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "takes a different approach, requiring blank lines before lists" - even inside other list items.) - In order to solve the problem of unwanted lists in paragraphs with -- "hard-wrapped numerals, we allow only lists starting with `1`" -- " to\ninterrupt paragraphs. Thus," +- "hard-wrapped numerals, we allow only lists starting with `1` to" +- "interrupt paragraphs. Thus," - "````````````````````````````````" - example - "The number of windows in my house is\n14." @@ -2942,8 +2950,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    • \n" - "````````````````````````````````" - "Note, however, that list items may not be preceded by more than" -- "three spaces of indentation. Here `- e`" -- is treated as a paragraph continuation +- "three spaces of indentation. Here `- e` is treated as a paragraph continuation" - "line, because it is indented more than three spaces:" - "````````````````````````````````" - example @@ -2952,8 +2959,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "
    • b
    • \n
    • c
    • \n
    • d" - "- e
    • \n
    " - "````````````````````````````````" -- "And here, `3. c` is treated as in indented code block" -- ",\nbecause it is indented four spaces and preceded by a\nblank line." +- "And here, `3. c`" +- "is treated as in indented code block," +- "because it is indented four spaces and preceded by a\nblank line." - "````````````````````````````````" - example - "1. a\n\n 2. b\n\n 3. c" @@ -3069,19 +3077,20 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`hi`lo`\n." - "

    hilo`

    " - "````````````````````````````````" -- "`hi` is parsed as code, leaving the backtick at the end as" -- "a literal\nbacktick." +- "`hi`" +- "is parsed as code, leaving the backtick at the end as a literal" +- backtick. - "## Code spans" - "A [backtick string](@)" -- "is a string of one or more backtick characters (`` ` ``) that" -- "is neither\npreceded nor followed by a backtick." +- "is a string of one or more backtick characters (`` ` ``" +- ") that is neither\npreceded nor followed by a backtick." - "A [code span](@) begins with a backtick string and ends with" - a backtick string of equal length. The contents of the code span are - "the characters between these two backtick strings, normalized in the\nfollowing ways:" - "- First, [line endings] are converted to [spaces]." - "- If the resulting string both begins *and* ends with a [space]" -- "character, but does not consist entirely of [space]" -- "characters, a single [space] character is removed from the" +- "character, but does not consist entirely of [space]\n characters, a single" +- "[space] character is removed from the" - front and back. This allows you to include code that begins - "or ends with backtick characters, which must be separated by" - whitespace from the opening or closing backtick strings. @@ -3139,8 +3148,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`foo bar \nbaz`\n." - "

    foo bar baz

    " - "````````````````````````````````" -- Note that browsers will typically collapse consecutive spaces -- "when rendering ``" +- "Note that browsers will typically collapse consecutive spaces\nwhen rendering ``" - " elements, so it is recommended that\nthe following CSS be used:" - "code{white-space: pre-wrap;}" - Note that backslash escapes do not work in code spans. All backslashes @@ -3150,9 +3158,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "`foo\\`bar`\n." - "

    foo\\bar`

    " - "````````````````````````````````" -- "Backslash escapes are never needed, because one can always choose a" -- string of *n* -- "backtick characters as delimiters, where the code does" +- "Backslash escapes are never needed, because one can always choose a\nstring of" +- "*n* backtick characters as delimiters, where the code does" - not contain any strings of exactly *n* backtick characters. - "````````````````````````````````" - example @@ -3167,8 +3174,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Code span backticks have higher precedence than any other inline - constructs except HTML tags and autolinks. - "Thus, for example, this is" -- "not parsed as emphasized text, since the second `*` is part of a" -- "code\nspan:" +- "not parsed as emphasized text, since the second `*`" +- " is part of a code\nspan:" - "````````````````````````````````" - example - "*foo`*`\n.\n

    *foo*

    " @@ -3224,10 +3231,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    `foobar

    " - "````````````````````````````````" - "## Emphasis and strong emphasis" -- "John Gruber's original [Markdown syntax" -- "description](https://daringfireball.net/projects/markdown/syntax#em" -- ") says:" -- "> Markdown treats asterisks (`*`) and underscores (`_`) as indicators of" +- "John Gruber's original" +- "[Markdown syntax\ndescription" +- "](https://daringfireball.net/projects/markdown/syntax#em)" +- "says:" +- ">" +- "Markdown treats asterisks (`*`) and underscores (`_`) as indicators of" - "> emphasis. Text wrapped with one `*` or `_`" - " will be wrapped with an HTML\n> `` tag; double" - "`*`'s or `_`'s will be wrapped with an HTML" @@ -3237,50 +3246,54 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " test suite makes it clear that triple `***` and\n`___`" - "delimiters can be used for strong emphasis, and most" - "implementations have also allowed the following patterns:" -- "``` markdown\n***strong emph***\n***strong** in emph*" +- "``` markdown" +- "***strong emph***\n***strong** in emph*" - "***emph* in strong**\n**in strong *emph***" - "*in emph **strong***\n```" - "The following patterns are less widely supported, but the intent" - "is clear and they are useful (especially in contexts like bibliography\nentries):" -- "``` markdown\n*emph *with emph* in it*" +- "``` markdown" +- "*emph *with emph* in it*" - "**strong **with strong** in it**\n```" -- Many implementations have also restricted intraword emphasis to -- "the `*`" +- "Many implementations have also restricted intraword emphasis to\nthe `*`" - "forms, to avoid unwanted emphasis in words containing" - internal underscores. (It is best practice to put these in code - "spans, but users often do not.)" -- "``` markdown\ninternal emphasis: foo*bar*baz" -- "no emphasis: foo_bar_baz\n```" +- "``` markdown" +- "internal emphasis: foo*bar*baz\nno emphasis: foo_bar_baz" +- "```" - "The rules given below capture all of these patterns, while allowing" - for efficient parsing strategies that do not backtrack. - "First, some definitions. A [delimiter run](@) is either" -- "a sequence of one or more `*`" -- " characters that is not preceded or\nfollowed by a non-backslash-escaped" -- "`*` character, or a sequence\nof one or more `_`" +- "a sequence of one or more `*` characters that is not preceded or" +- "followed by a non-backslash-escaped `*`" +- " character, or a sequence\nof one or more `_`" - " characters that is not preceded or followed by\na non-backslash-escaped" - "`_` character." -- "A [left-flanking delimiter run](@) is" -- "a [delimiter run] that is (1) not followed by [Unicode whitespace" -- "],\nand either (2a) not followed by a [Unicode punctuation character]" -- ", or\n(2b) followed by a [Unicode punctuation character] and" +- "A [left-flanking delimiter run](@) is\na [delimiter run" +- "] that is (1) not followed by [Unicode whitespace]," +- "and either (2a) not followed by a [Unicode punctuation character], or" +- "(2b) followed by a [Unicode punctuation character] and" - "preceded by [Unicode whitespace] or a [Unicode punctuation character]." - "For purposes of this definition, the beginning and the end of" - the line count as Unicode whitespace. -- "A [right-flanking delimiter run](@) is" -- "a [delimiter run] that is (1) not preceded by [Unicode whitespace" -- "],\nand either (2a) not preceded by a [Unicode punctuation character]" -- ", or\n(2b) preceded by a [Unicode punctuation character] and" +- "A [right-flanking delimiter run](@) is\na [delimiter run" +- "] that is (1) not preceded by [Unicode whitespace]," +- "and either (2a) not preceded by a [Unicode punctuation character], or" +- "(2b) preceded by a [Unicode punctuation character] and" - "followed by [Unicode whitespace] or a [Unicode punctuation character]." - "For purposes of this definition, the beginning and the end of" - "the line count as Unicode whitespace.\n\nHere are some examples of delimiter runs." -- " - left-flanking but not right-flanking:\n\n ```\n ***abc" -- " _abc\n **\"abc\"\n _\"abc\"\n ```" -- " - right-flanking but not left-flanking:\n\n ```\n abc***" -- " abc_\n \"abc\"**\n \"abc\"_\n ```" -- " - Both left and right-flanking:\n\n ```\n abc***def" -- " \"abc\"_\"def\"\n ```" -- " - Neither left nor right-flanking:\n\n ```\n abc *** def" -- " a _ b\n ```" +- "- left-flanking but not right-flanking:" +- " ```\n ***abc\n _abc\n **\"abc\"" +- " _\"abc\"\n ```" +- "- right-flanking but not left-flanking:" +- " ```\n abc***\n abc_\n \"abc\"**" +- "\"abc\"_\n ```" +- "- Both left and right-flanking:" +- " ```\n abc***def\n \"abc\"_\"def\"\n ```" +- "- Neither left nor right-flanking:" +- " ```\n abc *** def\n a _ b\n ```" - (The idea of distinguishing left-flanking and right-flanking - delimiter runs based on the character before and the character - "after comes from Roopesh Chander's" @@ -3291,31 +3304,39 @@ input_file: tests/inputs/markdown/commonmark_spec.md - and its rules for distinguishing left- and right-flanking runs - are a bit more complex than the ones given here.) - "The following rules define emphasis and strong emphasis:" -- "1. A single `*` character [can open emphasis](@)" -- "iff (if and only if) it is part of a [left-flanking" -- "delimiter run]." -- "2. A single `_` character [can open emphasis] iff" +- "1." +- "A single `*` character [can open emphasis](@)" +- "iff (if and only if) it is part of a [" +- "left-flanking delimiter run]." +- "2." +- "A single `_` character [can open emphasis] iff" - "it is part of a [left-flanking delimiter run]" - "and either (a) not part of a [right-flanking delimiter run]" - "or (b) part of a [right-flanking delimiter run]" - "preceded by a [Unicode punctuation character]." -- "3. A single `*` character [can close emphasis](@)" +- "3." +- "A single `*` character [can close emphasis](@)" - "iff it is part of a [right-flanking delimiter run]." -- "4. A single `_` character [can close emphasis] iff" +- "4." +- "A single `_` character [can close emphasis] iff" - "it is part of a [right-flanking delimiter run]" - "and either (a) not part of a [left-flanking delimiter run]" - "or (b) part of a [left-flanking delimiter run]" - "followed by a [Unicode punctuation character]." -- "5. A double `**` [can open strong emphasis](@)" +- "5." +- "A double `**` [can open strong emphasis](@)" - "iff it is part of a [left-flanking delimiter run]." -- "6. A double `__` [can open strong emphasis] iff" +- "6." +- "A double `__` [can open strong emphasis] iff" - "it is part of a [left-flanking delimiter run]" - "and either (a) not part of a [right-flanking delimiter run]" - "or (b) part of a [right-flanking delimiter run]" - "preceded by a [Unicode punctuation character]." -- "7. A double `**` [can close strong emphasis](@)" +- "7." +- "A double `**` [can close strong emphasis](@)" - "iff it is part of a [right-flanking delimiter run]." -- "8. A double `__` [can close strong emphasis] iff" +- "8." +- "A double `__` [can close strong emphasis] iff" - "it is part of a [right-flanking delimiter run]" - "and either (a) not part of a [left-flanking delimiter run]" - "or (b) part of a [left-flanking delimiter run]" @@ -3323,18 +3344,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "9." - "Emphasis begins with a delimiter that [can open emphasis] and ends" - "with a delimiter that [can close emphasis], and that uses the same" -- "character (`_` or `*`" -- ) as the opening delimiter. The +- "character (`_` or `*`) as the opening delimiter. The" - "opening and closing delimiters must belong to separate\n [delimiter runs]" - ". If one of the delimiters can both" - "open and close emphasis, then the sum of the lengths of the" - delimiter runs containing the opening and closing delimiters - must not be a multiple of 3 unless both lengths are - multiples of 3. -- 10. Strong emphasis begins with a delimiter that -- " [can open strong emphasis] and ends with a delimiter that\n [" -- "can close strong emphasis], and that uses the same character\n (`_`" -- "or `*`" +- "10." +- "Strong emphasis begins with a delimiter that\n [can open strong emphasis]" +- " and ends with a delimiter that\n [can close strong emphasis]" +- ", and that uses the same character\n (`_` or `*`" - ) as the opening delimiter. The - "opening and closing delimiters must belong to separate\n [delimiter runs]" - ". If one of the delimiters can both open" @@ -3342,32 +3362,39 @@ input_file: tests/inputs/markdown/commonmark_spec.md - the delimiter runs containing the opening and closing - delimiters must not be a multiple of 3 unless both lengths - are multiples of 3. -- "11. A literal `*` character cannot occur at the beginning or end of" -- "`*`-delimited emphasis or `**`-delimited strong emphasis" -- ", unless it\n is backslash-escaped." -- "12. A literal `_` character cannot occur at the beginning or end of" -- "`_`-delimited emphasis or `__`-delimited strong emphasis" -- ", unless it\n is backslash-escaped." +- "11." +- "A literal `*` character cannot occur at the beginning or end of" +- "`*`-delimited emphasis or `**`" +- "-delimited strong emphasis, unless it\n is backslash-escaped." +- "12." +- "A literal `_` character cannot occur at the beginning or end of" +- "`_`-delimited emphasis or `__`" +- "-delimited strong emphasis, unless it\n is backslash-escaped." - "Where rules 1--12 above are compatible with multiple parsings," - "the following principles resolve ambiguity:" -- "13. The number of nestings should be minimized. Thus, for example," +- "13." +- "The number of nestings should be minimized. Thus, for example," - "an interpretation `...` is always preferred to" - "`...`." -- "14. An interpretation `...` is always" +- "14." +- "An interpretation `...` is always" - "preferred to `...`." -- "15. When two potential emphasis or strong emphasis spans overlap," +- "15." +- "When two potential emphasis or strong emphasis spans overlap," - so that the second begins before the first ends and ends after - "the first ends, the first takes precedence. Thus, for example," -- "`*foo _bar* baz_` is parsed as `foo" -- "_bar baz_` rather\n than" -- "`*foo bar* baz`." -- 16. When there are two potential emphasis or strong emphasis spans +- "`*foo _bar* baz_` is parsed as" +- "`foo _bar baz_` rather" +- "than `*foo bar* baz`." +- "16." +- When there are two potential emphasis or strong emphasis spans - "with the same closing delimiter, the shorter one (the one that" - "opens later) takes precedence. Thus, for example," -- "`**foo **bar baz**` is parsed as `**foo bar baz
    `\n rather than" +- "`**foo **bar baz**` is parsed as" +- "`**foo bar baz`\n rather than" - "`foo **bar baz`." -- "17. Inline code spans, links, images, and HTML tags group more tightly" +- "17." +- "Inline code spans, links, images, and HTML tags group more tightly" - "than emphasis. So, when there is a choice between an interpretation" - "that contains one of these elements and one that does not, the" - "former always wins. Thus, for example," @@ -3609,8 +3636,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "````````````````````````````````" - "example\n**foo bar **\n.\n

    **foo bar **

    " - "````````````````````````````````" -- "(Nor can it be interpreted as an emphasized `*foo bar *`, because" -- "of\nRule 11.)" +- "(Nor can it be interpreted as an emphasized `*foo bar *`" +- ", because of\nRule 11.)" - "This is not strong emphasis, because the second `**` is" - "preceded by punctuation and followed by an alphanumeric:" - "````````````````````````````````" @@ -3738,8 +3765,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foobar" - "baz

    \n```" - is precluded by the condition that a delimiter that -- "can both open and close (like the `*` after `foo`" -- ")\ncannot form emphasis if the sum of the lengths of" +- "can both open and close (like the `*` after `foo`)" +- cannot form emphasis if the sum of the lengths of - the delimiter runs containing the opening and - closing delimiters is a multiple of 3 unless - both lengths are multiples of 3. @@ -3767,8 +3794,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "*foo**bar***\n." - "

    foobar

    " - "````````````````````````````````" -- When the lengths of the interior closing and opening -- delimiter runs are *both* +- "When the lengths of the interior closing and opening\ndelimiter runs are *both*" - " multiples of 3, though,\nthey can match to create emphasis:" - "````````````````````````````````" - example @@ -3912,8 +3938,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "foo **_**\n.\n

    foo _

    " - "````````````````````````````````" - "Note that when delimiters do not match evenly, Rule 11 determines" -- "that the excess literal `*`" -- " characters will appear outside of the\nemphasis, rather than inside it:" +- "that the excess literal `*` characters will appear outside of the" +- "emphasis, rather than inside it:" - "````````````````````````````````" - example - "**foo*\n.\n

    *foo

    " @@ -3965,8 +3991,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "__foo_\n.\n

    _foo

    " - "````````````````````````````````" - "Note that when delimiters do not match evenly, Rule 12 determines" -- "that the excess literal `_`" -- " characters will appear outside of the\nemphasis, rather than inside it:" +- "that the excess literal `_` characters will appear outside of the" +- "emphasis, rather than inside it:" - "````````````````````````````````" - example - "_foo__\n.\n

    foo_

    " @@ -4115,58 +4141,64 @@ input_file: tests/inputs/markdown/commonmark_spec.md - " the\ndestination and title are given immediately after the link text. In" - "[reference links] the destination and title are defined elsewhere in\nthe document." - "A [link text](@) consists of a sequence of zero or more" -- "inline elements enclosed by square brackets (`[` and `]`" -- "). The\nfollowing rules apply:" -- "- Links may not contain other links, at any level of nesting. If" +- "inline elements enclosed by square brackets (`[` and `]`). The" +- "following rules apply:" +- "-" +- "Links may not contain other links, at any level of nesting. If" - multiple otherwise valid link definitions appear nested inside each - "other, the inner-most definition is used." -- "- Brackets are allowed in the [link text] only if (a)" -- they +- "-" +- "Brackets are allowed in the [link text] only if (a) they" - are backslash-escaped or (b) they appear as a matched pair of - "brackets,\n with an open bracket `[`" - ", a sequence of zero or more inlines, and\n a close bracket" - "`]`." -- "- Backtick [code spans], [autolinks], and raw [HTML" -- "tags] bind more tightly" +- "-" +- "Backtick [code spans], [autolinks], and raw [HTML tags" +- "] bind more tightly" - "than the brackets in link text. Thus, for example," - "`` [foo`]` `` could not be a link text, since the second" - "`]`\n is part of a code span." -- "- The brackets in link text bind more tightly than markers for" -- "[emphasis and strong emphasis]. Thus, for example," +- "-" +- "The brackets in link text bind more tightly than markers for\n [" +- "emphasis and strong emphasis]. Thus, for example," - "`*[foo*](url)` is a link." - "A [link destination](@) consists of either" -- "- a sequence of zero or more characters between an opening `<` and a" -- " closing `>` that contains no line endings or unescaped\n `<`" -- "or `>` characters, or" -- "- a nonempty sequence of characters that does not start with `<`," -- " does not include [ASCII control characters][ASCII control character]\n or [" -- "space] character, and includes parentheses only if (a) they are" +- "-" +- "a sequence of zero or more characters between an opening `<` and a" +- "closing `>` that contains no line endings or unescaped\n `<` or" +- "`>` characters, or" +- "-" +- "a nonempty sequence of characters that does not start with `<`," +- "does not include [ASCII control characters][ASCII control character]\n or [space" +- "] character, and includes parentheses only if (a) they are" - backslash-escaped or (b) they are part of a balanced pair of - "unescaped parentheses.\n (Implementations may impose limits on parentheses nesting to" - "avoid performance issues, but at least three levels of nesting" - "should be supported.)\n\nA [link title](@) consists of either" -- "- a sequence of zero or more characters between straight double-quote" -- "characters (`\"`), including a `\"`" -- " character only if it is\n backslash-escaped, or" -- "- a sequence of zero or more characters between straight single-quote" -- "characters (`'`), including a `'`" -- " character only if it is\n backslash-escaped, or" -- "- a sequence of zero or more characters between matching parentheses" -- "(`(...)`), including a `(` or `)`" -- " character only if it is\n backslash-escaped." -- "Although [link titles] may span multiple lines, they may not contain" -- "a [blank line]." +- "-" +- "a sequence of zero or more characters between straight double-quote\n characters (`\"`" +- "), including a `\"` character only if it is" +- "backslash-escaped, or" +- "-" +- "a sequence of zero or more characters between straight single-quote\n characters (" +- "`'`), including a `'` character only if it is" +- "backslash-escaped, or" +- "-" +- "a sequence of zero or more characters between matching parentheses\n (`(...)`" +- "), including a `(` or `)` character only if it is" +- backslash-escaped. +- "Although [link titles] may span multiple lines, they may not contain\na" +- "[blank line]." - "An [inline link](@) consists of a [link text] followed immediately" -- "by a left parenthesis `(`" -- ", an optional [link destination], an optional\n[link title]" -- ", and a right parenthesis `)`" -- "." +- "by a left parenthesis `(`, an optional [link destination], an optional" +- "[link title], and a right parenthesis `)`." - "These four components may be separated by spaces, tabs, and up to one line" - "ending.\nIf both [link destination] and [link title]" -- "are present, they *must*" -- " be\nseparated by spaces, tabs, and up to one line ending." -- "The link's text consists of the inlines contained" -- "in the [link text] (excluding the enclosing square brackets).\nThe link'" +- "are present, they *must* be" +- "separated by spaces, tabs, and up to one line ending." +- "The link's text consists of the inlines contained\nin the [link text" +- "] (excluding the enclosing square brackets).\nThe link'" - "s URI consists of the link destination, excluding enclosing\n`<...>`" - "if present, with backslash-escapes in effect as described" - "above. The link's title consists of the link title, excluding its" @@ -4365,8 +4397,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "brings, since there are already many ways---backslash escaping," - "entity and numeric character references, or using a different" - "quote type for the enclosing title---to write titles containing\ndouble quotes." -- "`Markdown.pl`" -- "'s handling of titles has a number" +- "`Markdown.pl`'s handling of titles has a number" - "of other strange features. For example, it allows single-quoted" - "titles in inline links, but not reference links. And, in" - "reference links but not inline links, it allows a title to begin\nwith" @@ -4483,13 +4514,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "search=](uri)

    " - "````````````````````````````````" - "There are three kinds of [reference link](@)s:" -- "[full](#full-reference-link), [collapsed](#collapsed-reference-link)" -- ",\nand [shortcut](#shortcut-reference-link)." -- "A [full reference link](@)" -- "consists of a [link text] immediately followed by a [link label]" -- "that [matches] a [link reference definition] elsewhere in the document." -- "A [link label](@) begins with a left bracket (`[`)" -- "and ends\nwith the first right bracket (`]`" +- "[full](#full-reference-link), [collapsed](#collapsed-reference-link)," +- "and [shortcut](#shortcut-reference-link)." +- "A [full reference link](@)\nconsists of a [link text]" +- " immediately followed by a [link label]\nthat [matches] a [" +- "link reference definition] elsewhere in the document." +- "A [link label](@) begins with a left bracket (`[`" +- ") and ends\nwith the first right bracket (`]`" - ) that is not backslash-escaped. - "Between these brackets there must be at least one character that is not a space," - "tab, or line ending.\nUnescaped square bracket characters are not allowed inside the" @@ -4498,24 +4529,23 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "One label [matches](@)" - another just in case their normalized forms are equal. To normalize a - "label, strip off the opening and closing brackets,\nperform the" -- "*Unicode case fold*" -- ", strip leading and trailing" +- "*Unicode case fold*, strip leading and trailing" - "spaces, tabs, and line endings, and collapse consecutive internal" - "spaces, tabs, and line endings to a single space." - If there are multiple - "matching reference link definitions, the one that comes first in the" - document is used. - (It is desirable in such cases to emit a warning.) -- "The link's URI and title are provided by the matching [link" -- "reference definition].\n\nHere is a simple example:" +- "The link's URI and title are provided by the matching [link\nreference definition" +- "].\n\nHere is a simple example:" - "````````````````````````````````" - example - "[foo][bar]\n\n[bar]: /url \"title\"\n." - "

    foo" - "````````````````````````````````" -- "The rules for the [link text] are the same as with" -- "[inline links]. Thus:" +- "The rules for the [link text] are the same as with\n[" +- "inline links]. Thus:" - "The link text may contain balanced brackets, but not unbalanced ones," - "unless they are escaped:" - "````````````````````````````````" @@ -4559,8 +4589,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    [foo bar baz]ref

    " - "````````````````````````````````" -- "(In the examples above, we have two [shortcut reference links]" -- "instead of one [full reference link].)" +- "(In the examples above, we have two [shortcut reference links]\ninstead of one" +- "[full reference link].)" - "The following cases illustrate the precedence of link text grouping over\nemphasis grouping:" - "````````````````````````````````" - example @@ -4613,7 +4643,7 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    Baz

    " - "````````````````````````````````" - "No spaces, tabs, or line endings are allowed between the [link text]" -- "and the\n[link label]:" +- " and the\n[link label]:" - "````````````````````````````````" - example - "[foo] [bar]\n\n[bar]: /url \"title\"\n." @@ -4635,11 +4665,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ". If whitespace is allowed between the" - "link text and the link label, then in the following we will have" - "a single reference link, not two shortcut reference links, as\nintended:" -- "``` markdown\n[foo]\n[bar]\n\n[foo]: /url1" +- "``` markdown" +- "[foo]\n[bar]\n\n[foo]: /url1" - "[bar]: /url2\n```" - "(Note that [shortcut reference links] were introduced by Gruber" -- "himself in a beta version of `Markdown.pl`" -- ", but never included\nin the official syntax description. Without shortcut reference" +- "himself in a beta version of `Markdown.pl`, but never included" +- in the official syntax description. Without shortcut reference - "links, it is harmless to allow space between the link text and" - "link label; but once shortcut references are introduced, it is" - "too dangerous to allow this, as it frequently leads to" @@ -4651,8 +4682,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[bar][foo]\n." - "

    bar

    " - "````````````````````````````````" -- "Note that matching is performed on normalized strings, not parsed\ninline content." -- "So the following does not match, even though the" +- "Note that matching is performed on normalized strings, not parsed" +- "inline content. So the following does not match, even though the" - "labels define equivalent inline content:" - "````````````````````````````````" - example @@ -4690,8 +4721,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[bar\\\\]: /uri\n\n[bar\\\\]\n." - "

    bar\\

    " - "````````````````````````````````" -- "A [link label] must contain at least one character that is not a space" -- ", tab, or\nline ending:" +- "A [link label]" +- "must contain at least one character that is not a space, tab, or" +- "line ending:" - "````````````````````````````````" - example - "[]\n\n[]: /uri\n.\n

    []

    " @@ -4702,11 +4734,10 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[\n ]\n\n[\n ]: /uri\n.\n

    [\n]

    \n

    [" - "]: /uri

    " - "````````````````````````````````" -- "A [collapsed reference link](@)" -- "consists of a [link label] that [matches] a\n[" -- "link reference definition] elsewhere in the\ndocument, followed by the string" -- "`[]`" -- ".\nThe contents of the link label are parsed as inlines," +- "A [collapsed reference link](@)\nconsists of a [link label]" +- " that [matches] a\n[link reference definition] elsewhere in the" +- "document, followed by the string `[]`." +- "The contents of the link label are parsed as inlines," - "which are used as the link's text. The link'" - s URI and title are - "provided by the matching reference link definition. Thus,\n`[foo][]`" @@ -4739,11 +4770,9 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo" - "[]

    " - "````````````````````````````````" -- "A [shortcut reference link](@)" -- "consists of a [link label] that [matches] a\n[" -- "link reference definition] elsewhere in the\ndocument and is not followed by" -- "`[]`" -- or a link label. +- "A [shortcut reference link](@)\nconsists of a [link label]" +- " that [matches] a\n[link reference definition] elsewhere in the" +- "document and is not followed by `[]` or a link label." - "The contents of the link label are parsed as inlines," - "which are used as the link's text. The link's URI and title" - "are provided by the matching link reference definition.\nThus, `[foo]`" @@ -4830,8 +4859,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "[foo][bar][baz]\n\n[baz]: /url\n." - "

    [foo]bar

    " - "````````````````````````````````" -- "Here, though, `[foo][bar]` is parsed as a reference," -- "since\n`[bar]` is defined:" +- "Here, though, `[foo][bar]`" +- " is parsed as a reference, since\n`[bar]` is defined:" - "````````````````````````````````" - example - "[foo][bar][baz]\n\n[baz]: /url1" @@ -4840,8 +4869,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "url1\">baz

    " - "````````````````````````````````" - "Here `[foo]` is not parsed as a shortcut reference, because it" -- "is followed by a link label (even though `[bar]` is not defined" -- "):" +- "is followed by a link label (even though `[bar]`" +- "is not defined):" - "````````````````````````````````" - example - "[foo][bar][baz]\n\n[baz]: /url1" @@ -4849,13 +4878,12 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    [foo]bar

    " - "````````````````````````````````" - "## Images" -- "Syntax for images is like the syntax for links, with one\ndifference." -- "Instead of [link text], we have an\n[image description](@)" -- ". The rules for this are the\nsame as for [link text]" -- ", except that (a) an\nimage description starts with `![`" -- "rather than `[`" -- ", and\n(b) an image description may contain links." -- An image description has inline elements +- "Syntax for images is like the syntax for links, with one" +- "difference. Instead of [link text], we have an" +- "[image description](@). The rules for this are the\nsame as for" +- "[link text], except that (a) an\nimage description starts with" +- "`![` rather than `[`, and" +- "(b) an image description may contain links.\nAn image description has inline elements" - "as its contents. When an image is rendered to HTML," - "this is standardly used as the image's `alt` attribute." - "````````````````````````````````" @@ -4884,8 +4912,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Though this spec is concerned with parsing, not rendering, it is" - "recommended that in rendering to HTML, only the plain string content\nof the [" - "image description] be used. Note that in" -- "the above example, the alt attribute's value is `foo bar`, not `" -- "foo\n[bar](/url)` or" +- "the above example, the alt attribute's value is `foo bar`, not" +- "`foo\n[bar](/url)` or" - "`foo bar`" - ". Only the plain string\ncontent is rendered, without formatting." - "````````````````````````````````" @@ -4992,8 +5020,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    \"Foo\" - "````````````````````````````````" -- "If you just want a literal `!" -- "` followed by bracketed text, you can\nbackslash-escape the opening" +- "If you just want a literal `!`" +- " followed by bracketed text, you can\nbackslash-escape the opening" - "`[`:" - "````````````````````````````````" - example @@ -5013,15 +5041,14 @@ input_file: tests/inputs/markdown/commonmark_spec.md - ". They are parsed as links, with the URL or email address" - as the link label. - "A [URI autolink](@) consists of `<`, followed by an" -- "[absolute URI] followed by `>`" -- ". It is parsed as" +- "[absolute URI] followed by `>`. It is parsed as" - "a link to the URI, with the URI as the link's label." -- "An [absolute URI](@)," -- "for these purposes, consists of a [scheme] followed by a colon (`:`" -- ")\nfollowed by zero or more characters other than [ASCII control\ncharacters][" -- "ASCII control character], [space], `<`, and `>`" -- ".\nIf the URI includes these characters, they must be percent-encoded" -- "(e.g. `%20` for a space)." +- "An [absolute URI](@),\nfor these purposes, consists of a [scheme" +- "] followed by a colon (`:`)" +- "followed by zero or more characters other than [ASCII control\ncharacters][" +- "ASCII control character], [space], `<`, and `>`." +- "If the URI includes these characters, they must be percent-encoded\n(e.g." +- "`%20` for a space)." - "For purposes of this spec, a [scheme](@) is any sequence" - of 2--32 characters beginning with an ASCII letter and followed - "by any combination of ASCII letters, digits, or the symbols plus\n(\"+\"" @@ -5095,16 +5122,15 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    https://example.com/\\[\\

    " - "````````````````````````````````" -- "An [email autolink](@)" -- "consists of `<`, followed by an [email address],\nfollowed by" -- "`>`" +- "An [email autolink](@)\nconsists of `<`" +- ", followed by an [email address],\nfollowed by `>`" - ". The link's label is the email address,\nand the URL is" - "`mailto:` followed by the email address." - "An [email address](@),\nfor these purposes, is anything that matches" - the -- "[non-normative regex from the HTML5" -- "spec](https://html.spec.whatwg.org/multipage/forms.html#e" -- "-mail-state-(type=email)):" +- "[non-normative regex from the HTML5\nspec" +- "](https://html.spec.whatwg.org/multipage/forms.html#e-mail" +- "-state-(type=email)):" - "/^[a-zA-Z0-9.!#$%&'*+/=?" - "^_`{|}~-]+@[a-zA-Z0-9](?:" - "[a-zA-Z0-9-]{0,61}[a-zA-Z0" @@ -5159,61 +5185,59 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "

    foo@bar.example.com

    " - "````````````````````````````````" - "## Raw HTML" -- "Text between `<` and `>` that looks like an HTML tag is parsed as" -- "a\nraw HTML tag and will be rendered in HTML without escaping." +- "Text between `<` and `>`" +- that looks like an HTML tag is parsed as a +- raw HTML tag and will be rendered in HTML without escaping. - "Tag and attribute names are not limited to current HTML tags," - "so custom tags (and even, say, DocBook tags) may be used" - ".\n\nHere is the grammar for tags:" - "A [tag name](@) consists of an ASCII letter" - "followed by zero or more ASCII letters, digits, or" - "hyphens (`-`)." -- "An [attribute](@) consists of spaces, tabs, and up to one" -- "line ending,\nan [attribute name], and an optional\n[attribute value specification" -- "]." -- "An [attribute name](@)" -- "consists of an ASCII letter, `_`, or `:`, followed by zero" -- "or more ASCII\nletters, digits, `_`, `.`, `:`, or" -- "`-`" +- "An [attribute](@)" +- " consists of spaces, tabs, and up to one line ending,\nan [" +- "attribute name], and an optional\n[attribute value specification]." +- "An [attribute name](@)\nconsists of an ASCII letter, `_`" +- ", or `:`, followed by zero or more ASCII\nletters, digits," +- "`_`, `.`, `:`, or `-`" - ". (Note: This is the XML" - specification restricted to ASCII. HTML5 is laxer.) - "An [attribute value specification](@)" - "consists of optional spaces, tabs, and up to one line ending,\na" -- "`=`" -- " character, optional spaces, tabs, and up to one line ending,\nand an" -- "[attribute value]." -- "An [attribute value](@)" -- "consists of an [unquoted attribute value],\na [" -- "single-quoted attribute value], or a [double-quoted attribute value]." +- "`=` character, optional spaces, tabs, and up to one line ending," +- "and an [attribute value]." +- "An [attribute value](@)\nconsists of an [unquoted attribute value" +- "],\na [single-quoted attribute value], or a [" +- "double-quoted attribute value]." - "An [unquoted attribute value](@)" - is a nonempty string of characters not -- "including spaces, tabs, line endings, `\"`, `'`, `=`, `<" -- "`, `>`, or `` ` ``." -- "A [single-quoted attribute value](@)" -- "consists of `'`, zero or more\ncharacters not including `'`" -- ", and a final `'`." -- "A [double-quoted attribute value](@)" -- "consists of `\"`, zero or more\ncharacters not including `\"`" -- ", and a final `\"`." +- "including spaces, tabs, line endings, `\"`, `'`, `=`," +- "`<`, `>`, or `` ` ``." +- "A [single-quoted attribute value](@)\nconsists of `'`" +- ", zero or more\ncharacters not including `'`, and a final `'`." +- "A [double-quoted attribute value](@)\nconsists of `\"`" +- ", zero or more\ncharacters not including `\"`, and a final `\"`." - "An [open tag](@) consists of a `<` character, a [" - "tag name],\nzero or more [attributes]" - ", optional spaces, tabs, and up to one line ending,\nan optional" - "`/` character, and a `>` character." - "A [closing tag](@) consists of the string ``." -- "An [HTML comment](@) consists of ``, `" -- "`, or ``, and `-->` (see the" +- "[tag name]" +- ", optional spaces, tabs, and up to one line ending, and the character" +- "`>`." +- "An [HTML comment](@) consists of ``," +- "``, or ``, and `-->` (see the" - "[HTML spec](https://html.spec.whatwg.org/multipage/" - "parsing.html#markup-declaration-open-state))." -- "A [processing instruction](@)\nconsists of the string ``, and the string" +- "A [processing instruction](@)\nconsists of the string ``, and the string" - "`?>`." -- "A [declaration](@) consists of the string ``" +- "A [declaration](@) consists of the string ``" - ", and the character `>`." -- "A [CDATA section](@) consists of\nthe string ``" +- "A [CDATA section](@) consists of\nthe string ``" - ", and the string `]]>`." - "An [HTML tag](@) consists of an [open tag], a [" - "closing tag],\nan [HTML comment], a [processing instruction], a [declaration" @@ -5343,13 +5367,13 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "## Hard line breaks" - A line ending (not in a code span or HTML tag) that is preceded - by two or more spaces and does not occur at the end of a block -- "is parsed as a [hard line break](@)" -- " (rendered\nin HTML as a `
    ` tag):" +- "is parsed as a [hard line break](@) (rendered" +- "in HTML as a `
    ` tag):" - "````````````````````````````````" - "example\nfoo \nbaz\n.\n

    foo
    \nbaz

    " - "````````````````````````````````" -- "For a more visible alternative, a backslash before the" -- "[line ending] may be used instead of two or more spaces:" +- "For a more visible alternative, a backslash before the\n[line ending]" +- "may be used instead of two or more spaces:" - "````````````````````````````````" - "example\nfoo\\\nbaz\n.\n

    foo
    \nbaz

    " - "````````````````````````````````" @@ -5457,27 +5481,29 @@ input_file: tests/inputs/markdown/commonmark_spec.md - In this appendix we describe some features of the parsing strategy - used in the CommonMark reference implementations. - "## Overview\n\nParsing has two phases:" -- "1. In the first phase, lines of input are consumed and the block" +- "1." +- "In the first phase, lines of input are consumed and the block" - "structure of the document---its division into paragraphs, block quotes," - "list items, and so on---is constructed. Text is assigned to these" - blocks but not parsed. Link reference definitions are parsed and a - map of links is constructed. -- "2. In the second phase, the raw text contents of paragraphs and headings" +- "2." +- "In the second phase, the raw text contents of paragraphs and headings" - "are parsed into sequences of Markdown inline elements (strings," - "code spans, links, emphasis, and so on), using the map of link" - references constructed in phase 1. - "At each point in processing, the document is represented as a tree of" - "**blocks**. The root of the tree is a `document`" - " block. The `document`\nmay have any number of other blocks as" -- "**children**" -- ". These children" +- "**children**. These children" - "may, in turn, have other blocks as children." - "The last child of a block\nis normally considered **open**" - ", meaning that subsequent lines of input" - can alter its contents. (Blocks that are not open are **closed** - ".)\nHere, for example, is a possible document tree, with the open blocks" - "marked by arrows:" -- "``` tree\n-> document\n -> block_quote\n paragraph" +- "``` tree" +- "-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - " -> list (type=bullet tight=true bullet_char=-)\n list_item" - " paragraph\n \"Qui *quodsi iracundia*\"" @@ -5487,15 +5513,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - Each line that is processed has an effect on this tree. The line is - "analyzed and, depending on its contents, the document may be altered" - "in one or more of the following ways:" -- "1. One or more open blocks may be closed.\n2." -- One or more new blocks may be created as children of the +- 1. One or more open blocks may be closed. +- 2. One or more new blocks may be created as children of the - last open block. - 3. Text may be added to the last (deepest) open block remaining - on the tree. - "Once a line has been incorporated into the tree in this way," - "it can be discarded, so input can be read in a stream." - "For each line, we follow this procedure:" -- "1. First we iterate through the open blocks, starting with the" +- "1." +- "First we iterate through the open blocks, starting with the" - "root document, and descending through last children down to the last" - open block. Each block imposes a condition that the line must satisfy - if the block is to remain open. @@ -5504,14 +5531,16 @@ input_file: tests/inputs/markdown/commonmark_spec.md - In this phase we may match all or just some of the open - "blocks. But we cannot close unmatched blocks yet, because we may have a" - "[lazy continuation line]." -- "2. Next, after consuming the continuation markers for existing" -- "blocks, we look for new block starts (e.g. `>` for a" -- "block quote).\nIf we encounter a new block start, we close any blocks unmatched" +- "2." +- "Next, after consuming the continuation markers for existing" +- "blocks, we look for new block starts (e.g. `>`" +- for a block quote). +- "If we encounter a new block start, we close any blocks unmatched" - in step 1 before creating the new block as a child of the last - matched container block. -- "3. Finally, we look at the remainder of the line (after block" -- "markers like `>`" -- ", list markers, and indentation have been consumed)." +- "3." +- "Finally, we look at the remainder of the line (after block\nmarkers like" +- "`>`, list markers, and indentation have been consumed)." - This is text that can be incorporated into the last open - "block (a paragraph, code block, heading, or raw HTML)." - Setext headings are formed when we see a line of a paragraph @@ -5521,41 +5550,46 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "one or more reference link definitions. Any remainder becomes a\nnormal paragraph." - We can see how this works by considering how the tree above is - "generated by four lines of Markdown:" -- "``` markdown\n> Lorem ipsum dolor\nsit amet." +- "``` markdown" +- "> Lorem ipsum dolor\nsit amet." - "> - Qui *quodsi iracundia*" - "> - aliquando id\n```" - "At the outset, our document model is just" - "``` tree\n-> document\n```\n\nThe first line of our text," - "``` markdown\n> Lorem ipsum dolor\n```" - "causes a `block_quote` block to be created as a child of our" -- "open `document` block, and a `paragraph`" -- " block as a child of\nthe `block_quote`" -- ". Then the text is added to the last open\nblock, the" -- "`paragraph`:" -- "``` tree\n-> document\n -> block_quote\n -> paragraph" +- "open `document` block, and a `paragraph` block as a child of" +- "the `block_quote`. Then the text is added to the last open" +- "block, the `paragraph`:" +- "``` tree" +- "-> document\n -> block_quote\n -> paragraph" - " \"Lorem ipsum dolor\"\n```\n\nThe next line," - "``` markdown\nsit amet.\n```" - "is a \"lazy continuation\" of the open `paragraph`, so it gets added" - "to the paragraph's text:" -- "``` tree\n-> document\n -> block_quote\n -> paragraph" +- "``` tree" +- "-> document\n -> block_quote\n -> paragraph" - " \"Lorem ipsum dolor\\nsit amet.\"\n```\n\nThe third line," - "``` markdown\n> - Qui *quodsi iracundia*\n```" -- "causes the `paragraph` block to be closed, and a new `list" -- "` block\nopened as a child of the `block_quote`. A" -- "`list_item` is also\nadded as a child of the `list`" -- ", and a `paragraph` as a child of\nthe `list_item`" -- ". The text is then added to the new `paragraph`:" -- "``` tree\n-> document\n -> block_quote\n paragraph" +- "causes the `paragraph` block to be closed, and a new" +- "`list` block\nopened as a child of the `block_quote`" +- ". A `list_item` is also\nadded as a child of the" +- "`list`, and a `paragraph` as a child of\nthe" +- "`list_item`. The text is then added to the new `paragraph`:" +- "``` tree" +- "-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - "-> list (type=bullet tight=true bullet_char=-)" - " -> list_item\n -> paragraph" - " \"Qui *quodsi iracundia*\"\n```" - "The fourth line,\n\n``` markdown\n> - aliquando id\n```" -- "causes the `list_item` (and its child the `paragraph`) to" -- "be closed,\nand a new `list_item` opened up as child of the" -- "`list`. A `paragraph`\nis added as a child of the new" -- "`list_item`, to contain the text.\nWe thus obtain the final tree:" -- "``` tree\n-> document\n -> block_quote\n paragraph" +- "causes the `list_item` (and its child the `paragraph`" +- ") to be closed,\nand a new `list_item`" +- "opened up as child of the `list`. A `paragraph`" +- "is added as a child of the new `list_item`" +- ", to contain the text.\nWe thus obtain the final tree:" +- "``` tree" +- "-> document\n -> block_quote\n paragraph" - "\"Lorem ipsum dolor\\nsit amet.\"" - " -> list (type=bullet tight=true bullet_char=-)\n list_item" - " paragraph\n \"Qui *quodsi iracundia*\"" @@ -5567,16 +5601,17 @@ input_file: tests/inputs/markdown/commonmark_spec.md - string contents of paragraphs and headings as inlines. At this - "point we have seen all the link reference definitions, so we can" - resolve reference links as we go. -- "``` tree\ndocument\n block_quote\n paragraph" -- " str \"Lorem ipsum dolor\"\n softbreak" -- "str \"sit amet.\"" +- "``` tree" +- "document\n block_quote\n paragraph\n str \"Lorem ipsum dolor\"" +- " softbreak\n str \"sit amet.\"" - " list (type=bullet tight=true bullet_char=-)\n list_item" - " paragraph\n str \"Qui \"\n emph" - " str \"quodsi iracundia\"\n list_item" - " paragraph\n str \"aliquando id\"\n```" - "Notice how the [line ending] in the first paragraph has" -- "been parsed as a `softbreak`, and the asterisks in the first list" -- "item\nhave become an `emph`." +- "been parsed as a `softbreak`" +- ", and the asterisks in the first list item\nhave become an" +- "`emph`." - "### An algorithm for parsing nested emphasis and links" - "By far the trickiest part of inline parsing is handling emphasis," - "strong emphasis, links, and images. This is done using the following" @@ -5593,8 +5628,8 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "- whether the delimiter is a potential opener, a potential closer," - or both (which depends on what sort of characters precede - and follow the delimiters). -- "When we hit a `]` character, we call the *look for link" -- "or image*\nprocedure (see below)." +- "When we hit a `]` character, we call the" +- "*look for link or image*\nprocedure (see below)." - "When we hit the end of the input, we call the *process emphasis*" - "procedure (see below), with `stack_bottom` = NULL." - "#### *look for link or image*" @@ -5602,23 +5637,28 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "through the stack for an opening `[` or `![` delimiter." - "- If we don't find one, we return a literal text node `]" - "`." -- "- If we do find one, but it's not *active*, we remove" -- "the inactive\n delimiter from the stack, and return a literal text node" -- "`]`." -- "- If we find one and it's active, then we parse ahead to see" -- "if\n we have an inline link/image, reference link/image, collapsed reference" +- "-" +- "If we do find one, but it's not *active*" +- ", we remove the inactive" +- "delimiter from the stack, and return a literal text node `]`." +- "-" +- "If we find one and it's active, then we parse ahead to see if" +- "we have an inline link/image, reference link/image, collapsed reference" - "link/image, or shortcut reference link/image." -- "+ If we don't, then we remove the opening delimiter from the" +- + +- "If we don't, then we remove the opening delimiter from the" - "delimiter stack and return a literal text node `]`." - "+ If we do, then" -- "* We return a link or image node whose children are the inlines" +- "*" +- We return a link or image node whose children are the inlines - after the text node pointed to by the opening delimiter. -- "* We run *process emphasis* on these inlines, with the `[`" -- "opener\n as `stack_bottom`." -- "* We remove the opening delimiter." -- "* If we have a link (and not an image), we also set all" -- "`[` delimiters before the opening delimiter to *inactive*" -- ". (This\n will prevent us from getting links within links.)" +- "*" +- "We run *process emphasis* on these inlines, with the `[` opener" +- "as `stack_bottom`.\n\n * We remove the opening delimiter." +- "*" +- "If we have a link (and not an image), we also set all" +- "`[` delimiters before the opening delimiter to *inactive*. (This" +- will prevent us from getting links within links.) - "#### *process emphasis*" - "Parameter `stack_bottom` sets a lower bound to how far we" - "descend in the [delimiter stack]. If it is NULL, we can" @@ -5627,41 +5667,47 @@ input_file: tests/inputs/markdown/commonmark_spec.md - "Let `current_position` point to the element on the [delimiter stack]" - "just above `stack_bottom` (or the first element if `stack_bottom`" - is NULL). -- "We keep track of the `openers_bottom` for each delimiter" -- "type (`*`, `_`" -- "), indexed to the length of the closing delimiter run" +- "We keep track of the `openers_bottom` for each delimiter\ntype (" +- "`*`, `_`), indexed to the length of the closing delimiter run" - (modulo 3) and to whether the closing delimiter can also be an - "opener. Initialize this to `stack_bottom`." - "Then we repeat the following until we run out of potential\nclosers:" -- "- Move `current_position` forward in the delimiter stack (if needed)" -- "until we find the first potential closer with delimiter `*` or `_`" -- ".\n (This will be the potential closer closest" -- to the beginning of the input -- the first one in parse order.) -- "- Now, look back in the stack (staying above `stack_bottom`" -- "and\n the `openers_bottom`" -- " for this delimiter type) for the\n first matching potential opener (\"matching\"" -- " means same delimiter).\n\n- If one is found:" -- "+ Figure out whether we have emphasis or strong emphasis:" +- "-" +- "Move `current_position` forward in the delimiter stack (if needed)" +- "until we find the first potential closer with delimiter `*` or `_`." +- "(This will be the potential closer closest\n to the beginning of the input --" +- the first one in parse order.) +- "-" +- "Now, look back in the stack (staying above `stack_bottom` and" +- "the `openers_bottom` for this delimiter type) for the" +- "first matching potential opener (\"matching\" means same delimiter)." +- "- If one is found:" +- + +- "Figure out whether we have emphasis or strong emphasis:" - "if both closer and opener spans have length >= 2, we have" - "strong, otherwise regular." -- "+ Insert an emph or strong emph node accordingly, after" +- + +- "Insert an emph or strong emph node accordingly, after" - the text node corresponding to the opener. -- + Remove any delimiters between the opener and closer from -- the delimiter stack. -- + Remove 1 (for regular emph) or 2 (for strong emph -- ) delimiters +- + +- "Remove any delimiters between the opener and closer from\n the delimiter stack." +- + +- Remove 1 (for regular emph) or 2 (for strong emph) +- delimiters - from the opening and closing text nodes. If they become empty - "as a result, remove them and remove the corresponding element" - "of the delimiter stack. If the closing node is removed, reset" - "`current_position` to the next element in the stack." - "- If none is found:" -- "+ Set `openers_bottom` to the element before `current_position`." +- + +- "Set `openers_bottom` to the element before `current_position`." - (We know that there are no openers for this kind of closer up to - and - "including this point, so this puts a lower bound on future searches.)" -- "+ If the closer at `current_position` is not a potential opener," +- + +- "If the closer at `current_position` is not a potential opener," - "remove it from the delimiter stack (since we know it can't" - be a closer either). - "+ Advance `current_position` to the next element in the stack." -- "After we're done, we remove all delimiters above `stack_bottom` from" -- "the\ndelimiter stack." +- "After we're done, we remove all delimiters above `stack_bottom`" +- " from the\ndelimiter stack." diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md-2.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md-2.snap index 5fb9be0..358de17 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md-2.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md-2.snap @@ -6,7 +6,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "# Headers\n\n```\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n```\n\n# h1 Heading 8-)\n## h2 Heading\n### h3 Heading\n#### h4 Heading\n##### h5 Heading\n###### h6 Heading\n\nAlternatively, for H1 and H2, an underline-ish style:\n\nAlt-H1\n======\n\nAlt-H2\n------\n\n------" - "# Emphasis\n\n```\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n```\n\nEmphasis, aka italics, with *asterisks* or _underscores_.\n\nStrong emphasis, aka bold, with **asterisks** or __underscores__.\n\nCombined emphasis with **asterisks and _underscores_**.\n\nStrikethrough uses two tildes. ~~Scratch this.~~\n\n**This is bold text**\n\n__This is bold text__\n\n*This is italic text*\n\n_This is italic text_\n\n~~Strikethrough~~\n\n------" - "# Lists" -- "```\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3." +- "```" +- "1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item.\n\n⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3." - "Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback\n\n+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n```\n\n1. First ordered list item\n2. Another item\n⋅⋅* Unordered sub-list.\n1. Actual numbers don't matter, just that it's a number\n⋅⋅1. Ordered sub-list\n4. And another item." - "⋅⋅⋅You can have properly indented paragraphs within list items. Notice the blank line above, and the leading spaces (at least one, but we'll use three here to also align the raw Markdown).\n\n⋅⋅⋅To have a line break without a paragraph, you will need to use two trailing spaces.⋅⋅\n⋅⋅⋅Note that this line is separate, but within the same paragraph.⋅⋅\n⋅⋅⋅(This is contrary to the typical GFM line break behaviour, where trailing spaces are not required.)\n\n* Unordered list can use asterisks\n- Or minuses\n+ Or pluses\n\n1. Make my changes\n 1. Fix bug\n 2. Improve formatting\n - Make the headings bigger\n2. Push my commits to GitHub\n3. Open a pull request\n * Describe my changes\n * Mention all the members of my team\n * Ask for feedback" - "+ Create a list by starting a line with `+`, `-`, or `*`\n+ Sub-lists are made by indenting 2 spaces:\n - Marker character change forces new list start:\n * Ac tristique libero volutpat at\n + Facilisis in pretium nisl aliquet\n - Nulla volutpat aliquam velit\n+ Very easy!\n\n------" @@ -18,10 +19,12 @@ input_file: tests/inputs/markdown/github_flavored.md - "# [Footnotes](https://github.com/markdown-it/markdown-it-footnote)\n\n```\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n```\n\nFootnote 1 link[^first].\n\nFootnote 2 link[^second].\n\nInline footnote^[Text of inline footnote] definition.\n\nDuplicated footnote reference[^second].\n\n[^first]: Footnote **can have markup**\n\n and multiple paragraphs.\n\n[^second]: Footnote text.\n\n------" - "# Code and Syntax Highlighting\n\n```\nInline `code` has `back-ticks around` it.\n```\n\nInline `code` has `back-ticks around` it.\n\n```c#\nusing System.IO.Compression;\n\n#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{\n [Obsolete(\"...\")]\n class Program : IInterface\n {\n public static List JustDoIt(int count)\n {\n Console.WriteLine($\"Hello {Name}!\");\n return new List(new int[] { 1, 2, 3 })\n }\n }\n}\n```\n\n```css\n@font-face {\n font-family: Chunkfive; src: url('Chunkfive.otf');\n}\n\nbody, .usertext {\n color: #F0F0F0; background: #600;\n font-family: Chunkfive, sans;\n}\n\n@import url(print.css);\n@media print {\n a[href^=http]::after {\n content: attr(href)\n }\n}\n```" - "```javascript\nfunction $initHighlight(block, cls) {\n try {\n if (cls.search(/\\bno\\-highlight\\b/) != -1)\n return process(block, true, 0x0F) +\n ` class=\"${cls}\"`;\n } catch (e) {\n /* handle exception */\n }\n for (var i = 0 / 2; i < classes.length; i++) {\n if (checkCondition(classes[i]) === undefined)\n console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```" -- "```php\nrequire_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}" -- "echo URI::ME . URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------" +- "```php" +- "require_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory\n{\n abstract function test();\n\n public static $st1 = 1;\n const ME = \"Yo\";\n var $list = NULL;\n private $var;\n\n /**\n * Returns a URI\n *\n * @return URI\n */\n static public function _factory($stats = array(), $uri = 'http')\n {\n echo __METHOD__;\n $uri = explode(':', $uri, 0b10);\n $schemeSpecific = isset($uri[1]) ? $uri[1] : '';\n $desc = 'Multi\nline description';\n\n // Security check\n if (!ctype_alnum($scheme)) {\n throw new Zend_Uri_Exception('Illegal scheme');\n }\n\n $this->var = 0 - self::$st;\n $this->list = list(Array(\"1\"=> 2, 2=>self::ME, 3 => \\Location\\Web\\URI::class));\n\n return [\n 'uri' => $uri,\n 'value' => null,\n ];\n }\n}\n\necho URI::ME ." +- "URI::$st1;\n\n__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere\n```\n\n------" - "# Tables" -- "```\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |" +- "```" +- "Colons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3\n\n| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |" - "| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n```\n\nColons can be used to align columns.\n\n| Tables | Are | Cool |\n| ------------- |:-------------:| -----:|\n| col 3 is | right-aligned | $1600 |\n| col 2 is | centered | $12 |\n| zebra stripes | are neat | $1 |\n\nThere must be at least 3 dashes separating each header cell.\nThe outer pipes (|) are optional, and you don't need to make the\nraw Markdown line up prettily. You can also use inline Markdown.\n\nMarkdown | Less | Pretty\n--- | --- | ---\n*Still* | `renders` | **nicely**\n1 | 2 | 3" - "| First Header | Second Header |\n| ------------- | ------------- |\n| Content Cell | Content Cell |\n| Content Cell | Content Cell |\n\n| Command | Description |\n| --- | --- |\n| git status | List all new or modified files |\n| git diff | Show file differences that haven't been staged |\n\n| Command | Description |\n| --- | --- |\n| `git status` | List all *new or modified* files |\n| `git diff` | Show file differences that **haven't been** staged |\n\n| Left-aligned | Center-aligned | Right-aligned |\n| :--- | :---: | ---: |\n| git status | git status | git status |\n| git diff | git diff | git diff |\n\n| Name | Character |\n| --- | --- |\n| Backtick | ` |\n| Pipe | \\| |\n\n------" - "# Blockquotes\n\n```\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n```\n\n> Blockquotes are very handy in email to emulate reply text.\n> This line is part of the same quote.\n\nQuote break.\n\n> This is a very long line that will still be quoted properly when it wraps. Oh boy let's keep writing to make sure this is long enough to actually wrap for everyone. Oh, you can *put* **Markdown** into a blockquote.\n\n> Blockquotes can also be nested...\n>> ...by using additional greater-than signs right next to each other...\n> > > ...or with spaces between arrows.\n\n------" diff --git a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap index 3d9ca03..6961f9b 100644 --- a/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap +++ b/tests/snapshots/text_splitter_snapshots__tiktoken_markdown_trim@github_flavored.md.snap @@ -4,7 +4,8 @@ expression: chunks input_file: tests/inputs/markdown/github_flavored.md --- - "# Headers" -- "```\n# h1 Heading 8-)\n## h2 Heading" +- "```" +- "# h1 Heading 8-)\n## h2 Heading" - "### h3 Heading\n#### h4 Heading\n##### h5 Heading" - "###### h6 Heading" - "Alternatively, for H1 and H2, an underline-ish style:" @@ -36,7 +37,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "*This is italic text*\n\n_This is italic text_" - "~~Strikethrough~~\n\n------" - "# Lists" -- "```\n1. First ordered list item\n2. Another item" +- "```" +- "1. First ordered list item\n2. Another item" - "⋅⋅* Unordered sub-list.\n1." - "Actual numbers don't matter, just that it's a number" - "⋅⋅1. Ordered sub-list\n4." @@ -83,11 +85,12 @@ input_file: tests/inputs/markdown/github_flavored.md - "GFM line break behaviour, where trailing spaces are not required.)" - "* Unordered list can use asterisks\n- Or minuses" - + Or pluses -- "1. Make my changes\n 1. Fix bug" +- 1. Make my changes +- 1. Fix bug - " 2. Improve formatting\n - Make the headings bigger" - "2. Push my commits to GitHub\n3. Open a pull request" -- " * Describe my changes\n * Mention all the members of my team" -- "* Ask for feedback" +- "* Describe my changes" +- " * Mention all the members of my team\n * Ask for feedback" - "+ Create a list by starting a line with `+`, `-`, or `" - "*`\n+ Sub-lists are made by indenting 2 spaces:" - "- Marker character change forces new list start:" @@ -95,8 +98,9 @@ input_file: tests/inputs/markdown/github_flavored.md - + Facilisis in pretium nisl aliquet - "- Nulla volutpat aliquam velit\n+ Very easy!\n\n------" - "# Task lists" -- "```\n- [x] Finish my changes" -- "- [ ] Push my commits to GitHub\n- [ ] Open a pull request" +- "```" +- "- [x] Finish my changes\n- [ ] Push my commits to GitHub" +- "- [ ] Open a pull request" - "- [x] @mentions, #refs, [links](), **" - "formatting**, and tags supported" - "- [x] list syntax required (any unordered or ordered list supported)" @@ -104,8 +108,9 @@ input_file: tests/inputs/markdown/github_flavored.md - "- [ ] this is an incomplete item\n```" - "- [x] Finish my changes\n- [ ] Push my commits to GitHub" - "- [ ] Open a pull request" -- "- [x] @mentions, #refs, [links](), **" -- "formatting**, and tags supported" +- "-" +- "[x] @mentions, #refs, [links](), **formatting**" +- ", and tags supported" - "- [x] list syntax required (any unordered or ordered list supported)" - "- [ ] this is a complete item" - "- [ ] this is an incomplete item\n\n------" @@ -118,7 +123,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "Let's rename \\*our-new-project\\* to \\*our-old-project" - "\\*.\n\n------" - "# Links" -- "```\n[I'm an inline-style link](https://www.google.com)" +- "```" +- "[I'm an inline-style link](https://www.google.com)" - "[I'm an inline-style link with title](https://www.google.com \"" - "Google's Homepage\")" - "[I'm a reference-style link][Arbitrary case-insensitive reference text]" @@ -140,15 +146,15 @@ input_file: tests/inputs/markdown/github_flavored.md - "[You can use numbers for reference-style link definitions][1]" - "Or leave it empty and use the [link text itself]." - URLs and URLs in angle brackets will automatically get turned into links. -- "http://www.example.com or " -- " and sometimes\nexample.com (but not on Github, for example)." +- "http://www.example.com or and sometimes" +- "example.com (but not on Github, for example)." - Some text to show that the reference links can follow later. - "[arbitrary case-insensitive reference text]: https://www.mozilla.org" - "[1]: http://slashdot.org" - "[link text itself]: http://www.reddit.com\n\n------" - "# Images" -- "```\nHere's our logo (hover to see the title text):" -- "Inline-style:\n![" +- "```" +- "Here's our logo (hover to see the title text):\n\nInline-style:\n![" - "alt text](https://github.com/adam-p/markdown-here/raw/master" - "/src/common/images/icon48.png \"Logo Title Text 1\")\n\nReference-style:" - "![alt text][logo]" @@ -163,25 +169,29 @@ input_file: tests/inputs/markdown/github_flavored.md - "[id]: https://octodex.github.com/images/dojocat.jpg" - "\"The Dojocat\"\n```" - "Here's our logo (hover to see the title text):" -- "Inline-style:\n![" -- "alt text](https://github.com/adam-p/markdown-here/raw/master" -- "/src/common/images/icon48.png \"Logo Title Text 1\")" +- "Inline-style:" +- "![" +- alt text +- "](https://github.com/adam-p/markdown-here/raw/master/src/common" +- "/images/icon48.png \"Logo Title Text 1\")" - "Reference-style:\n![alt text][logo]" - "[logo]: https://github.com/adam-p/markdown-here/raw/master" - "/src/common/images/icon48.png \"Logo Title Text 2\"" - "![Minion](https://octodex.github.com/images/minion.png)" - "![" -- "Stormtroopocat](https://octodex.github.com/images/" -- "stormtroopocat.jpg \"The Stormtroopocat\")" +- Stormtroopocat +- "](https://octodex.github.com/images/stormtroopocat.jpg" +- "\"The Stormtroopocat\")" - "Like links, Images also have a footnote style syntax" - "![Alt text][id]" - "With a reference later in the document defining the URL location:" - "[id]: https://octodex.github.com/images/dojocat.jpg" - "\"The Dojocat\"\n\n------" -- "# [Footnotes](https://github.com/markdown-it/markdown-it-" -- footnote) -- "```\nFootnote 1 link[^first]." -- "Footnote 2 link[^second]." +- "#" +- "[Footnotes](https://github.com/markdown-it/markdown-it-footnote" +- ) +- "```" +- "Footnote 1 link[^first].\n\nFootnote 2 link[^second]." - "Inline footnote^[Text of inline footnote] definition." - "Duplicated footnote reference[^second]." - "[^first]: Footnote **can have markup**\n\n and multiple paragraphs." @@ -194,21 +204,24 @@ input_file: tests/inputs/markdown/github_flavored.md - "# Code and Syntax Highlighting" - "```\nInline `code` has `back-ticks around` it.\n```" - "Inline `code` has `back-ticks around` it." -- "```c#\nusing System.IO.Compression;" -- "#pragma warning disable 414, 3021\n\nnamespace MyApplication\n{" -- " [Obsolete(\"...\")]\n class Program : IInterface\n {" +- "```c#" +- "using System.IO.Compression;\n\n#pragma warning disable 414, 3021" +- "namespace MyApplication\n{\n [Obsolete(\"...\")]" +- " class Program : IInterface\n {" - " public static List JustDoIt(int count)\n {" - "Console.WriteLine($\"Hello {Name}!\");" - "return new List(new int[] { 1, 2," - "3 })\n }\n }\n}\n```" -- "```css\n@font-face {" +- "```css" +- "@font-face {" - "font-family: Chunkfive; src: url('Chunkfive.otf');" - "}\n\nbody, .usertext {" - "color: #F0F0F0; background: #600;" - " font-family: Chunkfive, sans;\n}\n\n@import url(print.css);" - "@media print {\n a[href^=http]::after {" - " content: attr(href)\n }\n}\n```" -- "```javascript\nfunction $initHighlight(block, cls) {\n try {" +- "```javascript" +- "function $initHighlight(block, cls) {\n try {" - "if (cls.search(/\\bno\\-highlight\\b/) != -1)" - "return process(block, true, 0x0F) +" - " ` class=\"${cls}\"`;\n } catch (e) {" @@ -216,10 +229,10 @@ input_file: tests/inputs/markdown/github_flavored.md - for (var i = 0 / 2; i < classes.length; - "i++) {\n if (checkCondition(classes[i]) === undefined)" - " console.log('undefined');\n }\n}\n\nexport $initHighlight;\n```" -- "```php\nrequire_once 'Zend/Uri/Http.php';" -- "namespace Location\\Web;\n\ninterface Factory\n{\n static function _factory();\n}" -- "abstract class URI extends BaseURI implements Factory\n{\n abstract function test();" -- public static $st1 = 1; +- "```php" +- "require_once 'Zend/Uri/Http.php';\n\nnamespace Location\\Web;\n\ninterface Factory" +- "{\n static function _factory();\n}\n\nabstract class URI extends BaseURI implements Factory" +- "{\n abstract function test();\n\n public static $st1 = 1;" - " const ME = \"Yo\";\n var $list = NULL;" - " private $var;\n\n /**\n * Returns a URI\n *" - " * @return URI\n */" @@ -240,7 +253,8 @@ input_file: tests/inputs/markdown/github_flavored.md - "__halt_compiler () ; datahere\ndatahere\ndatahere */\ndatahere" - "```\n\n------" - "# Tables" -- "```\nColons can be used to align columns." +- "```" +- Colons can be used to align columns. - "| Tables | Are | Cool |" - "| ------------- |:-------------:| -----:|" - "| col 3 is | right-aligned | $1600 |" @@ -267,14 +281,14 @@ input_file: tests/inputs/markdown/github_flavored.md - "| Name | Character |\n| --- | --- |" - "| Backtick | ` |\n| Pipe | \\| |\n```" - Colons can be used to align columns. -- "| Tables | Are | Cool |" -- "| ------------- |:-------------:| -----:|" +- "| Tables | Are | Cool" +- "|\n| ------------- |:-------------:| -----:|" - "| col 3 is | right-aligned | $1600 |" - "| col 2 is | centered | $12 |" - "| zebra stripes | are neat | $1 |" - There must be at least 3 dashes separating each header cell. -- "The outer pipes (|) are optional, and you don't need to make" -- the +- "The outer pipes (|) are optional, and you don'" +- t need to make the - raw Markdown line up prettily. You can also use inline Markdown. - "Markdown | Less | Pretty\n--- | --- | ---" - "*Still* | `renders` | **nicely**" @@ -289,14 +303,15 @@ input_file: tests/inputs/markdown/github_flavored.md - "| `git status` | List all *new or modified* files |" - "| `git diff` | Show file differences that **haven't been** staged" - "|" -- "| Left-aligned | Center-aligned | Right-aligned |" -- "| :--- | :---: | ---: |" +- "| Left-aligned | Center-aligned | Right-aligned" +- "|\n| :--- | :---: | ---: |" - "| git status | git status | git status |" - "| git diff | git diff | git diff |" - "| Name | Character |\n| --- | --- |" - "| Backtick | ` |\n| Pipe | \\| |\n\n------" - "# Blockquotes" -- "```\n> Blockquotes are very handy in email to emulate reply text." +- "```" +- "> Blockquotes are very handy in email to emulate reply text." - "> This line is part of the same quote.\n\nQuote break." - "> This is a very long line that will still be quoted properly when it wraps" - "." @@ -306,18 +321,20 @@ input_file: tests/inputs/markdown/github_flavored.md - "> Blockquotes can also be nested..." - ">> ...by using additional greater-than signs right next to each other..." - "> > > ...or with spaces between arrows.\n```" -- "> Blockquotes are very handy in email to emulate reply text." -- "> This line is part of the same quote.\n\nQuote break." -- "> This is a very long line that will still be quoted properly when it wraps" -- ". Oh boy let'" +- ">" +- "Blockquotes are very handy in email to emulate reply text.\n>" +- "This line is part of the same quote.\n\nQuote break." +- ">" +- This is a very long line that will still be quoted properly when it wraps. +- "Oh boy let'" - s keep writing to make sure this is long enough to actually wrap for everyone. - "Oh, you can *put* **Markdown** into a blockquote." -- "> Blockquotes can also be nested..." -- ">" -- "> ...by using additional greater-than signs right next to each other..." -- "> > > ...or with spaces between arrows.\n\n------" +- "> Blockquotes can also be nested...\n>" +- "> ...by using additional greater-than signs right next to each other...\n> >" +- "> ...or with spaces between arrows.\n\n------" - "# Inline HTML" -- "```\n
    \n
    Definition list
    " +- "```" +- "
    \n
    Definition list
    " - "
    Is something people use sometimes.
    " - "
    Markdown in HTML
    " - "
    Does *not* work **very** well." @@ -328,11 +345,13 @@ input_file: tests/inputs/markdown/github_flavored.md - "
    Does *not* work **very** well." - "Use HTML tags.
    \n
    \n\n------" - "# Horizontal Rules" -- "```\nThree or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks" -- "___\n\nUnderscores\n```\n\nThree or more...\n\n---\n\nHyphens" -- "***\n\nAsterisks\n\n___\n\nUnderscores\n\n------" +- "```" +- "Three or more...\n\n---\n\nHyphens\n\n***\n\nAsterisks\n\n___" +- "Underscores\n```\n\nThree or more...\n\n---\n\nHyphens\n\n***" +- "Asterisks\n\n___\n\nUnderscores\n\n------" - "# YouTube Videos" -- "```\n" - "\"IMAGE\n" -- "```\n[![" +- "```" +- "[![" - "IMAGE ALT TEXT HERE](http://img.youtube.com/vi/" - "YOUTUBE_VIDEO_ID_HERE/0.jpg)](http://www.youtube.com" - "/watch?v=YOUTUBE_VIDEO_ID_HERE)\n```" - "[![" -- "IMAGE ALT TEXT HERE](https://upload.wikimedia.org/wikipedia/commons/thumb/e" -- /ef/YouTube_logo_2015.svg/1200px- -- "YouTube_logo_2015.svg.png)](https://www.youtube.com/watch?" +- IMAGE ALT TEXT HERE +- "](https://upload.wikimedia.org/wikipedia/commons/thumb/e/ef/" +- YouTube_logo_2015.svg/1200px-YouTube_logo_2015 +- ".svg.png)](https://www.youtube.com/watch?" - v=ciawICBvQoE)