From 0e26ec53823ce66e879d5fb6748369acb18237f0 Mon Sep 17 00:00:00 2001 From: Miha Zupan Date: Sun, 17 Jul 2022 20:21:19 +0200 Subject: [PATCH] Align Whitespace and Punctuation definitions with CommonMark --- src/Markdig.Tests/TestCharHelper.cs | 92 +++++++++++++ src/Markdig.Tests/TestPipeTable.cs | 2 - src/Markdig/Helpers/CharHelper.cs | 125 +++++++++--------- src/Markdig/Helpers/ThrowHelper.cs | 2 +- .../Parsers/Inlines/EmphasisDescriptor.cs | 2 +- 5 files changed, 157 insertions(+), 66 deletions(-) create mode 100644 src/Markdig.Tests/TestCharHelper.cs diff --git a/src/Markdig.Tests/TestCharHelper.cs b/src/Markdig.Tests/TestCharHelper.cs new file mode 100644 index 000000000..69d8c2a62 --- /dev/null +++ b/src/Markdig.Tests/TestCharHelper.cs @@ -0,0 +1,92 @@ +using System.Collections.Generic; +using System.Globalization; +using Markdig.Helpers; +using NUnit.Framework; + +namespace Markdig.Tests +{ + public class TestCharHelper + { + // An ASCII punctuation character is + // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F), + // :, ;, <, =, >, ?, @ (U+003A–0040), + // [, \, ], ^, _, ` (U+005B–0060), + // {, |, }, or ~ (U+007B–007E). + private static readonly HashSet<char> s_asciiPunctuation = new() + { + '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', + ':', ';', '<', '=', '>', '?', '@', + '[', '\\', ']', '^', '_', '`', + '{', '|', '}', '~' + }; + + // A Unicode punctuation character is an ASCII punctuation character or anything in the general Unicode categories + // Pc, Pd, Pe, Pf, Pi, Po, or Ps. + private static readonly HashSet<UnicodeCategory> s_punctuationCategories = new() + { + UnicodeCategory.ConnectorPunctuation, + UnicodeCategory.DashPunctuation, + UnicodeCategory.ClosePunctuation, + UnicodeCategory.FinalQuotePunctuation, + UnicodeCategory.InitialQuotePunctuation, + UnicodeCategory.OtherPunctuation, + UnicodeCategory.OpenPunctuation + }; + + private static bool ExpectedIsPunctuation(char c) + { + return c <= 127 + ? 
s_asciiPunctuation.Contains(c) + : s_punctuationCategories.Contains(CharUnicodeInfo.GetUnicodeCategory(c)); + } + + private static bool ExpectedIsWhitespace(char c) + { + // A Unicode whitespace character is any code point in the Unicode Zs general category, + // or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D). + return c == '\t' || c == '\n' || c == '\u000C' || c == '\r' || + CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator; + } + + [Test] + public void IsWhitespace() + { + for (int i = char.MinValue; i <= char.MaxValue; i++) + { + char c = (char)i; + + Assert.AreEqual(ExpectedIsWhitespace(c), CharHelper.IsWhitespace(c)); + } + } + + [Test] + public void CheckUnicodeCategory() + { + for (int i = char.MinValue; i <= char.MaxValue; i++) + { + char c = (char)i; + + bool expectedSpace = c == 0 || ExpectedIsWhitespace(c); + bool expectedPunctuation = c == 0 || ExpectedIsPunctuation(c); + + CharHelper.CheckUnicodeCategory(c, out bool spaceActual, out bool punctuationActual); + + Assert.AreEqual(expectedSpace, spaceActual); + Assert.AreEqual(expectedPunctuation, punctuationActual); + } + } + + [Test] + public void IsSpaceOrPunctuation() + { + for (int i = char.MinValue; i <= char.MaxValue; i++) + { + char c = (char)i; + + bool expected = c == 0 || ExpectedIsWhitespace(c) || ExpectedIsPunctuation(c); + + Assert.AreEqual(expected, CharHelper.IsSpaceOrPunctuation(c)); + } + } + } +} diff --git a/src/Markdig.Tests/TestPipeTable.cs b/src/Markdig.Tests/TestPipeTable.cs index ad06390c4..5b683927a 100644 --- a/src/Markdig.Tests/TestPipeTable.cs +++ b/src/Markdig.Tests/TestPipeTable.cs @@ -10,9 +10,7 @@ public sealed class TestPipeTable { [TestCase("| S | T |\r\n|---|---| \r\n| G | H |")] [TestCase("| S | T |\r\n|---|---|\t\r\n| G | H |")] - [TestCase("| S | T |\r\n|---|---|\v\r\n| G | H |")] [TestCase("| S | T |\r\n|---|---|\f\r\n| G | H |")] - [TestCase("| S | T |\r\n|---|---|\f\v\t \r\n| G | H |")] [TestCase("| S | 
\r\n|---|\r\n| G |\r\n\r\n| D | D |\r\n| ---| ---| \r\n| V | V |", 2)] public void TestTableBug(string markdown, int tableCount = 1) { diff --git a/src/Markdig/Helpers/CharHelper.cs b/src/Markdig/Helpers/CharHelper.cs index 690477315..fab96848a 100644 --- a/src/Markdig/Helpers/CharHelper.cs +++ b/src/Markdig/Helpers/CharHelper.cs @@ -53,7 +53,7 @@ public static void CheckOpenCloseDelimiter(char pc, char c, bool enableWithinWor // A right-flanking delimiter run is a delimiter run that is // (1) not preceded by Unicode whitespace, and either - // (1a) not preceded by a punctuation character, or + // (2a) not preceded by a punctuation character, or // (2b) preceded by a punctuation character and followed by Unicode whitespace or a punctuation character. // For purposes of this definition, the beginning and the end of the line count as Unicode whitespace. canClose = !prevIsWhiteSpace && @@ -144,9 +144,37 @@ public static bool Contains(this char[] charList, char c) [MethodImpl(MethodImplOptions.AggressiveInlining)] public static bool IsWhitespace(this char c) { - // 2.1 Characters and lines - // A whitespace character is a space(U + 0020), tab(U + 0009), newline(U + 000A), line tabulation (U + 000B), form feed (U + 000C), or carriage return (U + 000D). - return c <= ' ' && (c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'); + // 2.1 Characters and lines + // A Unicode whitespace character is any code point in the Unicode Zs general category, + // or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D). 
+ if (c <= ' ') + { + const long Mask = + (1L << ' ') | + (1L << '\t') | + (1L << '\n') | + (1L << '\f') | + (1L << '\r'); + + return (Mask & (1L << c)) != 0; + } + + return c >= '\u00A0' && IsWhitespaceRare(c); + + static bool IsWhitespaceRare(char c) + { + // return CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator; + + if (c < 5760) + { + return c == '\u00A0'; + } + else + { + return c <= 12288 && + (c == 5760 || IsInInclusiveRange(c, 8192, 8202) || c == 8239 || c == 8287 || c == 12288); + } + } } [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -171,46 +199,47 @@ public static bool IsWhiteSpaceOrZero(this char c) // Check if a char is a space or a punctuation public static void CheckUnicodeCategory(this char c, out bool space, out bool punctuation) { - // Credits: code from CommonMark.NET - // Copyright (c) 2014, Kārlis Gaņģis All rights reserved. - // See license for details: https://github.com/Knagis/CommonMark.NET/blob/master/LICENSE.md - if (c <= 'ÿ') + if (IsWhitespace(c)) + { + space = true; + punctuation = false; + } + else if (c <= 127) { - space = c == '\0' || c == ' ' || (c >= '\t' && c <= '\r') || c == '\u00a0' || c == '\u0085'; - punctuation = c == '\0' || (c >= 33 && c <= 47) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126); + space = c == '\0'; + punctuation = c == '\0' || IsAsciiPunctuation(c); } else { - var category = CharUnicodeInfo.GetUnicodeCategory(c); - space = category == UnicodeCategory.SpaceSeparator - || category == UnicodeCategory.LineSeparator - || category == UnicodeCategory.ParagraphSeparator; - punctuation = !space && - (category == UnicodeCategory.ConnectorPunctuation + // A Unicode punctuation character is an ASCII punctuation character + // or anything in the general Unicode categories Pc, Pd, Pe, Pf, Pi, Po, or Ps. 
+ space = false; + UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(c); + punctuation = category == UnicodeCategory.ConnectorPunctuation || category == UnicodeCategory.DashPunctuation || category == UnicodeCategory.OpenPunctuation || category == UnicodeCategory.ClosePunctuation || category == UnicodeCategory.InitialQuotePunctuation || category == UnicodeCategory.FinalQuotePunctuation - || category == UnicodeCategory.OtherPunctuation); + || category == UnicodeCategory.OtherPunctuation; } } // Same as CheckUnicodeCategory internal static bool IsSpaceOrPunctuation(this char c) { - if (c <= 'ÿ') + if (IsWhitespace(c)) { - return c == '\0' || c == ' ' || (c >= '\t' && c <= '\r') || c == '\u00a0' || c == '\u0085' || - (c >= 33 && c <= 47 && c != 38) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126); + return true; + } + else if (c <= 127) + { + return c == '\0' || IsAsciiPunctuation(c); } else { var category = CharUnicodeInfo.GetUnicodeCategory(c); - return category == UnicodeCategory.SpaceSeparator - || category == UnicodeCategory.LineSeparator - || category == UnicodeCategory.ParagraphSeparator - || category == UnicodeCategory.ConnectorPunctuation + return category == UnicodeCategory.ConnectorPunctuation || category == UnicodeCategory.DashPunctuation || category == UnicodeCategory.OpenPunctuation || category == UnicodeCategory.ClosePunctuation @@ -289,44 +318,16 @@ public static bool IsDigit(this char c) public static bool IsAsciiPunctuation(this char c) { // 2.1 Characters and lines - // An ASCII punctuation character is !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~. 
- switch (c) - { - case '!': - case '"': - case '#': - case '$': - case '%': - case '&': - case '\'': - case '(': - case ')': - case '*': - case '+': - case ',': - case '-': - case '.': - case '/': - case ':': - case ';': - case '<': - case '=': - case '>': - case '?': - case '@': - case '[': - case '\\': - case ']': - case '^': - case '_': - case '`': - case '{': - case '|': - case '}': - case '~': - return true; - } - return false; + // An ASCII punctuation character is + // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F), + // :, ;, <, =, >, ?, @ (U+003A–0040), + // [, \, ], ^, _, ` (U+005B–0060), + // {, |, }, or ~ (U+007B–007E). + return c <= 127 && ( + IsInInclusiveRange(c, 33, 47) || + IsInInclusiveRange(c, 58, 64) || + IsInInclusiveRange(c, 91, 96) || + IsInInclusiveRange(c, 123, 126)); } [MethodImpl(MethodImplOptions.AggressiveInlining)] diff --git a/src/Markdig/Helpers/ThrowHelper.cs b/src/Markdig/Helpers/ThrowHelper.cs index 6fab635de..0e088bc4a 100644 --- a/src/Markdig/Helpers/ThrowHelper.cs +++ b/src/Markdig/Helpers/ThrowHelper.cs @@ -55,7 +55,7 @@ internal static class ThrowHelper public static void ArgumentOutOfRangeException(string paramName) => throw new ArgumentOutOfRangeException(paramName); [DoesNotReturn] - public static void ArgumentOutOfRangeException(string message, string paramName) => throw new ArgumentOutOfRangeException(message, paramName); + public static void ArgumentOutOfRangeException(string message, string paramName) => throw new ArgumentOutOfRangeException(paramName, message); [DoesNotReturn] public static void ArgumentOutOfRangeException_index() => throw new ArgumentOutOfRangeException("index"); diff --git a/src/Markdig/Parsers/Inlines/EmphasisDescriptor.cs b/src/Markdig/Parsers/Inlines/EmphasisDescriptor.cs index dcf927112..5bf02152e 100644 --- a/src/Markdig/Parsers/Inlines/EmphasisDescriptor.cs +++ b/src/Markdig/Parsers/Inlines/EmphasisDescriptor.cs @@ -33,7 +33,7 @@ public EmphasisDescriptor(char character, int 
minimumCount, int maximumCount, bo /// <summary> /// The character of this emphasis. /// </summary> - public char Character { get; } + public char Character { get; } /// <summary> /// The minimum number of character this emphasis is expected to have (must be >=1)