From 0e26ec53823ce66e879d5fb6748369acb18237f0 Mon Sep 17 00:00:00 2001
From: Miha Zupan <mihazupan.zupan1@gmail.com>
Date: Sun, 17 Jul 2022 20:21:19 +0200
Subject: [PATCH] Align Whitespace and Punctuation definitions with CommonMark

---
 src/Markdig.Tests/TestCharHelper.cs           |  92 +++++++++++++
 src/Markdig.Tests/TestPipeTable.cs            |   2 -
 src/Markdig/Helpers/CharHelper.cs             | 125 +++++++++---------
 src/Markdig/Helpers/ThrowHelper.cs            |   2 +-
 .../Parsers/Inlines/EmphasisDescriptor.cs     |   2 +-
 5 files changed, 157 insertions(+), 66 deletions(-)
 create mode 100644 src/Markdig.Tests/TestCharHelper.cs
diff --git a/src/Markdig.Tests/TestCharHelper.cs b/src/Markdig.Tests/TestCharHelper.cs
new file mode 100644
index 000000000..69d8c2a62
--- /dev/null
+++ b/src/Markdig.Tests/TestCharHelper.cs
@@ -0,0 +1,92 @@
+using System.Collections.Generic;
+using System.Globalization;
+using Markdig.Helpers;
+using NUnit.Framework;
+
+namespace Markdig.Tests
+{
+    public class TestCharHelper
+    {
+        // An ASCII punctuation character is
+        // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F),
+        // :, ;, <, =, >, ?, @ (U+003A–0040),
+        // [, \, ], ^, _, ` (U+005B–0060),
+        // {, |, }, or ~ (U+007B–007E).
+        private static readonly HashSet<char> s_asciiPunctuation = new()
+        {
+            '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
+            ':', ';', '<', '=', '>', '?', '@',
+            '[', '\\', ']', '^', '_', '`',
+            '{', '|', '}', '~'
+        };
+
+        // A Unicode punctuation character is an ASCII punctuation character or anything in the general Unicode categories
+        // Pc, Pd, Pe, Pf, Pi, Po, or Ps.
+        private static readonly HashSet<UnicodeCategory> s_punctuationCategories = new()
+        {
+            UnicodeCategory.ConnectorPunctuation,
+            UnicodeCategory.DashPunctuation,
+            UnicodeCategory.ClosePunctuation,
+            UnicodeCategory.FinalQuotePunctuation,
+            UnicodeCategory.InitialQuotePunctuation,
+            UnicodeCategory.OtherPunctuation,
+            UnicodeCategory.OpenPunctuation
+        };
+
+        // Reference punctuation check: the exact ASCII set for c <= 127, otherwise the Unicode categories listed above.
+        private static bool ExpectedIsPunctuation(char c)
+        {
+            return c <= 127
+                ? s_asciiPunctuation.Contains(c)
+                : s_punctuationCategories.Contains(CharUnicodeInfo.GetUnicodeCategory(c));
+        }
+
+        // Reference whitespace check: tab, line feed, form feed, carriage return, or Unicode category Zs.
+        private static bool ExpectedIsWhitespace(char c)
+        {
+            // A Unicode whitespace character is any code point in the Unicode Zs general category,
+            // or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D).
+            return c == '\t' || c == '\n' || c == '\u000C' || c == '\r' ||
+                CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator;
+        }
+
+        // Compares CharHelper.IsWhitespace with the reference for every char value; the message names the failing one.
+        [Test]
+        public void IsWhitespace()
+        {
+            for (int i = char.MinValue; i <= char.MaxValue; i++)
+            {
+                char c = (char)i;
+                Assert.AreEqual(ExpectedIsWhitespace(c), CharHelper.IsWhitespace(c), $"U+{i:X4}");
+            }
+        }
+
+        // Checks both out values for every char value; '\0' is expected to be reported as both space and punctuation.
+        [Test]
+        public void CheckUnicodeCategory()
+        {
+            for (int i = char.MinValue; i <= char.MaxValue; i++)
+            {
+                char c = (char)i;
+                bool expectedSpace = c == 0 || ExpectedIsWhitespace(c);
+                bool expectedPunctuation = c == 0 || ExpectedIsPunctuation(c);
+                CharHelper.CheckUnicodeCategory(c, out bool spaceActual, out bool punctuationActual);
+
+                Assert.AreEqual(expectedSpace, spaceActual, $"space for U+{i:X4}");
+                Assert.AreEqual(expectedPunctuation, punctuationActual, $"punctuation for U+{i:X4}");
+            }
+        }
+
+        // Expects true for '\0' and for anything either reference check accepts; the message names the failing char.
+        [Test]
+        public void IsSpaceOrPunctuation()
+        {
+            for (int i = char.MinValue; i <= char.MaxValue; i++)
+            {
+                char c = (char)i;
+                bool expected = c == 0 || ExpectedIsWhitespace(c) || ExpectedIsPunctuation(c);
+                Assert.AreEqual(expected, CharHelper.IsSpaceOrPunctuation(c), $"U+{i:X4}");
+            }
+        }
+    }
+}
diff --git a/src/Markdig.Tests/TestPipeTable.cs b/src/Markdig.Tests/TestPipeTable.cs
index ad06390c4..5b683927a 100644
--- a/src/Markdig.Tests/TestPipeTable.cs
+++ b/src/Markdig.Tests/TestPipeTable.cs
@@ -10,9 +10,7 @@ public sealed class TestPipeTable
     {
         [TestCase("| S | T |\r\n|---|---| \r\n| G | H |")]
         [TestCase("| S | T |\r\n|---|---|\t\r\n| G | H |")]
-        [TestCase("| S | T |\r\n|---|---|\v\r\n| G | H |")]
         [TestCase("| S | T |\r\n|---|---|\f\r\n| G | H |")]
-        [TestCase("| S | T |\r\n|---|---|\f\v\t \r\n| G | H |")]
         [TestCase("| S | \r\n|---|\r\n| G |\r\n\r\n| D | D |\r\n| ---| ---| \r\n| V | V |", 2)]
         public void TestTableBug(string markdown, int tableCount = 1)
         {
diff --git a/src/Markdig/Helpers/CharHelper.cs b/src/Markdig/Helpers/CharHelper.cs
index 690477315..fab96848a 100644
--- a/src/Markdig/Helpers/CharHelper.cs
+++ b/src/Markdig/Helpers/CharHelper.cs
@@ -53,7 +53,7 @@ public static void CheckOpenCloseDelimiter(char pc, char c, bool enableWithinWor
 
             // A right-flanking delimiter run is a delimiter run that is
             // (1) not preceded by Unicode whitespace, and either
-            // (1a) not preceded by a punctuation character, or
+            // (2a) not preceded by a punctuation character, or
             // (2b) preceded by a punctuation character and followed by Unicode whitespace or a punctuation character.
             // For purposes of this definition, the beginning and the end of the line count as Unicode whitespace.
             canClose = !prevIsWhiteSpace &&
@@ -144,9 +144,37 @@ public static bool Contains(this char[] charList, char c)
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static bool IsWhitespace(this char c)
         {
-            // 2.1 Characters and lines 
-            // A whitespace character is a space(U + 0020), tab(U + 0009), newline(U + 000A), line tabulation (U + 000B), form feed (U + 000C), or carriage return (U + 000D).
-            return c <= ' ' && (c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r');
+            // 2.1 Characters and lines - returns true when c is a Unicode whitespace character as defined below.
+            // A Unicode whitespace character is any code point in the Unicode Zs general category,
+            // or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D).
+            if (c <= ' ') // ASCII fast path: every candidate here is <= U+0020, so one 64-bit mask covers them all
+            {
+                const long Mask =
+                    (1L << ' ') |
+                    (1L << '\t') |
+                    (1L << '\n') |
+                    (1L << '\f') |
+                    (1L << '\r');
+
+                return (Mask & (1L << c)) != 0; // bit c of Mask is set only for the five characters above
+            }
+
+            return c >= '\u00A0' && IsWhitespaceRare(c); // U+00A0 is the lowest value IsWhitespaceRare accepts
+
+            static bool IsWhitespaceRare(char c)
+            {
+                // Hand-expanded lookup meant to match: CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator
+
+                if (c < 5760) // 5760 = U+1680
+                {
+                    return c == '\u00A0';
+                }
+                else
+                {
+                    return c <= 12288 && // 12288 = U+3000, the highest value accepted
+                        (c == 5760 || IsInInclusiveRange(c, 8192, 8202) || c == 8239 || c == 8287 || c == 12288); // U+1680, U+2000..U+200A, U+202F, U+205F, U+3000
+                }
+            }
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -171,46 +199,47 @@ public static bool IsWhiteSpaceOrZero(this char c)
         // Check if a char is a space or a punctuation
         public static void CheckUnicodeCategory(this char c, out bool space, out bool punctuation)
         {
-            // Credits: code from CommonMark.NET
-            // Copyright (c) 2014, Kārlis Gaņģis All rights reserved. 
-            // See license for details:  https://github.com/Knagis/CommonMark.NET/blob/master/LICENSE.md
-            if (c <= 'ÿ')
+            if (IsWhitespace(c)) // anything IsWhitespace accepts is reported as space only
+            {
+                space = true;
+                punctuation = false;
+            }
+            else if (c <= 127) // remaining ASCII characters
             {
-                space = c == '\0' || c == ' ' || (c >= '\t' && c <= '\r') || c == '\u00a0' || c == '\u0085';
-                punctuation = c == '\0' || (c >= 33 && c <= 47) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126);
+                space = c == '\0'; // '\0' is reported as both space and punctuation
+                punctuation = c == '\0' || IsAsciiPunctuation(c);
             }
             else
             {
-                var category = CharUnicodeInfo.GetUnicodeCategory(c);
-                space = category == UnicodeCategory.SpaceSeparator
-                    || category == UnicodeCategory.LineSeparator
-                    || category == UnicodeCategory.ParagraphSeparator;
-                punctuation = !space &&
-                    (category == UnicodeCategory.ConnectorPunctuation
+                // A Unicode punctuation character is an ASCII punctuation character
+                // or anything in the general Unicode categories Pc, Pd, Pe, Pf, Pi, Po, or Ps.
+                space = false; // non-ASCII whitespace was already handled by the IsWhitespace branch
+                UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(c);
+                punctuation = category == UnicodeCategory.ConnectorPunctuation
                     || category == UnicodeCategory.DashPunctuation
                     || category == UnicodeCategory.OpenPunctuation
                     || category == UnicodeCategory.ClosePunctuation
                     || category == UnicodeCategory.InitialQuotePunctuation
                     || category == UnicodeCategory.FinalQuotePunctuation
-                    || category == UnicodeCategory.OtherPunctuation);
+                    || category == UnicodeCategory.OtherPunctuation;
             }
         }
 
         // Same as CheckUnicodeCategory
         internal static bool IsSpaceOrPunctuation(this char c)
         {
-            if (c <= 'ÿ')
+            if (IsWhitespace(c))
             {
-                return c == '\0' || c == ' ' || (c >= '\t' && c <= '\r') || c == '\u00a0' || c == '\u0085' ||
-                    (c >= 33 && c <= 47 && c != 38) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126);
+                return true;
+            }
+            else if (c <= 127)
+            {
+                return c == '\0' || IsAsciiPunctuation(c);
             }
             else
             {
                 var category = CharUnicodeInfo.GetUnicodeCategory(c);
-                return category == UnicodeCategory.SpaceSeparator
-                    || category == UnicodeCategory.LineSeparator
-                    || category == UnicodeCategory.ParagraphSeparator
-                    || category == UnicodeCategory.ConnectorPunctuation
+                return category == UnicodeCategory.ConnectorPunctuation
                     || category == UnicodeCategory.DashPunctuation
                     || category == UnicodeCategory.OpenPunctuation
                     || category == UnicodeCategory.ClosePunctuation
@@ -289,44 +318,16 @@ public static bool IsDigit(this char c)
         public static bool IsAsciiPunctuation(this char c)
         {
             // 2.1 Characters and lines 
-            // An ASCII punctuation character is !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~.
-            switch (c)
-            {
-                case '!':
-                case '"':
-                case '#':
-                case '$':
-                case '%':
-                case '&':
-                case '\'':
-                case '(':
-                case ')':
-                case '*':
-                case '+':
-                case ',':
-                case '-':
-                case '.':
-                case '/':
-                case ':':
-                case ';':
-                case '<':
-                case '=':
-                case '>':
-                case '?':
-                case '@':
-                case '[':
-                case '\\':
-                case ']':
-                case '^':
-                case '_':
-                case '`':
-                case '{':
-                case '|':
-                case '}':
-                case '~':
-                    return true;
-            }
-            return false;
+            // An ASCII punctuation character is
+            // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F),
+            // :, ;, <, =, >, ?, @ (U+003A–0040),
+            // [, \, ], ^, _, ` (U+005B–0060),
+            // {, |, }, or ~ (U+007B–007E).
+            return c <= 127 && (
+                IsInInclusiveRange(c, 33, 47) || // '!' .. '/'
+                IsInInclusiveRange(c, 58, 64) || // ':' .. '@'
+                IsInInclusiveRange(c, 91, 96) || // '[' .. '`'
+                IsInInclusiveRange(c, 123, 126)); // '{' .. '~'
         }
 
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
diff --git a/src/Markdig/Helpers/ThrowHelper.cs b/src/Markdig/Helpers/ThrowHelper.cs
index 6fab635de..0e088bc4a 100644
--- a/src/Markdig/Helpers/ThrowHelper.cs
+++ b/src/Markdig/Helpers/ThrowHelper.cs
@@ -55,7 +55,7 @@ internal static class ThrowHelper
         public static void ArgumentOutOfRangeException(string paramName) => throw new ArgumentOutOfRangeException(paramName);
 
         [DoesNotReturn]
-        public static void ArgumentOutOfRangeException(string message, string paramName) => throw new ArgumentOutOfRangeException(message, paramName);
+        public static void ArgumentOutOfRangeException(string message, string paramName) => throw new ArgumentOutOfRangeException(paramName, message); // the BCL constructor takes (paramName, message), the reverse of this helper's parameter order
 
         [DoesNotReturn]
         public static void ArgumentOutOfRangeException_index() => throw new ArgumentOutOfRangeException("index");
diff --git a/src/Markdig/Parsers/Inlines/EmphasisDescriptor.cs b/src/Markdig/Parsers/Inlines/EmphasisDescriptor.cs
index dcf927112..5bf02152e 100644
--- a/src/Markdig/Parsers/Inlines/EmphasisDescriptor.cs
+++ b/src/Markdig/Parsers/Inlines/EmphasisDescriptor.cs
@@ -33,7 +33,7 @@ public EmphasisDescriptor(char character, int minimumCount, int maximumCount, bo
         /// <summary>
         /// The character of this emphasis.
         /// </summary>
-        public  char Character { get; }
+        public char Character { get; }
 
         /// <summary>
         /// The minimum number of character this emphasis is expected to have (must be >=1)