Skip to content

Commit

Permalink
Merge pull request #649 from MihaZupan/commonmark-whitespace-punctuation
Browse files Browse the repository at this point in the history
Align Whitespace and Punctuation definitions with CommonMark
  • Loading branch information
xoofx authored Aug 12, 2022
2 parents 1f71520 + 0e26ec5 commit bce4b70
Show file tree
Hide file tree
Showing 5 changed files with 157 additions and 66 deletions.
92 changes: 92 additions & 0 deletions src/Markdig.Tests/TestCharHelper.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
using System.Collections.Generic;
using System.Globalization;
using Markdig.Helpers;
using NUnit.Framework;

namespace Markdig.Tests
{
    /// <summary>
    /// Exhaustive tests for the whitespace / punctuation classification helpers in <see cref="CharHelper"/>.
    /// Every UTF-16 code unit is compared against reference predicates written directly from the
    /// CommonMark definitions quoted in the comments below.
    /// </summary>
    public class TestCharHelper
    {
        // An ASCII punctuation character is
        // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F),
        // :, ;, <, =, >, ?, @ (U+003A–0040),
        // [, \, ], ^, _, ` (U+005B–0060),
        // {, |, }, or ~ (U+007B–007E).
        private static readonly HashSet<char> s_asciiPunctuation = new()
        {
            '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
            ':', ';', '<', '=', '>', '?', '@',
            '[', '\\', ']', '^', '_', '`',
            '{', '|', '}', '~'
        };

        // A Unicode punctuation character is an ASCII punctuation character or anything in the general Unicode categories
        // Pc, Pd, Pe, Pf, Pi, Po, or Ps.
        private static readonly HashSet<UnicodeCategory> s_punctuationCategories = new()
        {
            UnicodeCategory.ConnectorPunctuation,
            UnicodeCategory.DashPunctuation,
            UnicodeCategory.ClosePunctuation,
            UnicodeCategory.FinalQuotePunctuation,
            UnicodeCategory.InitialQuotePunctuation,
            UnicodeCategory.OtherPunctuation,
            UnicodeCategory.OpenPunctuation
        };

        /// <summary>
        /// Reference punctuation predicate: the explicit ASCII list for code units up to 127,
        /// the Unicode general category for everything above.
        /// </summary>
        private static bool ExpectedIsPunctuation(char c)
        {
            return c <= 127
                ? s_asciiPunctuation.Contains(c)
                : s_punctuationCategories.Contains(CharUnicodeInfo.GetUnicodeCategory(c));
        }

        /// <summary>
        /// Reference whitespace predicate.
        /// </summary>
        private static bool ExpectedIsWhitespace(char c)
        {
            // A Unicode whitespace character is any code point in the Unicode Zs general category,
            // or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D).
            return c == '\t' || c == '\n' || c == '\u000C' || c == '\r' ||
                CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator;
        }

        /// <summary>
        /// Formats a code unit as <c>U+XXXX</c> so that a failing assertion identifies the offending character.
        /// </summary>
        private static string Describe(char c) => $"U+{(int)c:X4}";

        [Test]
        public void IsWhitespace()
        {
            for (int i = char.MinValue; i <= char.MaxValue; i++)
            {
                char c = (char)i;

                Assert.AreEqual(ExpectedIsWhitespace(c), CharHelper.IsWhitespace(c), Describe(c));
            }
        }

        [Test]
        public void CheckUnicodeCategory()
        {
            for (int i = char.MinValue; i <= char.MaxValue; i++)
            {
                char c = (char)i;

                // The test expects U+0000 to be reported as both space and punctuation.
                bool expectedSpace = c == 0 || ExpectedIsWhitespace(c);
                bool expectedPunctuation = c == 0 || ExpectedIsPunctuation(c);

                CharHelper.CheckUnicodeCategory(c, out bool spaceActual, out bool punctuationActual);

                Assert.AreEqual(expectedSpace, spaceActual, "space: " + Describe(c));
                Assert.AreEqual(expectedPunctuation, punctuationActual, "punctuation: " + Describe(c));
            }
        }

        [Test]
        public void IsSpaceOrPunctuation()
        {
            for (int i = char.MinValue; i <= char.MaxValue; i++)
            {
                char c = (char)i;

                // U+0000 is expected to count as space-or-punctuation, matching CheckUnicodeCategory.
                bool expected = c == 0 || ExpectedIsWhitespace(c) || ExpectedIsPunctuation(c);

                Assert.AreEqual(expected, CharHelper.IsSpaceOrPunctuation(c), Describe(c));
            }
        }
    }
}
2 changes: 0 additions & 2 deletions src/Markdig.Tests/TestPipeTable.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ public sealed class TestPipeTable
{
[TestCase("| S | T |\r\n|---|---| \r\n| G | H |")]
[TestCase("| S | T |\r\n|---|---|\t\r\n| G | H |")]
[TestCase("| S | T |\r\n|---|---|\v\r\n| G | H |")]
[TestCase("| S | T |\r\n|---|---|\f\r\n| G | H |")]
[TestCase("| S | T |\r\n|---|---|\f\v\t \r\n| G | H |")]
[TestCase("| S | \r\n|---|\r\n| G |\r\n\r\n| D | D |\r\n| ---| ---| \r\n| V | V |", 2)]
public void TestTableBug(string markdown, int tableCount = 1)
{
Expand Down
125 changes: 63 additions & 62 deletions src/Markdig/Helpers/CharHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public static void CheckOpenCloseDelimiter(char pc, char c, bool enableWithinWor

// A right-flanking delimiter run is a delimiter run that is
// (1) not preceded by Unicode whitespace, and either
// (1a) not preceded by a punctuation character, or
// (2a) not preceded by a punctuation character, or
// (2b) preceded by a punctuation character and followed by Unicode whitespace or a punctuation character.
// For purposes of this definition, the beginning and the end of the line count as Unicode whitespace.
canClose = !prevIsWhiteSpace &&
Expand Down Expand Up @@ -144,9 +144,37 @@ public static bool Contains(this char[] charList, char c)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsWhitespace(this char c)
{
// 2.1 Characters and lines
// A whitespace character is a space(U + 0020), tab(U + 0009), newline(U + 000A), line tabulation (U + 000B), form feed (U + 000C), or carriage return (U + 000D).
return c <= ' ' && (c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r');
// 2.1 Characters and lines
// A Unicode whitespace character is any code point in the Unicode Zs general category,
// or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D).
if (c <= ' ')
{
const long Mask =
(1L << ' ') |
(1L << '\t') |
(1L << '\n') |
(1L << '\f') |
(1L << '\r');

return (Mask & (1L << c)) != 0;
}

return c >= '\u00A0' && IsWhitespaceRare(c);

static bool IsWhitespaceRare(char c)
{
// return CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator;

if (c < 5760)
{
return c == '\u00A0';
}
else
{
return c <= 12288 &&
(c == 5760 || IsInInclusiveRange(c, 8192, 8202) || c == 8239 || c == 8287 || c == 12288);
}
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand All @@ -171,46 +199,47 @@ public static bool IsWhiteSpaceOrZero(this char c)
// Check if a char is a space or a punctuation
public static void CheckUnicodeCategory(this char c, out bool space, out bool punctuation)
{
// Credits: code from CommonMark.NET
// Copyright (c) 2014, Kārlis Gaņģis All rights reserved.
// See license for details: https://github.com/Knagis/CommonMark.NET/blob/master/LICENSE.md
if (c <= 'ÿ')
if (IsWhitespace(c))
{
space = true;
punctuation = false;
}
else if (c <= 127)
{
space = c == '\0' || c == ' ' || (c >= '\t' && c <= '\r') || c == '\u00a0' || c == '\u0085';
punctuation = c == '\0' || (c >= 33 && c <= 47) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126);
space = c == '\0';
punctuation = c == '\0' || IsAsciiPunctuation(c);
}
else
{
var category = CharUnicodeInfo.GetUnicodeCategory(c);
space = category == UnicodeCategory.SpaceSeparator
|| category == UnicodeCategory.LineSeparator
|| category == UnicodeCategory.ParagraphSeparator;
punctuation = !space &&
(category == UnicodeCategory.ConnectorPunctuation
// A Unicode punctuation character is an ASCII punctuation character
// or anything in the general Unicode categories Pc, Pd, Pe, Pf, Pi, Po, or Ps.
space = false;
UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(c);
punctuation = category == UnicodeCategory.ConnectorPunctuation
|| category == UnicodeCategory.DashPunctuation
|| category == UnicodeCategory.OpenPunctuation
|| category == UnicodeCategory.ClosePunctuation
|| category == UnicodeCategory.InitialQuotePunctuation
|| category == UnicodeCategory.FinalQuotePunctuation
|| category == UnicodeCategory.OtherPunctuation);
|| category == UnicodeCategory.OtherPunctuation;
}
}

// Same as CheckUnicodeCategory
internal static bool IsSpaceOrPunctuation(this char c)
{
if (c <= 'ÿ')
if (IsWhitespace(c))
{
return c == '\0' || c == ' ' || (c >= '\t' && c <= '\r') || c == '\u00a0' || c == '\u0085' ||
(c >= 33 && c <= 47 && c != 38) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126);
return true;
}
else if (c <= 127)
{
return c == '\0' || IsAsciiPunctuation(c);
}
else
{
var category = CharUnicodeInfo.GetUnicodeCategory(c);
return category == UnicodeCategory.SpaceSeparator
|| category == UnicodeCategory.LineSeparator
|| category == UnicodeCategory.ParagraphSeparator
|| category == UnicodeCategory.ConnectorPunctuation
return category == UnicodeCategory.ConnectorPunctuation
|| category == UnicodeCategory.DashPunctuation
|| category == UnicodeCategory.OpenPunctuation
|| category == UnicodeCategory.ClosePunctuation
Expand Down Expand Up @@ -289,44 +318,16 @@ public static bool IsDigit(this char c)
public static bool IsAsciiPunctuation(this char c)
{
// 2.1 Characters and lines
// An ASCII punctuation character is !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~.
switch (c)
{
case '!':
case '"':
case '#':
case '$':
case '%':
case '&':
case '\'':
case '(':
case ')':
case '*':
case '+':
case ',':
case '-':
case '.':
case '/':
case ':':
case ';':
case '<':
case '=':
case '>':
case '?':
case '@':
case '[':
case '\\':
case ']':
case '^':
case '_':
case '`':
case '{':
case '|':
case '}':
case '~':
return true;
}
return false;
// An ASCII punctuation character is
// !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F),
// :, ;, <, =, >, ?, @ (U+003A–0040),
// [, \, ], ^, _, ` (U+005B–0060),
// {, |, }, or ~ (U+007B–007E).
return c <= 127 && (
IsInInclusiveRange(c, 33, 47) ||
IsInInclusiveRange(c, 58, 64) ||
IsInInclusiveRange(c, 91, 96) ||
IsInInclusiveRange(c, 123, 126));
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand Down
2 changes: 1 addition & 1 deletion src/Markdig/Helpers/ThrowHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ internal static class ThrowHelper
public static void ArgumentOutOfRangeException(string paramName) => throw new ArgumentOutOfRangeException(paramName);

[DoesNotReturn]
public static void ArgumentOutOfRangeException(string message, string paramName) => throw new ArgumentOutOfRangeException(message, paramName);
public static void ArgumentOutOfRangeException(string message, string paramName) => throw new ArgumentOutOfRangeException(paramName, message);

[DoesNotReturn]
public static void ArgumentOutOfRangeException_index() => throw new ArgumentOutOfRangeException("index");
Expand Down
2 changes: 1 addition & 1 deletion src/Markdig/Parsers/Inlines/EmphasisDescriptor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public EmphasisDescriptor(char character, int minimumCount, int maximumCount, bo
/// <summary>
/// The character of this emphasis.
/// </summary>
public char Character { get; }
public char Character { get; }

/// <summary>
/// The minimum number of character this emphasis is expected to have (must be >=1)
Expand Down

0 comments on commit bce4b70

Please sign in to comment.