Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Align Whitespace and Punctuation definitions with CommonMark #649

Merged
merged 1 commit into from
Aug 12, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 92 additions & 0 deletions src/Markdig.Tests/TestCharHelper.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
using System.Collections.Generic;
using System.Globalization;
using Markdig.Helpers;
using NUnit.Framework;

namespace Markdig.Tests
{
    /// <summary>
    /// Exhaustively compares the character classification helpers on <see cref="CharHelper"/>
    /// with simple reference implementations of the whitespace and punctuation definitions
    /// quoted in the comments below, for every possible <see cref="char"/> value.
    /// </summary>
    public class TestCharHelper
    {
        // An ASCII punctuation character is
        // !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F),
        // :, ;, <, =, >, ?, @ (U+003A–0040),
        // [, \, ], ^, _, ` (U+005B–0060),
        // {, |, }, or ~ (U+007B–007E).
        private static readonly HashSet<char> s_asciiPunctuation = new()
        {
            '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/',
            ':', ';', '<', '=', '>', '?', '@',
            '[', '\\', ']', '^', '_', '`',
            '{', '|', '}', '~'
        };

        // A Unicode punctuation character is an ASCII punctuation character or anything in the general Unicode categories
        // Pc, Pd, Pe, Pf, Pi, Po, or Ps.
        private static readonly HashSet<UnicodeCategory> s_punctuationCategories = new()
        {
            UnicodeCategory.ConnectorPunctuation,
            UnicodeCategory.DashPunctuation,
            UnicodeCategory.ClosePunctuation,
            UnicodeCategory.FinalQuotePunctuation,
            UnicodeCategory.InitialQuotePunctuation,
            UnicodeCategory.OtherPunctuation,
            UnicodeCategory.OpenPunctuation
        };

        /// <summary>
        /// Reference implementation of "is punctuation": ASCII characters are looked up in the
        /// explicit ASCII punctuation set, all other characters are classified by their Unicode category.
        /// </summary>
        /// <param name="c">The character to classify.</param>
        /// <returns><c>true</c> if <paramref name="c"/> is expected to be treated as punctuation.</returns>
        private static bool ExpectedIsPunctuation(char c)
        {
            return c <= 127
                ? s_asciiPunctuation.Contains(c)
                : s_punctuationCategories.Contains(CharUnicodeInfo.GetUnicodeCategory(c));
        }

        /// <summary>
        /// Reference implementation of "is whitespace".
        /// </summary>
        /// <param name="c">The character to classify.</param>
        /// <returns><c>true</c> if <paramref name="c"/> is expected to be treated as whitespace.</returns>
        private static bool ExpectedIsWhitespace(char c)
        {
            // A Unicode whitespace character is any code point in the Unicode Zs general category,
            // or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D).
            return c == '\t' || c == '\n' || c == '\u000C' || c == '\r' ||
                CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator;
        }

        /// <summary>
        /// Verifies <see cref="CharHelper.IsWhitespace(char)"/> against the reference implementation
        /// for every <see cref="char"/> value.
        /// </summary>
        [Test]
        public void IsWhitespace()
        {
            // The counter is an int: a char counter can never exceed char.MaxValue, so the loop would not terminate.
            for (int i = char.MinValue; i <= char.MaxValue; i++)
            {
                char c = (char)i;

                // The message is only formatted on failure and identifies the offending code point.
                Assert.AreEqual(ExpectedIsWhitespace(c), CharHelper.IsWhitespace(c),
                    "IsWhitespace mismatch for U+{0:X4}", i);
            }
        }

        /// <summary>
        /// Verifies both outputs of <see cref="CharHelper.CheckUnicodeCategory(char, out bool, out bool)"/>
        /// against the reference implementations for every <see cref="char"/> value.
        /// </summary>
        [Test]
        public void CheckUnicodeCategory()
        {
            for (int i = char.MinValue; i <= char.MaxValue; i++)
            {
                char c = (char)i;

                // U+0000 is expected to be reported as both space and punctuation.
                bool expectedSpace = c == 0 || ExpectedIsWhitespace(c);
                bool expectedPunctuation = c == 0 || ExpectedIsPunctuation(c);

                CharHelper.CheckUnicodeCategory(c, out bool spaceActual, out bool punctuationActual);

                Assert.AreEqual(expectedSpace, spaceActual,
                    "CheckUnicodeCategory 'space' mismatch for U+{0:X4}", i);
                Assert.AreEqual(expectedPunctuation, punctuationActual,
                    "CheckUnicodeCategory 'punctuation' mismatch for U+{0:X4}", i);
            }
        }

        /// <summary>
        /// Verifies <c>CharHelper.IsSpaceOrPunctuation</c> against the union of the reference
        /// whitespace and punctuation definitions for every <see cref="char"/> value.
        /// </summary>
        [Test]
        public void IsSpaceOrPunctuation()
        {
            for (int i = char.MinValue; i <= char.MaxValue; i++)
            {
                char c = (char)i;

                // U+0000 is expected to count as space-or-punctuation as well.
                bool expected = c == 0 || ExpectedIsWhitespace(c) || ExpectedIsPunctuation(c);

                Assert.AreEqual(expected, CharHelper.IsSpaceOrPunctuation(c),
                    "IsSpaceOrPunctuation mismatch for U+{0:X4}", i);
            }
        }
    }
}
2 changes: 0 additions & 2 deletions src/Markdig.Tests/TestPipeTable.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,7 @@ public sealed class TestPipeTable
{
[TestCase("| S | T |\r\n|---|---| \r\n| G | H |")]
[TestCase("| S | T |\r\n|---|---|\t\r\n| G | H |")]
[TestCase("| S | T |\r\n|---|---|\v\r\n| G | H |")]
[TestCase("| S | T |\r\n|---|---|\f\r\n| G | H |")]
[TestCase("| S | T |\r\n|---|---|\f\v\t \r\n| G | H |")]
[TestCase("| S | \r\n|---|\r\n| G |\r\n\r\n| D | D |\r\n| ---| ---| \r\n| V | V |", 2)]
public void TestTableBug(string markdown, int tableCount = 1)
{
Expand Down
125 changes: 63 additions & 62 deletions src/Markdig/Helpers/CharHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ public static void CheckOpenCloseDelimiter(char pc, char c, bool enableWithinWor

// A right-flanking delimiter run is a delimiter run that is
// (1) not preceded by Unicode whitespace, and either
// (1a) not preceded by a punctuation character, or
// (2a) not preceded by a punctuation character, or
// (2b) preceded by a punctuation character and followed by Unicode whitespace or a punctuation character.
// For purposes of this definition, the beginning and the end of the line count as Unicode whitespace.
canClose = !prevIsWhiteSpace &&
Expand Down Expand Up @@ -144,9 +144,37 @@ public static bool Contains(this char[] charList, char c)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static bool IsWhitespace(this char c)
{
// 2.1 Characters and lines
// A whitespace character is a space(U + 0020), tab(U + 0009), newline(U + 000A), line tabulation (U + 000B), form feed (U + 000C), or carriage return (U + 000D).
return c <= ' ' && (c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r');
// 2.1 Characters and lines
// A Unicode whitespace character is any code point in the Unicode Zs general category,
// or a tab (U+0009), line feed (U+000A), form feed (U+000C), or carriage return (U+000D).
if (c <= ' ')
{
const long Mask =
(1L << ' ') |
(1L << '\t') |
(1L << '\n') |
(1L << '\f') |
(1L << '\r');

return (Mask & (1L << c)) != 0;
}

return c >= '\u00A0' && IsWhitespaceRare(c);

static bool IsWhitespaceRare(char c)
{
// return CharUnicodeInfo.GetUnicodeCategory(c) == UnicodeCategory.SpaceSeparator;

if (c < 5760)
{
return c == '\u00A0';
}
else
{
return c <= 12288 &&
(c == 5760 || IsInInclusiveRange(c, 8192, 8202) || c == 8239 || c == 8287 || c == 12288);
}
}
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand All @@ -171,46 +199,47 @@ public static bool IsWhiteSpaceOrZero(this char c)
// Check if a char is a space or a punctuation
public static void CheckUnicodeCategory(this char c, out bool space, out bool punctuation)
{
// Credits: code from CommonMark.NET
// Copyright (c) 2014, Kārlis Gaņģis All rights reserved.
// See license for details: https://github.com/Knagis/CommonMark.NET/blob/master/LICENSE.md
if (c <= 'ÿ')
if (IsWhitespace(c))
{
space = true;
punctuation = false;
}
else if (c <= 127)
{
space = c == '\0' || c == ' ' || (c >= '\t' && c <= '\r') || c == '\u00a0' || c == '\u0085';
punctuation = c == '\0' || (c >= 33 && c <= 47) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126);
space = c == '\0';
punctuation = c == '\0' || IsAsciiPunctuation(c);
}
else
{
var category = CharUnicodeInfo.GetUnicodeCategory(c);
space = category == UnicodeCategory.SpaceSeparator
|| category == UnicodeCategory.LineSeparator
|| category == UnicodeCategory.ParagraphSeparator;
punctuation = !space &&
(category == UnicodeCategory.ConnectorPunctuation
// A Unicode punctuation character is an ASCII punctuation character
// or anything in the general Unicode categories Pc, Pd, Pe, Pf, Pi, Po, or Ps.
space = false;
UnicodeCategory category = CharUnicodeInfo.GetUnicodeCategory(c);
punctuation = category == UnicodeCategory.ConnectorPunctuation
|| category == UnicodeCategory.DashPunctuation
|| category == UnicodeCategory.OpenPunctuation
|| category == UnicodeCategory.ClosePunctuation
|| category == UnicodeCategory.InitialQuotePunctuation
|| category == UnicodeCategory.FinalQuotePunctuation
|| category == UnicodeCategory.OtherPunctuation);
|| category == UnicodeCategory.OtherPunctuation;
}
}

// Same as CheckUnicodeCategory
internal static bool IsSpaceOrPunctuation(this char c)
{
if (c <= 'ÿ')
if (IsWhitespace(c))
{
return c == '\0' || c == ' ' || (c >= '\t' && c <= '\r') || c == '\u00a0' || c == '\u0085' ||
(c >= 33 && c <= 47 && c != 38) || (c >= 58 && c <= 64) || (c >= 91 && c <= 96) || (c >= 123 && c <= 126);
return true;
}
else if (c <= 127)
{
return c == '\0' || IsAsciiPunctuation(c);
}
else
{
var category = CharUnicodeInfo.GetUnicodeCategory(c);
return category == UnicodeCategory.SpaceSeparator
|| category == UnicodeCategory.LineSeparator
|| category == UnicodeCategory.ParagraphSeparator
|| category == UnicodeCategory.ConnectorPunctuation
return category == UnicodeCategory.ConnectorPunctuation
|| category == UnicodeCategory.DashPunctuation
|| category == UnicodeCategory.OpenPunctuation
|| category == UnicodeCategory.ClosePunctuation
Expand Down Expand Up @@ -289,44 +318,16 @@ public static bool IsDigit(this char c)
public static bool IsAsciiPunctuation(this char c)
{
// 2.1 Characters and lines
// An ASCII punctuation character is !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~.
switch (c)
{
case '!':
case '"':
case '#':
case '$':
case '%':
case '&':
case '\'':
case '(':
case ')':
case '*':
case '+':
case ',':
case '-':
case '.':
case '/':
case ':':
case ';':
case '<':
case '=':
case '>':
case '?':
case '@':
case '[':
case '\\':
case ']':
case '^':
case '_':
case '`':
case '{':
case '|':
case '}':
case '~':
return true;
}
return false;
// An ASCII punctuation character is
// !, ", #, $, %, &, ', (, ), *, +, ,, -, ., / (U+0021–2F),
// :, ;, <, =, >, ?, @ (U+003A–0040),
// [, \, ], ^, _, ` (U+005B–0060),
// {, |, }, or ~ (U+007B–007E).
return c <= 127 && (
IsInInclusiveRange(c, 33, 47) ||
IsInInclusiveRange(c, 58, 64) ||
IsInInclusiveRange(c, 91, 96) ||
IsInInclusiveRange(c, 123, 126));
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
Expand Down
2 changes: 1 addition & 1 deletion src/Markdig/Helpers/ThrowHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ internal static class ThrowHelper
public static void ArgumentOutOfRangeException(string paramName) => throw new ArgumentOutOfRangeException(paramName);

[DoesNotReturn]
public static void ArgumentOutOfRangeException(string message, string paramName) => throw new ArgumentOutOfRangeException(message, paramName);
public static void ArgumentOutOfRangeException(string message, string paramName) => throw new ArgumentOutOfRangeException(paramName, message);

[DoesNotReturn]
public static void ArgumentOutOfRangeException_index() => throw new ArgumentOutOfRangeException("index");
Expand Down
2 changes: 1 addition & 1 deletion src/Markdig/Parsers/Inlines/EmphasisDescriptor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ public EmphasisDescriptor(char character, int minimumCount, int maximumCount, bo
/// <summary>
/// The character of this emphasis.
/// </summary>
public char Character { get; }
public char Character { get; }

/// <summary>
/// The minimum number of characters this emphasis is expected to have (must be >=1)
Expand Down