Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Invariant Mode Case Mapping #55520

Merged
merged 10 commits into from
Jul 15, 2021
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ public CategoryCasingInfo(CodePoint codePoint)
break;
}

if (Program.IncludeCasingData)
if (Program.IncludeCasingData && codePoint.Value != 0x0130 && codePoint.Value != 0x0131 && codePoint.Value != 0x017f)
tarekgh marked this conversation as resolved.
Show resolved Hide resolved
{
_data.offsetToSimpleUppercase = (ushort)(codePoint.SimpleUppercaseMapping - codePoint.Value);
_data.offsetToSimpleLowercase = (ushort)(codePoint.SimpleLowercaseMapping - codePoint.Value);
Expand Down
2,424 changes: 2,424 additions & 0 deletions src/coreclr/System.Private.CoreLib/Tools/GenUnicodeProp/CharUnicodeInfoData.cs

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ namespace GenUnicodeProp
{
internal static class Program
{
internal static bool Verbose = false;
internal static bool IncludeCasingData = false;
internal static bool Verbose;
internal static bool IncludeCasingData;

private const string SOURCE_NAME = "CharUnicodeInfoData.cs";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ internal CodePoint(int value, ParsedUnicodeData parsedData)
/// <remarks>
/// See https://www.unicode.org/reports/tr44/#PropList.txt.
/// </remarks>
public CodePointFlags Flags { get; } = default; // default is "no flags"
public CodePointFlags Flags { get; } // default is "no flags"

/// <summary>
/// The general Unicode category of this code point.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@ public static IEnumerable<object[]> IndexOf_TestData()
yield return new object[] { "Hello", "L", 0, 5, CompareOptions.OrdinalIgnoreCase, 2 };
yield return new object[] { "Hello", "h", 0, 5, CompareOptions.OrdinalIgnoreCase, 0 };

yield return new object[] { "Hello\u00D3\u00D4", "\u00F3\u00F4", 0, 7, CompareOptions.OrdinalIgnoreCase, 5 };
yield return new object[] { "Hello\u00D3\u00D4", "\u00F3\u00F5", 0, 7, CompareOptions.OrdinalIgnoreCase, -1 };

yield return new object[] { "Hello\U00010400", "\U00010428", 0, 7, CompareOptions.OrdinalIgnoreCase, 5 };


// Long strings
yield return new object[] { new string('b', 100) + new string('a', 5555), "aaaaaaaaaaaaaaa", 0, 5655, CompareOptions.None, 100 };
yield return new object[] { new string('b', 101) + new string('a', 5555), new string('a', 5000), 0, 5656, CompareOptions.None, 101 };
Expand Down Expand Up @@ -159,6 +165,12 @@ public static IEnumerable<object[]> LastIndexOf_TestData()
yield return new object[] { "Hello", "L", 4, 5, CompareOptions.OrdinalIgnoreCase, 3 };
yield return new object[] { "Hello", "h", 4, 5, CompareOptions.OrdinalIgnoreCase, 0 };


yield return new object[] { "Hello\u00D3\u00D4\u00D3\u00D4", "\u00F3\u00F4", 8, 9, CompareOptions.OrdinalIgnoreCase, 7 };
yield return new object[] { "Hello\u00D3\u00D4\u00D3\u00D4", "\u00F3\u00F5", 8, 9, CompareOptions.OrdinalIgnoreCase, -1 };

yield return new object[] { "Hello\U00010400\U00010400", "\U00010428", 8, 9, CompareOptions.OrdinalIgnoreCase, 7 };

// Long strings
yield return new object[] { new string('a', 5555) + new string('b', 100), "aaaaaaaaaaaaaaa", 5654, 5655, CompareOptions.None, 5540 };
yield return new object[] { new string('b', 101) + new string('a', 5555), new string('a', 5000), 5655, 5656, CompareOptions.None, 656 };
Expand Down Expand Up @@ -237,6 +249,10 @@ public static IEnumerable<object[]> IsPrefix_TestData()
yield return new object[] { "FooBar", "Foo\u0400Bar", CompareOptions.Ordinal, false };
yield return new object[] { "FooBA\u0300R", "FooB\u00C0R", CompareOptions.IgnoreNonSpace, false };

yield return new object[] { "\u00D3\u00D4\u00D3\u00D4Hello", "\u00F3\u00F4", CompareOptions.OrdinalIgnoreCase, true };
yield return new object[] { "\u00D3\u00D4Hello\u00D3\u00D4", "\u00F3\u00F5", CompareOptions.OrdinalIgnoreCase, false };
yield return new object[] { "\U00010400\U00010400Hello", "\U00010428", CompareOptions.OrdinalIgnoreCase, true };

// Ignore symbols
yield return new object[] { "Test's can be interesting", "Tests", CompareOptions.IgnoreSymbols, false };
yield return new object[] { "Test's can be interesting", "Tests", CompareOptions.None, false };
Expand Down Expand Up @@ -277,6 +293,11 @@ public static IEnumerable<object[]> IsSuffix_TestData()
yield return new object[] { "FooBar", "Foo\u0400Bar", CompareOptions.Ordinal, false };
yield return new object[] { "FooBA\u0300R", "FooB\u00C0R", CompareOptions.IgnoreNonSpace, false };

yield return new object[] { "\u00D3\u00D4\u00D3\u00D4Hello", "\u00F3\u00F4", CompareOptions.OrdinalIgnoreCase, false };
yield return new object[] { "\u00D3\u00D4Hello\u00D3\u00D4", "\u00F3\u00F4", CompareOptions.OrdinalIgnoreCase, true };
yield return new object[] { "\U00010400\U00010400Hello", "\U00010428", CompareOptions.OrdinalIgnoreCase, false };
yield return new object[] { "Hello\U00010400", "\U00010428", CompareOptions.OrdinalIgnoreCase, true };

// Weightless characters
yield return new object[] { "", "\u200d", CompareOptions.None, false };
yield return new object[] { "", "\u200d", CompareOptions.IgnoreCase, false };
Expand Down Expand Up @@ -327,6 +348,21 @@ public static IEnumerable<object[]> Compare_TestData()

yield return new object[] { "", "'", CompareOptions.None, -1 };

yield return new object[] { "\u00D3\u00D4", "\u00F3\u00F4", CompareOptions.OrdinalIgnoreCase, 0 };
yield return new object[] { "\U00010400", "\U00010428", CompareOptions.OrdinalIgnoreCase, 0 };
yield return new object[] { "\u00D3\u00D4", "\u00F3\u00F4", CompareOptions.IgnoreCase, 0 };
yield return new object[] { "\U00010400", "\U00010428", CompareOptions.IgnoreCase, 0 };

yield return new object[] { "\u00D3\u00D4G", "\u00F3\u00F4", CompareOptions.OrdinalIgnoreCase, 1 };
yield return new object[] { "\U00010400G", "\U00010428", CompareOptions.OrdinalIgnoreCase, 1 };
yield return new object[] { "\u00D3\u00D4G", "\u00F3\u00F4", CompareOptions.IgnoreCase, 1 };
yield return new object[] { "\U00010400G", "\U00010428", CompareOptions.IgnoreCase, 1 };

yield return new object[] { "\u00D3\u00D4", "\u00F3\u00F4G", CompareOptions.OrdinalIgnoreCase, -1 };
yield return new object[] { "\U00010400", "\U00010428G", CompareOptions.OrdinalIgnoreCase, -1 };
yield return new object[] { "\u00D3\u00D4", "\u00F3\u00F4G", CompareOptions.IgnoreCase, -1 };
yield return new object[] { "\U00010400", "\U00010428G", CompareOptions.IgnoreCase, -1 };

// Hungarian
yield return new object[] { "dzsdzs", "ddzs", CompareOptions.Ordinal, 1 };
yield return new object[] { "dzsdzs", "ddzs", CompareOptions.None, 1 };
Expand Down Expand Up @@ -375,21 +411,24 @@ public static IEnumerable<object[]> ToLower_TestData()
yield return new object[] { "EMBEDDED\0NuLL\0Byte\0", "embedded\0null\0byte\0", true };

// LATIN CAPITAL LETTER O WITH ACUTE, which has a lower case variant.
yield return new object[] { "\u00D3", "\u00F3", false };
yield return new object[] { "\u00D3", "\u00F3", true };

// SNOWMAN, which does not have a lower case variant.
yield return new object[] { "\u2603", "\u2603", true };

// RAINBOW (outside the BMP and does not case)
yield return new object[] { "\U0001F308", "\U0001F308", true };

// Surrogate casing
yield return new object[] { "\U00010400", "\U00010428", true };

// Unicode defines some codepoints which expand into multiple codepoints
// when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done
// these sorts of expansions, since it would cause string lengths to change when cased,
// which is non-intuitive. In addition, there are some context sensitive mappings which
// we also don't perform.
// Greek Capital Letter Sigma (does not case to U+03C2 with the "final sigma" rule).
yield return new object[] { "\u03A3", "\u03C3", false };
yield return new object[] { "\u03A3", "\u03C3", true };
}

public static IEnumerable<object[]> ToUpper_TestData()
Expand All @@ -416,14 +455,17 @@ public static IEnumerable<object[]> ToUpper_TestData()
yield return new object[] { "embedded\0NuLL\0Byte\0", "EMBEDDED\0NULL\0BYTE\0", true };

// LATIN SMALL LETTER O WITH ACUTE, which has an upper case variant.
yield return new object[] { "\u00F3", "\u00D3", false };
yield return new object[] { "\u00F3", "\u00D3", true };

// SNOWMAN, which does not have an upper case variant.
yield return new object[] { "\u2603", "\u2603", true };

// RAINBOW (outside the BMP and does not case)
tarekgh marked this conversation as resolved.
Show resolved Hide resolved
yield return new object[] { "\U0001F308", "\U0001F308", true };

// Surrogate casing
yield return new object[] { "\U00010428", "\U00010400", true };

// Unicode defines some codepoints which expand into multiple codepoints
// when cased (see SpecialCasing.txt from UNIDATA for some examples). We have never done
// these sorts of expansions, since it would cause string lengths to change when cased,
Expand All @@ -439,7 +481,7 @@ public static IEnumerable<object[]> ToUpper_TestData()
// as part of casing.
yield return new object[] { "\u0149", "\u0149", true };

yield return new object[] { "\u03C3", "\u03A3", false };
yield return new object[] { "\u03C3", "\u03A3", true };
}

public static IEnumerable<object[]> GetAscii_TestData()
Expand Down Expand Up @@ -722,7 +764,7 @@ public unsafe void TestGetSortKeyLength_OverlongArgument(int inputLength)
[InlineData("Hello", CompareOptions.IgnoreCase, "HELLO")]
[InlineData("Hello", CompareOptions.IgnoreCase | CompareOptions.IgnoreWidth, "HELLO")]
[InlineData("Hell\u00F6", CompareOptions.None, "Hell\u00F6")] // U+00F6 = LATIN SMALL LETTER O WITH DIAERESIS
[InlineData("Hell\u00F6", CompareOptions.IgnoreCase, "HELL\u00F6")] // note the final "o with diaeresis" isn't capitalized
[InlineData("Hell\u00F6", CompareOptions.IgnoreCase, "HELL\u00D6")]
public unsafe void TestSortKey_FromSpan(string input, CompareOptions options, string expected)
{
byte[] expectedOutputBytes = GetExpectedInvariantOrdinalSortKey(expected);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\IdnMapping.Icu.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\IdnMapping.Nls.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\InternalGlobalizationHelper.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\InvariantModeCasing.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\ISOWeek.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\JapaneseCalendar.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\JapaneseCalendar.Icu.cs" />
Expand All @@ -363,6 +364,7 @@
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\SortVersion.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\StringInfo.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\StrongBidiCategory.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\SurrogateCasing.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\TaiwanCalendar.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\TaiwanLunisolarCalendar.cs" />
<Compile Include="$(MSBuildThisFileDirectory)System\Globalization\TextElementEnumerator.cs" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,34 @@ private static double GetNumericValueNoBoundsCheck(uint codePoint)
}
}

/// <summary>
/// Returns the uppercase mapping of <paramref name="codePoint"/> by adding the signed
/// 16-bit delta stored for it in <c>UppercaseValues</c>.
/// </summary>
/// <param name="codePoint">The code point to map. No bounds checks are performed on it here.</param>
/// <returns>The code point plus its stored delta, truncated to a <see cref="char"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static char ToUpper(uint codePoint)
{
    nuint offset = GetCategoryCasingTableOffsetNoBoundsChecks(codePoint);

    // The offset is specified in shorts:
    // get the 'ref short' corresponding to where the addend is, read it as a signed 16-bit value, then add.

    ref short rsStart = ref Unsafe.As<byte, short>(ref MemoryMarshal.GetReference(UppercaseValues));
    ref short rsDelta = ref Unsafe.Add(ref rsStart, (int)offset);

    // The table bytes are interpreted as little-endian, so swap them on big-endian hosts.
    int delta = (BitConverter.IsLittleEndian) ? rsDelta : BinaryPrimitives.ReverseEndianness(rsDelta);
    return (char)(delta + (int)codePoint);
}

/// <summary>
/// Returns the lowercase mapping of <paramref name="codePoint"/> by applying the signed
/// 16-bit addend stored for it in <c>LowercaseValues</c>.
/// </summary>
/// <param name="codePoint">The code point to map. No bounds checks are performed on it here.</param>
/// <returns>The code point plus its stored addend, truncated to a <see cref="char"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static char ToLower(uint codePoint)
{
    // Index (counted in 16-bit entries) of this code point's slot in the casing table.
    nuint index = GetCategoryCasingTableOffsetNoBoundsChecks(codePoint);

    // View the byte table as signed 16-bit entries and select the one for this code point.
    ref short entry = ref Unsafe.Add(
        ref Unsafe.As<byte, short>(ref MemoryMarshal.GetReference(LowercaseValues)),
        (int)index);

    // The table bytes are interpreted as little-endian, so swap them on big-endian hosts.
    short addend = entry;
    if (!BitConverter.IsLittleEndian)
    {
        addend = BinaryPrimitives.ReverseEndianness(addend);
    }

    return (char)((int)codePoint + addend);
}

/*
* GetUnicodeCategory
* ==================
Expand Down
Loading