Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix string comparison with ordinal casing with Surrogates #55771

Merged
merged 2 commits into from
Jul 16, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,20 @@ public static IEnumerable<object[]> Compare_TestData()
yield return new object[] { new CultureInfo("de-DE_phoneb").CompareInfo, "\u00DC", "UE", CompareOptions.None, useNls ? 0 : -1 };
yield return new object[] { new CultureInfo("es-ES_tradnl").CompareInfo, "llegar", "lugar", CompareOptions.None, useNls ? 1 : -1 };
}

//
// Ordinal comparisons with ignore casing.
//

yield return new object[] { s_invariantCompare, "abcd", "abcd", CompareOptions.OrdinalIgnoreCase, 0};
yield return new object[] { s_invariantCompare, "abcd", "ABCD", CompareOptions.OrdinalIgnoreCase, 0};
yield return new object[] { s_invariantCompare, "Hello\u00F6", "HELLO\u00D6", CompareOptions.OrdinalIgnoreCase, 0};
yield return new object[] { s_invariantCompare, "Hello\uFE6A", "Hello\U0001F601", CompareOptions.OrdinalIgnoreCase, useNls ? 1 : -1};
yield return new object[] { s_invariantCompare, "Hello\U0001F601", "Hello\uFE6A", CompareOptions.OrdinalIgnoreCase, useNls ? -1 : 1};
yield return new object[] { s_invariantCompare, "\uDBFF", "\uD800\uDC00", CompareOptions.OrdinalIgnoreCase, useNls ? 1 : -1};
yield return new object[] { s_invariantCompare, "\uD800\uDC00", "\uDBFF", CompareOptions.OrdinalIgnoreCase, useNls ? -1 : 1};
yield return new object[] { s_invariantCompare, "abcdefg\uDBFF", "abcdefg\uD800\uDC00", CompareOptions.OrdinalIgnoreCase, useNls ? 1 : -1};
yield return new object[] { s_invariantCompare, "\U00010400", "\U00010428", CompareOptions.OrdinalIgnoreCase, useNls ? -1 : 0};
}

// There is a regression in Windows 190xx version with the Kana comparison. Avoid running this test there.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -161,14 +161,15 @@ internal static void ToLower(ReadOnlySpan<char> source, Span<char> destination)
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static (uint, int) GetScalar(ref char charA, int index, int length)
private static (uint, int) GetScalar(ref char source, int index, int length)
{
char charA = source;
if (!char.IsHighSurrogate(charA) || index >= length - 1)
{
return ((uint)charA, 1);
}

ref char charB = ref Unsafe.Add(ref charA, 1);
char charB = Unsafe.Add(ref source, 1);
if (!char.IsLowSurrogate(charB))
{
return ((uint)charA, 1);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Text;
using System.Diagnostics;
using System.Threading;
using System.Runtime.InteropServices;
Expand Down Expand Up @@ -196,72 +197,90 @@ internal static int CompareStringIgnoreCase(ref char strA, int lengthA, ref char
ref char charA = ref strA;
ref char charB = ref strB;

while (length != 0)
int index = 0;

while (index < length)
{
// optimize for Ascii cases
if (charA <= '\u00FF' || length == 1 || !char.IsHighSurrogate(charA) || !char.IsHighSurrogate(charB))
char a = charA;
char b = charB;

if (!char.IsHighSurrogate(a) || index >= lengthA - 1 || !char.IsLowSurrogate(Unsafe.Add(ref charA, 1)))
tarekgh marked this conversation as resolved.
Show resolved Hide resolved
{
if (charA == charB)
if (!char.IsHighSurrogate(b) || index >= lengthB - 1 || !char.IsLowSurrogate(Unsafe.Add(ref charB, 1)))
safern marked this conversation as resolved.
Show resolved Hide resolved
{
length--;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
continue;
}
//
// Neither A nor B is a surrogate
//

char aUpper = OrdinalCasing.ToUpper(charA);
char bUpper = OrdinalCasing.ToUpper(charB);
if (b == a)
{
index++;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
continue;
}

if (aUpper == bUpper)
{
length--;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
continue;
}
char aUpper = OrdinalCasing.ToUpper(a);
char bUpper = OrdinalCasing.ToUpper(b);

return aUpper - bUpper;
}
if (aUpper == bUpper)
{
index++;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
continue;
}

// We come here only if we have valid high surrogates and length > 1
return a - b;
}

char a = charA;
char b = charB;
//
// charA is not a surrogate and charB is a valid surrogate
//

return -1;
}

length--;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
//
// A is Surrogate
//

if (!char.IsLowSurrogate(charA) || !char.IsLowSurrogate(charB))
if (!char.IsHighSurrogate(b) || index >= lengthB - 1 || !char.IsLowSurrogate(Unsafe.Add(ref charB, 1)))
{
// malformed Surrogates - should be rare cases
if (a != b)
{
return a - b;
}
//
// charB is not a surrogate and charA is a surrogate
//

// Should be pointing to the right characters in the string to resume at.
// Just in case we could be pointing at high surrogate now.
continue;
return 1;
}

// we come here only if we have valid full surrogates
SurrogateCasing.ToUpper(a, charA, out char h1, out char l1);
SurrogateCasing.ToUpper(b, charB, out char h2, out char l2);
//
// charA and charB are surrogates
//

char lowSurrogateA = Unsafe.Add(ref charA, 1);
char lowSurrogateB = Unsafe.Add(ref charB, 1);

if (h1 != h2)
if (a == b && lowSurrogateA == lowSurrogateB)
{
return (int)h1 - (int)h2;
index += 2;
charA = ref Unsafe.Add(ref charA, 2);
charB = ref Unsafe.Add(ref charB, 2);
continue;
}

if (l1 != l2)
uint upperSurrogateA = CharUnicodeInfo.ToUpper(UnicodeUtility.GetScalarFromUtf16SurrogatePair(a, lowSurrogateA));
uint upperSurrogateB = CharUnicodeInfo.ToUpper(UnicodeUtility.GetScalarFromUtf16SurrogatePair(b, lowSurrogateB));

if (upperSurrogateA == upperSurrogateB)
{
return (int)l1 - (int)l2;
index += 2;
charA = ref Unsafe.Add(ref charA, 2);
charB = ref Unsafe.Add(ref charB, 2);
continue;
}

length--;
charA = ref Unsafe.Add(ref charA, 1);
charB = ref Unsafe.Add(ref charB, 1);
return (int)upperSurrogateA - (int)upperSurrogateB;
}

return lengthA - lengthB;
Expand Down