From 19406aa0919d606187c860cc02990b3bd3ebceac Mon Sep 17 00:00:00 2001 From: Michael Staib Date: Fri, 1 Sep 2023 15:34:32 +0200 Subject: [PATCH] Handle UTF8 Surrogate Pairs in Parser (#6503) --- .../Language/src/Language.Utf8/Utf8Helper.cs | 115 +++++++++--------- .../Parser/ParseUtf8SurrogatePairTests.cs | 19 +++ 2 files changed, 74 insertions(+), 60 deletions(-) create mode 100644 src/HotChocolate/Language/test/Language.Tests/Parser/ParseUtf8SurrogatePairTests.cs diff --git a/src/HotChocolate/Language/src/Language.Utf8/Utf8Helper.cs b/src/HotChocolate/Language/src/Language.Utf8/Utf8Helper.cs index 72a87e8637a..b488a8c513c 100644 --- a/src/HotChocolate/Language/src/Language.Utf8/Utf8Helper.cs +++ b/src/HotChocolate/Language/src/Language.Utf8/Utf8Helper.cs @@ -6,10 +6,6 @@ namespace HotChocolate.Language; internal static class Utf8Helper { - private const int _utf8TwoByteMask = 0b1100_0000_1000_0000; - private const int _utf8ThreeByteMask = 0b1110_0000_1000_0000_1000_0000; - private const int _shiftBytesMask = 0b1111_1111_1100_0000; - public static void Unescape( in ReadOnlySpan escapedString, ref Span unescapedString, @@ -18,6 +14,7 @@ public static void Unescape( var readPosition = -1; var writePosition = 0; var eofPosition = escapedString.Length - 1; + int? highSurrogate = null; if (escapedString.Length > 0) { @@ -29,32 +26,46 @@ public static void Unescape( { code = escapedString[++readPosition]; - if (isBlockString && code == GraphQLConstants.Quote) - { - if (escapedString[readPosition + 1] == GraphQLConstants.Quote - && escapedString[readPosition + 2] == GraphQLConstants.Quote) - { - readPosition += 2; - unescapedString[writePosition++] = GraphQLConstants.Quote; - unescapedString[writePosition++] = GraphQLConstants.Quote; - unescapedString[writePosition++] = GraphQLConstants.Quote; - } - else - { - throw new Utf8EncodingException(Utf8Helper_InvalidQuoteEscapeCount); - } - } - else if (code.IsValidEscapeCharacter()) + if (code.IsValidEscapeCharacter()) { if (code == GraphQLConstants.U) { - UnescapeUtf8Hex( + var unicodeDecimal = UnescapeUtf8Hex( escapedString[++readPosition], escapedString[++readPosition], escapedString[++readPosition], - escapedString[++readPosition], - ref writePosition, - unescapedString); + escapedString[++readPosition]); + + if (unicodeDecimal >= 0xD800 && unicodeDecimal <= 0xDBFF) + { + // High surrogate + if (highSurrogate != null) + { + throw new Utf8EncodingException("Unexpected high surrogate."); + } + highSurrogate = unicodeDecimal; + } + else if (unicodeDecimal >= 0xDC00 && unicodeDecimal <= 0xDFFF) + { + // Low surrogate + if (highSurrogate == null) + { + throw new Utf8EncodingException("Unexpected low surrogate."); + } + var fullUnicode = ((highSurrogate.Value - 0xD800) << 10) + + (unicodeDecimal - 0xDC00) + + 0x10000; + UnescapeUtf8Hex(fullUnicode, ref writePosition, unescapedString); + highSurrogate = null; + } + else + { + if (highSurrogate != null) + { + throw new Utf8EncodingException("High surrogate not followed by low surrogate."); + } + UnescapeUtf8Hex(unicodeDecimal, ref writePosition, unescapedString); + } } else { @@ -66,7 +77,7 @@ public static void Unescape( throw new Utf8EncodingException( string.Format( Utf8Helper_InvalidEscapeChar, - (char)code)); + (char) code)); } } else @@ -82,55 +93,39 @@ public static void Unescape( } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static int UnescapeUtf8Hex(byte a, byte b, byte c, byte d) + => (HexToDecimal(a) << 12) | (HexToDecimal(b) << 8) | (HexToDecimal(c) << 4) | HexToDecimal(d); + public static void UnescapeUtf8Hex( - byte a, byte b, byte c, byte d, + int unicodeDecimal, ref int writePosition, Span unescapedString) { - var unicodeDecimal = (HexToDecimal(a) << 12) - | (HexToDecimal(b) << 8) - | (HexToDecimal(c) << 4) - | HexToDecimal(d); - - if (unicodeDecimal is >= 0 and <= 127) + if (unicodeDecimal < 0x80) { - unescapedString[writePosition++] = (byte)unicodeDecimal; + unescapedString[writePosition++] = (byte) unicodeDecimal; } - else if (unicodeDecimal is >= 128 and <= 2047) + else if (unicodeDecimal < 0x800) { - var bytesToShift = unicodeDecimal & _shiftBytesMask; - unicodeDecimal -= bytesToShift; - bytesToShift <<= 2; - unicodeDecimal += _utf8TwoByteMask + bytesToShift; - - unescapedString[writePosition++] = (byte)(unicodeDecimal >> 8); - unescapedString[writePosition++] = (byte)unicodeDecimal; + unescapedString[writePosition++] = (byte) (0xC0 | (unicodeDecimal >> 6)); + unescapedString[writePosition++] = (byte) (0x80 | (unicodeDecimal & 0x3F)); } - else if (unicodeDecimal is >= 2048 and <= 65535) + else if (unicodeDecimal < 0x10000) { - var bytesToShift = unicodeDecimal & _shiftBytesMask; - unicodeDecimal -= bytesToShift; - - var third = (bytesToShift >> 12) << 12; - var second = bytesToShift - third; - - second <<= 2; - third <<= 4; - - unicodeDecimal += _utf8ThreeByteMask + second + third; - - unescapedString[writePosition++] = (byte)(unicodeDecimal >> 16); - unescapedString[writePosition++] = (byte)(unicodeDecimal >> 8); - unescapedString[writePosition++] = (byte)unicodeDecimal; + unescapedString[writePosition++] = (byte) (0xE0 | (unicodeDecimal >> 12)); + unescapedString[writePosition++] = (byte) (0x80 | ((unicodeDecimal >> 6) & 0x3F)); + unescapedString[writePosition++] = (byte) (0x80 | (unicodeDecimal & 0x3F)); } else { - throw new NotSupportedException( - "UTF-8 characters with four bytes are not supported."); + unescapedString[writePosition++] = (byte) (0xF0 | (unicodeDecimal >> 18)); + unescapedString[writePosition++] = (byte) (0x80 | ((unicodeDecimal >> 12) & 0x3F)); + unescapedString[writePosition++] = (byte) (0x80 | ((unicodeDecimal >> 6) & 0x3F)); + unescapedString[writePosition++] = (byte) (0x80 | (unicodeDecimal & 0x3F)); } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] private static int HexToDecimal(int a) { @@ -142,4 +137,4 @@ private static int HexToDecimal(int a) _ => -1 }; } -} +} \ No newline at end of file diff --git a/src/HotChocolate/Language/test/Language.Tests/Parser/ParseUtf8SurrogatePairTests.cs b/src/HotChocolate/Language/test/Language.Tests/Parser/ParseUtf8SurrogatePairTests.cs new file mode 100644 index 00000000000..d5bd06eced9 --- /dev/null +++ b/src/HotChocolate/Language/test/Language.Tests/Parser/ParseUtf8SurrogatePairTests.cs @@ -0,0 +1,19 @@ +using System.Text; +using Newtonsoft.Json; +using Xunit; + +namespace HotChocolate.Language; + +public class ParseUtf8SurrogatePairTests +{ + [Fact] + public void Handle_UTF8_Surrogate_Pairs_Correctly() + { + var emojiBytes = System.Text.Json.JsonSerializer.SerializeToUtf8Bytes("😀"); + var reader = new Utf8GraphQLReader(emojiBytes); + Assert.Equal("😀", System.Text.Json.JsonSerializer.Deserialize(emojiBytes)); + Assert.Equal("😀", JsonConvert.DeserializeObject(Encoding.UTF8.GetString(emojiBytes))); + Assert.True(reader.Read()); + Assert.Equal("😀", reader.GetString()); + } +} \ No newline at end of file