Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle UTF8 Surrogate Pairs in Parser #6503

Merged
merged 2 commits into from
Sep 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 55 additions & 60 deletions src/HotChocolate/Language/src/Language.Utf8/Utf8Helper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ namespace HotChocolate.Language;

internal static class Utf8Helper
{
private const int _utf8TwoByteMask = 0b1100_0000_1000_0000;
private const int _utf8ThreeByteMask = 0b1110_0000_1000_0000_1000_0000;
private const int _shiftBytesMask = 0b1111_1111_1100_0000;

public static void Unescape(
in ReadOnlySpan<byte> escapedString,
ref Span<byte> unescapedString,
Expand All @@ -18,6 +14,7 @@ public static void Unescape(
var readPosition = -1;
var writePosition = 0;
var eofPosition = escapedString.Length - 1;
int? highSurrogate = null;

if (escapedString.Length > 0)
{
Expand All @@ -29,32 +26,46 @@ public static void Unescape(
{
code = escapedString[++readPosition];

if (isBlockString && code == GraphQLConstants.Quote)
{
if (escapedString[readPosition + 1] == GraphQLConstants.Quote
&& escapedString[readPosition + 2] == GraphQLConstants.Quote)
{
readPosition += 2;
unescapedString[writePosition++] = GraphQLConstants.Quote;
unescapedString[writePosition++] = GraphQLConstants.Quote;
unescapedString[writePosition++] = GraphQLConstants.Quote;
}
else
{
throw new Utf8EncodingException(Utf8Helper_InvalidQuoteEscapeCount);
}
}
else if (code.IsValidEscapeCharacter())
if (code.IsValidEscapeCharacter())
{
if (code == GraphQLConstants.U)
{
UnescapeUtf8Hex(
var unicodeDecimal = UnescapeUtf8Hex(
escapedString[++readPosition],
escapedString[++readPosition],
escapedString[++readPosition],
escapedString[++readPosition],
ref writePosition,
unescapedString);
escapedString[++readPosition]);

if (unicodeDecimal >= 0xD800 && unicodeDecimal <= 0xDBFF)
{
// High surrogate
if (highSurrogate != null)
{
throw new Utf8EncodingException("Unexpected high surrogate.");
}
highSurrogate = unicodeDecimal;
}
else if (unicodeDecimal >= 0xDC00 && unicodeDecimal <= 0xDFFF)
{
// Low surrogate
if (highSurrogate == null)
{
throw new Utf8EncodingException("Unexpected low surrogate.");
}
var fullUnicode = ((highSurrogate.Value - 0xD800) << 10) +
(unicodeDecimal - 0xDC00) +
0x10000;
UnescapeUtf8Hex(fullUnicode, ref writePosition, unescapedString);
highSurrogate = null;
}
else
{
if (highSurrogate != null)
{
throw new Utf8EncodingException("High surrogate not followed by low surrogate.");
}
UnescapeUtf8Hex(unicodeDecimal, ref writePosition, unescapedString);
}
}
else
{
Expand All @@ -66,7 +77,7 @@ public static void Unescape(
throw new Utf8EncodingException(
string.Format(
Utf8Helper_InvalidEscapeChar,
(char)code));
(char) code));
}
}
else
Expand All @@ -82,55 +93,39 @@ public static void Unescape(
}
}

/// <summary>
/// Combines the four hexadecimal digits that follow a <c>\u</c> escape
/// into the single integer value they spell out.
/// </summary>
/// <param name="a">Byte of the most significant hex digit.</param>
/// <param name="b">Byte of the second hex digit.</param>
/// <param name="c">Byte of the third hex digit.</param>
/// <param name="d">Byte of the least significant hex digit.</param>
/// <returns>
/// The value <c>(a &lt;&lt; 12) | (b &lt;&lt; 8) | (c &lt;&lt; 4) | d</c> of the decoded digits.
/// </returns>
/// <exception cref="Utf8EncodingException">
/// At least one of the four bytes is not recognized by <c>HexToDecimal</c>.
/// </exception>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int UnescapeUtf8Hex(byte a, byte b, byte c, byte d)
{
    var value = (HexToDecimal(a) << 12)
        | (HexToDecimal(b) << 8)
        | (HexToDecimal(c) << 4)
        | HexToDecimal(d);

    // HexToDecimal falls back to -1 for a byte it does not recognize. Shifting
    // and or-ing -1 always leaves the sign bit set, so a negative result means
    // at least one digit was invalid. Without this check the negative value
    // would reach the UTF-8 encoder and be written out as a garbage byte.
    // NOTE(review): assumes HexToDecimal never returns a negative value for a
    // valid digit - confirm against its full definition.
    if (value < 0)
    {
        throw new Utf8EncodingException(
            "Invalid hexadecimal digit in unicode escape sequence.");
    }

    return value;
}

/// <summary>
/// Encodes a Unicode code point as UTF-8 and writes the resulting one to four
/// bytes into <paramref name="unescapedString"/>, starting at
/// <paramref name="writePosition"/>.
/// </summary>
/// <param name="unicodeDecimal">
/// The code point to encode. Must be in the range 0x0 to 0x10FFFF.
/// </param>
/// <param name="writePosition">
/// Index of the next free byte in <paramref name="unescapedString"/>;
/// advanced by the number of bytes written.
/// </param>
/// <param name="unescapedString">The buffer that receives the encoded bytes.</param>
/// <exception cref="ArgumentOutOfRangeException">
/// <paramref name="unicodeDecimal"/> is negative or greater than 0x10FFFF.
/// </exception>
public static void UnescapeUtf8Hex(
    int unicodeDecimal,
    ref int writePosition,
    Span<byte> unescapedString)
{
    // Values outside the Unicode range cannot be represented in UTF-8; without
    // this guard a negative value would be written as a single garbage byte and
    // an oversized value would produce a malformed four-byte sequence.
    // NOTE(review): surrogate values (0xD800-0xDFFF) are not rejected here; the
    // visible caller combines surrogate pairs before calling - confirm no other
    // caller passes them through.
    if (unicodeDecimal < 0 || unicodeDecimal > 0x10FFFF)
    {
        throw new ArgumentOutOfRangeException(
            nameof(unicodeDecimal),
            unicodeDecimal,
            "The value is not a valid Unicode code point.");
    }

    if (unicodeDecimal < 0x80)
    {
        // 1 byte: 0xxxxxxx
        unescapedString[writePosition++] = (byte) unicodeDecimal;
    }
    else if (unicodeDecimal < 0x800)
    {
        // 2 bytes: 110xxxxx 10xxxxxx
        unescapedString[writePosition++] = (byte) (0xC0 | (unicodeDecimal >> 6));
        unescapedString[writePosition++] = (byte) (0x80 | (unicodeDecimal & 0x3F));
    }
    else if (unicodeDecimal < 0x10000)
    {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        unescapedString[writePosition++] = (byte) (0xE0 | (unicodeDecimal >> 12));
        unescapedString[writePosition++] = (byte) (0x80 | ((unicodeDecimal >> 6) & 0x3F));
        unescapedString[writePosition++] = (byte) (0x80 | (unicodeDecimal & 0x3F));
    }
    else
    {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        unescapedString[writePosition++] = (byte) (0xF0 | (unicodeDecimal >> 18));
        unescapedString[writePosition++] = (byte) (0x80 | ((unicodeDecimal >> 12) & 0x3F));
        unescapedString[writePosition++] = (byte) (0x80 | ((unicodeDecimal >> 6) & 0x3F));
        unescapedString[writePosition++] = (byte) (0x80 | (unicodeDecimal & 0x3F));
    }
}


[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static int HexToDecimal(int a)
{
Expand All @@ -142,4 +137,4 @@ private static int HexToDecimal(int a)
_ => -1
};
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
using System.Text;
using Newtonsoft.Json;
using Xunit;

namespace HotChocolate.Language;

/// <summary>
/// Verifies that <c>Utf8GraphQLReader</c> returns the original character when
/// reading a string token produced by serializing a character outside the
/// Basic Multilingual Plane with System.Text.Json.
/// </summary>
public class ParseUtf8SurrogatePairTests
{
    [Fact]
    public void Handle_UTF8_Surrogate_Pairs_Correctly()
    {
        // arrange
        // NOTE(review): System.Text.Json is expected to emit the emoji as an
        // escaped \uXXXX\uXXXX surrogate pair by default - confirm.
        const string expected = "😀";
        byte[] serialized = System.Text.Json.JsonSerializer.SerializeToUtf8Bytes(expected);

        // sanity: both JSON libraries round-trip the serialized payload
        var viaSystemTextJson = System.Text.Json.JsonSerializer.Deserialize<string>(serialized);
        Assert.Equal(expected, viaSystemTextJson);

        var serializedText = Encoding.UTF8.GetString(serialized);
        var viaNewtonsoft = JsonConvert.DeserializeObject<string>(serializedText);
        Assert.Equal(expected, viaNewtonsoft);

        // act + assert: the GraphQL reader yields the same character
        var reader = new Utf8GraphQLReader(serialized);
        Assert.True(reader.Read());
        Assert.Equal(expected, reader.GetString());
    }
}