Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve perf of Utf8Parser.TryParse for int64 and uint64 #52423

Merged
merged 1 commit into from
Jul 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -336,107 +336,121 @@ private static bool TryParseInt32D(ReadOnlySpan<byte> source, out int value, out

private static bool TryParseInt64D(ReadOnlySpan<byte> source, out long value, out int bytesConsumed)
{
if (source.Length < 1)
{
bytesConsumed = 0;
value = default;
return false;
}
long sign = 0; // 0 if the value is positive, -1 if the value is negative
int idx = 0;

int indexOfFirstDigit = 0;
int sign = 1;
if (source[0] == '-')
// We use 'nuint' for the firstChar and nextChar data types in this method because
// it gives us a free early zero-extension to 64 bits when running on a 64-bit platform.

nuint firstChar;
while (true)
{
indexOfFirstDigit = 1;
sign = -1;
if ((uint)idx >= (uint)source.Length) { goto FalseExit; }
firstChar = (uint)source[idx] - '0';
if ((uint)firstChar <= 9) { break; }

if (source.Length <= indexOfFirstDigit)
// We saw something that wasn't a digit. If it's a '+' or a '-',
// we'll set the 'sign' value appropriately and resume the "read
// first char" loop from the next index. If this loops more than
// once (idx != 0), it means we saw a sign character followed by
// a non-digit character, which should be considered an error.

if (idx != 0)
{
bytesConsumed = 0;
value = default;
return false;
goto FalseExit;
}
}
else if (source[0] == '+')
{
indexOfFirstDigit = 1;

if (source.Length <= indexOfFirstDigit)
idx++;

if ((uint)firstChar == unchecked((uint)('-' - '0')))
{
sign--; // set to -1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
sign--; // set to -1
sign = -1;

Make it explicit?
The JIT will emit code like mov rax, 0xffffffffffffffff anyway.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The JIT emits dec rax for this scenario. I think it doesn't propagate the "if (idx != 0) { FAIL; }" assertion above for some reason.
I can set sign = -1; explicitly, but it does increase the codegen by 7 - 8 bytes. Maybe it's too much of a microoptimization to worry about and the cleaner code is better?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I'd keep it as is. Thanks for the info.

}
else if ((uint)firstChar != unchecked((uint)('+' - '0')))
{
bytesConsumed = 0;
value = default;
return false;
goto FalseExit; // not a digit, not '-', and not '+'; fail
}
}

int overflowLength = ParserHelpers.Int64OverflowLength + indexOfFirstDigit;
ulong parsedValue = firstChar;
int overflowLength = ParserHelpers.Int64OverflowLength + idx; // +idx to account for any sign char we read
idx++;

// Parse the first digit separately. If invalid here, we need to return false.
long firstDigit = source[indexOfFirstDigit] - 48; // '0'
if (firstDigit < 0 || firstDigit > 9)
{
bytesConsumed = 0;
value = default;
return false;
}
ulong parsedValue = (ulong)firstDigit;
// At this point, we successfully read a single digit character.
// The only failure condition from here on out is integer overflow.

if (source.Length < overflowLength)
{
// Length is less than Parsers.Int64OverflowLength; overflow is not possible
for (int index = indexOfFirstDigit + 1; index < source.Length; index++)
// If the input span is short enough such that integer overflow isn't an issue,
// don't bother performing overflow checks. Just keep shifting in new digits
// until we see a non-digit character or until we've exhausted our input buffer.

while (true)
{
long nextDigit = source[index] - 48; // '0'
if (nextDigit < 0 || nextDigit > 9)
{
bytesConsumed = index;
value = ((long)parsedValue) * sign;
return true;
}
parsedValue = parsedValue * 10 + (ulong)nextDigit;
if ((uint)idx >= (uint)source.Length) { break; } // EOF
nuint nextChar = (uint)source[idx] - '0';
if ((uint)nextChar > 9) { break; } // not a digit
parsedValue = parsedValue * 10 + nextChar;
idx++;
}
}
else
{
// Length is greater than Parsers.Int64OverflowLength; overflow is only possible after Parsers.Int64OverflowLength
// digits. There may be no overflow after Parsers.Int64OverflowLength if there are leading zeroes.
for (int index = indexOfFirstDigit + 1; index < overflowLength - 1; index++)
{
long nextDigit = source[index] - 48; // '0'
if (nextDigit < 0 || nextDigit > 9)
{
bytesConsumed = index;
value = ((long)parsedValue) * sign;
return true;
}
parsedValue = parsedValue * 10 + (ulong)nextDigit;
}
for (int index = overflowLength - 1; index < source.Length; index++)
while (true)
{
long nextDigit = source[index] - 48; // '0'
if (nextDigit < 0 || nextDigit > 9)
if ((uint)idx >= (uint)source.Length) { break; } // EOF
nuint nextChar = (uint)source[idx] - '0';
if ((uint)nextChar > 9) { break; } // not a digit
idx++;

// The const below is the smallest unsigned x for which "x * 10 + 9"
// might overflow long.MaxValue. If the current accumulator is below
// this const, there's no risk of overflowing.

const ulong OverflowRisk = 0x0CCC_CCCC_CCCC_CCCCul;

if (parsedValue < OverflowRisk)
{
bytesConsumed = index;
value = ((long)parsedValue) * sign;
return true;
parsedValue = parsedValue * 10 + nextChar;
continue;
}
// If parsedValue > (long.MaxValue / 10), any more appended digits will cause overflow.
// if parsedValue == (long.MaxValue / 10), any nextDigit greater than 7 or 8 (depending on sign) implies overflow.
bool positive = sign > 0;
bool nextDigitTooLarge = nextDigit > 8 || (positive && nextDigit > 7);
if (parsedValue > long.MaxValue / 10 || parsedValue == long.MaxValue / 10 && nextDigitTooLarge)

// If the current accumulator is exactly equal to the const above,
// then "accumulator * 10 + 7" is the highest we can go without overflowing
// long.MaxValue. (If we know the value is negative, we can instead allow
// +8, since the range of negative numbers is one higher than the range of
// positive numbers.) This also implies that if the current accumulator
// is higher than the const above, there's no hope that we'll succeed,
// so we may as well just fail now.
//
// The (nextChar + sign) trick below works because sign is 0 or -1,
// so if sign is -1 then this actually checks that nextChar > 8.
// n.b. signed arithmetic below because nextChar may be 0.

if (parsedValue != OverflowRisk || (int)nextChar + (int)sign > 7)
{
bytesConsumed = 0;
value = default;
return false;
goto FalseExit;
}
parsedValue = parsedValue * 10 + (ulong)nextDigit;

parsedValue = OverflowRisk * 10 + nextChar;
}
}

bytesConsumed = source.Length;
value = ((long)parsedValue) * sign;
// 'sign' is 0 for non-negative and -1 for negative. This allows us to perform
// cheap arithmetic + bitwise operations to mimic a multiplication by 1 or -1
// without incurring the cost of an actual multiplication operation.
//
// If sign = 0, this becomes value = (parsedValue ^ 0) - 0 = parsedValue
// If sign = -1, this becomes value = (parsedValue ^ -1) - (-1) = ~parsedValue + 1 = -parsedValue

bytesConsumed = idx;
value = ((long)parsedValue ^ sign) - sign;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚀

The same could be done to other types too (like int32 above, etc.)?

return true;

FalseExit:
bytesConsumed = 0;
value = default;
return false;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -277,77 +277,83 @@ private static bool TryParseUInt32D(ReadOnlySpan<byte> source, out uint value, o

private static bool TryParseUInt64D(ReadOnlySpan<byte> source, out ulong value, out int bytesConsumed)
{
if (source.Length < 1)
if (source.IsEmpty)
{
bytesConsumed = 0;
value = default;
return false;
goto FalseExit;
}

// We use 'nuint' for the firstDigit and nextChar data types in this method because
// it gives us a free early zero-extension to 64 bits when running on a 64-bit platform.
//
// Parse the first digit separately. If invalid here, we need to return false.
ulong firstDigit = source[0] - 48u; // '0'
if (firstDigit > 9)
{
bytesConsumed = 0;
value = default;
return false;
}

nuint firstDigit = (uint)source[0] - '0';
if ((uint)firstDigit > 9) { goto FalseExit; }
ulong parsedValue = firstDigit;

if (source.Length < ParserHelpers.Int64OverflowLength)
// At this point, we successfully read a single digit character.
// The only failure condition from here on out is integer overflow.

int idx = 1;
if (source.Length < ParserHelpers.UInt64OverflowLength)
{
// Length is less than Parsers.Int64OverflowLength; overflow is not possible
for (int index = 1; index < source.Length; index++)
// If the input span is short enough such that integer overflow isn't an issue,
// don't bother performing overflow checks. Just keep shifting in new digits
// until we see a non-digit character or until we've exhausted our input buffer.

while (true)
{
ulong nextDigit = source[index] - 48u; // '0'
if (nextDigit > 9)
{
bytesConsumed = index;
value = parsedValue;
return true;
}
parsedValue = parsedValue * 10 + nextDigit;
if ((uint)idx >= (uint)source.Length) { break; } // EOF
nuint nextChar = (uint)source[idx] - '0';
if ((uint)nextChar > 9) { break; } // not a digit
parsedValue = parsedValue * 10 + nextChar;
idx++;
}
}
else
{
// Length is greater than Parsers.Int64OverflowLength; overflow is only possible after Parsers.Int64OverflowLength
// digits. There may be no overflow after Parsers.Int64OverflowLength if there are leading zeroes.
for (int index = 1; index < ParserHelpers.Int64OverflowLength - 1; index++)
while (true)
{
ulong nextDigit = source[index] - 48u; // '0'
if (nextDigit > 9)
{
bytesConsumed = index;
value = parsedValue;
return true;
}
parsedValue = parsedValue * 10 + nextDigit;
}
for (int index = ParserHelpers.Int64OverflowLength - 1; index < source.Length; index++)
{
ulong nextDigit = source[index] - 48u; // '0'
if (nextDigit > 9)
if ((uint)idx >= (uint)source.Length) { break; } // EOF
nuint nextChar = (uint)source[idx] - '0';
if ((uint)nextChar > 9) { break; } // not a digit
idx++;

// The const below is the smallest unsigned x for which "x * 10 + 9"
// might overflow ulong.MaxValue. If the current accumulator is below
// this const, there's no risk of overflowing.

const ulong OverflowRisk = 0x1999_9999_9999_9999ul;

if (parsedValue < OverflowRisk)
{
bytesConsumed = index;
value = parsedValue;
return true;
parsedValue = parsedValue * 10 + nextChar;
continue;
}
// If parsedValue > (ulong.MaxValue / 10), any more appended digits will cause overflow.
// if parsedValue == (ulong.MaxValue / 10), any nextDigit greater than 5 implies overflow.
if (parsedValue > ulong.MaxValue / 10 || (parsedValue == ulong.MaxValue / 10 && nextDigit > 5))

// If the current accumulator is exactly equal to the const above,
// then "accumulator * 10 + 5" is the highest we can go without overflowing
// ulong.MaxValue. This also implies that if the current accumulator
// is higher than the const above, there's no hope that we'll succeed,
// so we may as well just fail now.

if (parsedValue != OverflowRisk || (uint)nextChar > 5)
{
bytesConsumed = 0;
value = default;
return false;
goto FalseExit;
}
parsedValue = parsedValue * 10 + nextDigit;

parsedValue = OverflowRisk * 10 + nextChar;
}
}

bytesConsumed = source.Length;
bytesConsumed = idx;
value = parsedValue;
return true;

FalseExit:
bytesConsumed = 0;
value = default;
return false;
}
}
}