Improve perf of Utf8Parser.TryParse(out [u]long, default) (#52423)

dotnet · Jul 12, 2021 · f787f38 · f787f38
1 parent 1509b1a
commit f787f38
Show file tree

Hide file tree

Showing 2 changed files with 142 additions and 122 deletions.
diff --git a/.../System.Private.CoreLib/src/System/Buffers/Text/Utf8Parser/Utf8Parser.Integer.Signed.D.cs b/.../System.Private.CoreLib/src/System/Buffers/Text/Utf8Parser/Utf8Parser.Integer.Signed.D.cs
@@ -336,107 +336,121 @@ private static bool TryParseInt32D(ReadOnlySpan<byte> source, out int value, out
 
         private static bool TryParseInt64D(ReadOnlySpan<byte> source, out long value, out int bytesConsumed)
         {
-            if (source.Length < 1)
-            {
-                bytesConsumed = 0;
-                value = default;
-                return false;
-            }
+            long sign = 0; // 0 if the value is positive, -1 if the value is negative
+            int idx = 0;
 
-            int indexOfFirstDigit = 0;
-            int sign = 1;
-            if (source[0] == '-')
+            // We use 'nuint' for the firstChar and nextChar data types in this method because
+            // it gives us a free early zero-extension to 64 bits when running on a 64-bit platform.
+
+            nuint firstChar;
+            while (true)
             {
-                indexOfFirstDigit = 1;
-                sign = -1;
+                if ((uint)idx >= (uint)source.Length) { goto FalseExit; }
+                firstChar = (uint)source[idx] - '0';
+                if ((uint)firstChar <= 9) { break; }
 
-                if (source.Length <= indexOfFirstDigit)
+                // We saw something that wasn't a digit. If it's a '+' or a '-',
+                // we'll set the 'sign' value appropriately and resume the "read
+                // first char" loop from the next index. If this loops more than
+                // once (idx != 0), it means we saw a sign character followed by
+                // a non-digit character, which should be considered an error.
+
+                if (idx != 0)
                 {
-                    bytesConsumed = 0;
-                    value = default;
-                    return false;
+                    goto FalseExit;
                 }
-            }
-            else if (source[0] == '+')
-            {
-                indexOfFirstDigit = 1;
 
-                if (source.Length <= indexOfFirstDigit)
+                idx++;
+
+                if ((uint)firstChar == unchecked((uint)('-' - '0')))
+                {
+                    sign--; // set to -1
+                }
+                else if ((uint)firstChar != unchecked((uint)('+' - '0')))
                 {
-                    bytesConsumed = 0;
-                    value = default;
-                    return false;
+                    goto FalseExit; // not a digit, not '-', and not '+'; fail
                 }
             }
 
-            int overflowLength = ParserHelpers.Int64OverflowLength + indexOfFirstDigit;
+            ulong parsedValue = firstChar;
+            int overflowLength = ParserHelpers.Int64OverflowLength + idx; // +idx to account for any sign char we read
+            idx++;
 
-            // Parse the first digit separately. If invalid here, we need to return false.
-            long firstDigit = source[indexOfFirstDigit] - 48; // '0'
-            if (firstDigit < 0 || firstDigit > 9)
-            {
-                bytesConsumed = 0;
-                value = default;
-                return false;
-            }
-            ulong parsedValue = (ulong)firstDigit;
+            // At this point, we successfully read a single digit character.
+            // The only failure condition from here on out is integer overflow.
 
             if (source.Length < overflowLength)
             {
-                // Length is less than Parsers.Int64OverflowLength; overflow is not possible
-                for (int index = indexOfFirstDigit + 1; index < source.Length; index++)
+                // If the input span is short enough such that integer overflow isn't an issue,
+                // don't bother performing overflow checks. Just keep shifting in new digits
+                // until we see a non-digit character or until we've exhausted our input buffer.
+
+                while (true)
                 {
-                    long nextDigit = source[index] - 48; // '0'
-                    if (nextDigit < 0 || nextDigit > 9)
-                    {
-                        bytesConsumed = index;
-                        value = ((long)parsedValue) * sign;
-                        return true;
-                    }
-                    parsedValue = parsedValue * 10 + (ulong)nextDigit;
+                    if ((uint)idx >= (uint)source.Length) { break; } // EOF
+                    nuint nextChar = (uint)source[idx] - '0';
+                    if ((uint)nextChar > 9) { break; } // not a digit
+                    parsedValue = parsedValue * 10 + nextChar;
+                    idx++;
                 }
             }
             else
             {
-                // Length is greater than Parsers.Int64OverflowLength; overflow is only possible after Parsers.Int64OverflowLength
-                // digits. There may be no overflow after Parsers.Int64OverflowLength if there are leading zeroes.
-                for (int index = indexOfFirstDigit + 1; index < overflowLength - 1; index++)
-                {
-                    long nextDigit = source[index] - 48; // '0'
-                    if (nextDigit < 0 || nextDigit > 9)
-                    {
-                        bytesConsumed = index;
-                        value = ((long)parsedValue) * sign;
-                        return true;
-                    }
-                    parsedValue = parsedValue * 10 + (ulong)nextDigit;
-                }
-                for (int index = overflowLength - 1; index < source.Length; index++)
+                while (true)
                 {
-                    long nextDigit = source[index] - 48; // '0'
-                    if (nextDigit < 0 || nextDigit > 9)
+                    if ((uint)idx >= (uint)source.Length) { break; } // EOF
+                    nuint nextChar = (uint)source[idx] - '0';
+                    if ((uint)nextChar > 9) { break; } // not a digit
+                    idx++;
+
+                    // The const below is the smallest unsigned x for which "x * 10 + 9"
+                    // might overflow long.MaxValue. If the current accumulator is below
+                    // this const, there's no risk of overflowing.
+
+                    const ulong OverflowRisk = 0x0CCC_CCCC_CCCC_CCCCul;
+
+                    if (parsedValue < OverflowRisk)
                     {
-                        bytesConsumed = index;
-                        value = ((long)parsedValue) * sign;
-                        return true;
+                        parsedValue = parsedValue * 10 + nextChar;
+                        continue;
                     }
-                    // If parsedValue > (long.MaxValue / 10), any more appended digits will cause overflow.
-                    // if parsedValue == (long.MaxValue / 10), any nextDigit greater than 7 or 8 (depending on sign) implies overflow.
-                    bool positive = sign > 0;
-                    bool nextDigitTooLarge = nextDigit > 8 || (positive && nextDigit > 7);
-                    if (parsedValue > long.MaxValue / 10 || parsedValue == long.MaxValue / 10 && nextDigitTooLarge)
+
+                    // If the current accumulator is exactly equal to the const above,
+                    // then "accumulator * 10 + 7" is the highest we can go without overflowing
+                    // long.MaxValue. (If we know the value is negative, we can instead allow
+                    // +8, since the range of negative numbers is one higher than the range of
+                    // positive numbers.) This also implies that if the current accumulator
+                    // is higher than the const above, there's no hope that we'll succeed,
+                    // so we may as well just fail now.
+                    //
+                    // The (nextChar + sign) trick below works because sign is 0 or -1,
+                    // so if sign is -1 then this actually checks that nextChar > 8.
+                    // n.b. signed arithmetic below because nextChar may be 0.
+
+                    if (parsedValue != OverflowRisk || (int)nextChar + (int)sign > 7)
                     {
-                        bytesConsumed = 0;
-                        value = default;
-                        return false;
+                        goto FalseExit;
                     }
-                    parsedValue = parsedValue * 10 + (ulong)nextDigit;
+
+                    parsedValue = OverflowRisk * 10 + nextChar;
                 }
             }
 
-            bytesConsumed = source.Length;
-            value = ((long)parsedValue) * sign;
+            // 'sign' is 0 for non-negative and -1 for negative. This allows us to perform
+            // cheap arithmetic + bitwise operations to mimic a multiplication by 1 or -1
+            // without incurring the cost of an actual multiplication operation.
+            //
+            // If sign = 0,  this becomes value = (parsedValue ^  0) -   0  = parsedValue
+            // If sign = -1, this becomes value = (parsedValue ^ -1) - (-1) = ~parsedValue + 1 = -parsedValue
+
+            bytesConsumed = idx;
+            value = ((long)parsedValue ^ sign) - sign;
             return true;
+
+        FalseExit:
+            bytesConsumed = 0;
+            value = default;
+            return false;
         }
     }
 }
diff --git a/...ystem.Private.CoreLib/src/System/Buffers/Text/Utf8Parser/Utf8Parser.Integer.Unsigned.D.cs b/...ystem.Private.CoreLib/src/System/Buffers/Text/Utf8Parser/Utf8Parser.Integer.Unsigned.D.cs
@@ -277,77 +277,83 @@ private static bool TryParseUInt32D(ReadOnlySpan<byte> source, out uint value, o
 
         private static bool TryParseUInt64D(ReadOnlySpan<byte> source, out ulong value, out int bytesConsumed)
         {
-            if (source.Length < 1)
+            if (source.IsEmpty)
             {
-                bytesConsumed = 0;
-                value = default;
-                return false;
+                goto FalseExit;
             }
 
+            // We use 'nuint' for the firstDigit and nextChar data types in this method because
+            // it gives us a free early zero-extension to 64 bits when running on a 64-bit platform.
+            //
             // Parse the first digit separately. If invalid here, we need to return false.
-            ulong firstDigit = source[0] - 48u; // '0'
-            if (firstDigit > 9)
-            {
-                bytesConsumed = 0;
-                value = default;
-                return false;
-            }
+
+            nuint firstDigit = (uint)source[0] - '0';
+            if ((uint)firstDigit > 9) { goto FalseExit; }
             ulong parsedValue = firstDigit;
 
-            if (source.Length < ParserHelpers.Int64OverflowLength)
+            // At this point, we successfully read a single digit character.
+            // The only failure condition from here on out is integer overflow.
+
+            int idx = 1;
+            if (source.Length < ParserHelpers.UInt64OverflowLength)
             {
-                // Length is less than Parsers.Int64OverflowLength; overflow is not possible
-                for (int index = 1; index < source.Length; index++)
+                // If the input span is short enough such that integer overflow isn't an issue,
+                // don't bother performing overflow checks. Just keep shifting in new digits
+                // until we see a non-digit character or until we've exhausted our input buffer.
+
+                while (true)
                 {
-                    ulong nextDigit = source[index] - 48u; // '0'
-                    if (nextDigit > 9)
-                    {
-                        bytesConsumed = index;
-                        value = parsedValue;
-                        return true;
-                    }
-                    parsedValue = parsedValue * 10 + nextDigit;
+                    if ((uint)idx >= (uint)source.Length) { break; } // EOF
+                    nuint nextChar = (uint)source[idx] - '0';
+                    if ((uint)nextChar > 9) { break; } // not a digit
+                    parsedValue = parsedValue * 10 + nextChar;
+                    idx++;
                 }
             }
             else
             {
-                // Length is greater than Parsers.Int64OverflowLength; overflow is only possible after Parsers.Int64OverflowLength
-                // digits. There may be no overflow after Parsers.Int64OverflowLength if there are leading zeroes.
-                for (int index = 1; index < ParserHelpers.Int64OverflowLength - 1; index++)
+                while (true)
                 {
-                    ulong nextDigit = source[index] - 48u; // '0'
-                    if (nextDigit > 9)
-                    {
-                        bytesConsumed = index;
-                        value = parsedValue;
-                        return true;
-                    }
-                    parsedValue = parsedValue * 10 + nextDigit;
-                }
-                for (int index = ParserHelpers.Int64OverflowLength - 1; index < source.Length; index++)
-                {
-                    ulong nextDigit = source[index] - 48u; // '0'
-                    if (nextDigit > 9)
+                    if ((uint)idx >= (uint)source.Length) { break; } // EOF
+                    nuint nextChar = (uint)source[idx] - '0';
+                    if ((uint)nextChar > 9) { break; } // not a digit
+                    idx++;
+
+                    // The const below is the smallest unsigned x for which "x * 10 + 9"
+                    // might overflow ulong.MaxValue. If the current accumulator is below
+                    // this const, there's no risk of overflowing.
+
+                    const ulong OverflowRisk = 0x1999_9999_9999_9999ul;
+
+                    if (parsedValue < OverflowRisk)
                     {
-                        bytesConsumed = index;
-                        value = parsedValue;
-                        return true;
+                        parsedValue = parsedValue * 10 + nextChar;
+                        continue;
                     }
-                    // If parsedValue > (ulong.MaxValue / 10), any more appended digits will cause overflow.
-                    // if parsedValue == (ulong.MaxValue / 10), any nextDigit greater than 5 implies overflow.
-                    if (parsedValue > ulong.MaxValue / 10 || (parsedValue == ulong.MaxValue / 10 && nextDigit > 5))
+
+                    // If the current accumulator is exactly equal to the const above,
+                    // then "accumulator * 10 + 5" is the highest we can go without overflowing
+                    // ulong.MaxValue. This also implies that if the current accumulator
+                    // is higher than the const above, there's no hope that we'll succeed,
+                    // so we may as well just fail now.
+
+                    if (parsedValue != OverflowRisk || (uint)nextChar > 5)
                     {
-                        bytesConsumed = 0;
-                        value = default;
-                        return false;
+                        goto FalseExit;
                     }
-                    parsedValue = parsedValue * 10 + nextDigit;
+
+                    parsedValue = OverflowRisk * 10 + nextChar;
                 }
             }
 
-            bytesConsumed = source.Length;
+            bytesConsumed = idx;
             value = parsedValue;
             return true;
+
+        FalseExit:
+            bytesConsumed = 0;
+            value = default;
+            return false;
         }
     }
 }