From d89f52300f5b3bb69a79c4333edab441688fd81f Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 29 Jun 2024 20:00:54 +0200 Subject: [PATCH 1/4] Integrate SimdUnicode --- THIRD-PARTY-NOTICES.TXT | 29 ++ .../Text/Unicode/Utf8Utility.Validation.cs | 370 ++++++++++++++++++ 2 files changed, 399 insertions(+) diff --git a/THIRD-PARTY-NOTICES.TXT b/THIRD-PARTY-NOTICES.TXT index 1f17cf6bc1106..0e60dfc0d892d 100644 --- a/THIRD-PARTY-NOTICES.TXT +++ b/THIRD-PARTY-NOTICES.TXT @@ -760,6 +760,35 @@ License for fastmod (https://github.com/lemire/fastmod), ibm-fpgen (https://gith See the License for the specific language governing permissions and limitations under the License. +License for SimdUnicode (https://github.com/simdutf/SimdUnicode) +-------------------------------------- + + Copyright 2024 Daniel Lemire, Nick Nuon + Which is based on "Validating UTF-8 In Less Than One Instruction Per Byte" article available at https://arxiv.org/abs/2010.03090 + (c) John Keiser, Daniel Lemire + + MIT License + + Copyright (c) 2023 SimdUnicode authors + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in all + copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. + License for sse4-strstr (https://github.com/WojciechMula/sse4-strstr) -------------------------------------- diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index a542dad72b5c3..43facd91bc93d 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -57,6 +57,11 @@ internal static unsafe partial class Utf8Utility goto ProcessInputOfLessThanDWordSize; } + if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported && inputLength > 64) + { + return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); + } + byte* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - sizeof(uint); // Begin the main loop. @@ -753,5 +758,370 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit extractedBits = AdvSimd.Arm64.AddPairwise(extractedBits, extractedBits); return extractedBits.AsUInt64().ToScalar(); } + + // The following SIMD acceleration is based on the SimdUnicode library (https://github.com/simdutf/SimdUnicode) + // (c) Daniel Lemire, Nick Nuon. 
+ // Which is based on "Validating UTF-8 In Less Than One Instruction Per Byte" article available at https://arxiv.org/abs/2010.03090 + // (c) John Keiser, Daniel Lemire + + [CompExactlyDependsOn(typeof(Avx512Vbmi))] + private static byte* GetPointerToFirstInvalidByteAvx512(byte* pInputBuffer, int inputLength, + out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) + { + Debug.Assert(inputLength > 64); + + const byte tooShort = 1 << 0; + const byte tooLong = 1 << 1; + const byte overlong3 = 1 << 2; + const byte surrogate = 1 << 4; + const byte overlong2 = 1 << 5; + const byte twoConts = 1 << 7; + const byte tooLarge = 1 << 3; + const byte tooLarge1000 = 1 << 6; + const byte overlong4 = 1 << 6; + const byte carry = tooShort | tooLong | twoConts; + + int processedLength = 0; + Vector512 prevInputBlock = Vector512.Zero; + Vector512 maxValue = Vector512.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); + Vector512 prevIncomplete = Avx512BW.SubtractSaturate(prevInputBlock, maxValue); + Vector512 shuf1 = Vector512.Create(tooLong, tooLong, tooLong, tooLong, + tooLong, tooLong, tooLong, tooLong, + twoConts, twoConts, twoConts, twoConts, + tooShort | overlong2, + tooShort, + tooShort | overlong3 | surrogate, + tooShort | tooLarge | tooLarge1000 | overlong4, + tooLong, tooLong, tooLong, tooLong, + tooLong, tooLong, tooLong, tooLong, + twoConts, twoConts, twoConts, twoConts, + tooShort | overlong2, + tooShort, + tooShort | overlong3 | surrogate, + tooShort | tooLarge | tooLarge1000 | overlong4, + tooLong, tooLong, tooLong, tooLong, + tooLong, tooLong, tooLong, tooLong, + twoConts, twoConts, twoConts, twoConts, + tooShort | overlong2, + tooShort, + 
tooShort | overlong3 | surrogate, + tooShort | tooLarge | tooLarge1000 | overlong4, + tooLong, tooLong, tooLong, tooLong, + tooLong, tooLong, tooLong, tooLong, + twoConts, twoConts, twoConts, twoConts, + tooShort | overlong2, + tooShort, + tooShort | overlong3 | surrogate, + tooShort | tooLarge | tooLarge1000 | overlong4); + Vector512 shuf2 = Vector512.Create( + carry | overlong3 | overlong2 | overlong4, + carry | overlong2, + carry, carry, + carry | tooLarge, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000 | surrogate, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | overlong3 | overlong2 | overlong4, + carry | overlong2, + carry, carry, + carry | tooLarge, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000 | surrogate, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | overlong3 | overlong2 | overlong4, + carry | overlong2, + carry, carry, + carry | tooLarge, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000 | surrogate, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | overlong3 | overlong2 | overlong4, + carry | overlong2, + carry, carry, + carry | tooLarge, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, 
+ carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000 | surrogate, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000); + Vector512 shuf3 = Vector512.Create(tooShort, tooShort, tooShort, tooShort, + tooShort, tooShort, tooShort, tooShort, + tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, + tooLong | overlong2 | twoConts | overlong3 | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, + tooShort, tooShort, tooShort, tooShort, + tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, + tooLong | overlong2 | twoConts | overlong3 | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooShort, tooShort, tooShort, tooShort, + tooShort, tooShort, tooShort, tooShort, + tooShort, tooShort, tooShort, tooShort, + tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, + tooLong | overlong2 | twoConts | overlong3 | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, + tooShort, tooShort, tooShort, tooShort, + tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, + tooLong | overlong2 | twoConts | overlong3 | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooShort, tooShort, tooShort, tooShort); + Vector512 thirdByte = Vector512.Create((byte)(0b11100000u - 0x80)); + Vector512 fourthByte = Vector512.Create((byte)(0b11110000u - 0x80)); + Vector512 v0f = 
Vector512.Create((byte)0x0F); + Vector512 v80 = Vector512.Create((byte)0x80); + + // The block goes from processedLength to processedLength/16*16. + int countBytes = 0; // number of continuation bytes in the block + int n4 = 0; // number of 4-byte sequences that start in this block + for (; processedLength + 64 <= inputLength; processedLength += 64) + { + Vector512 currentBlock = Vector512.Load(pInputBuffer + processedLength); + ulong mask = currentBlock.ExtractMostSignificantBits(); + if (mask == 0) + { + // We have an ASCII block, no need to process it, but + // we need to check if the previous block was incomplete. + if (Vector512.GreaterThan(prevIncomplete, Vector512.Zero).ExtractMostSignificantBits() != 0) + { + byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(16 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + // So the code is correct up to invalidBytePointer + if (invalidBytePointer < pInputBuffer + processedLength) + RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref countBytes); + else + AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref countBytes); + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSimdAdjustments(n4, countBytes); + return invalidBytePointer; + } + + prevIncomplete = Vector512.Zero; + + // Often, we have a lot of ASCII characters in a row. + int localAsciiRun = 64; + if (processedLength + localAsciiRun + 64 <= inputLength) + { + for (; processedLength + localAsciiRun + 64 <= inputLength; localAsciiRun += 64) + { + Vector512 block = Vector512.Load(pInputBuffer + processedLength + localAsciiRun); + if (block.ExtractMostSignificantBits() != 0) + break; + } + processedLength += localAsciiRun - 64; + } + } + else // Contains non-ASCII characters, we need to do non-trivial processing + { + // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. 
+ Vector512 moveMask = Vector512.Create(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); + Vector512 shuffled = Avx512F.PermuteVar16x32x2(currentBlock.AsInt32(), moveMask, prevInputBlock.AsInt32()).AsByte(); + prevInputBlock = currentBlock; + + Vector512 prev1 = Avx512BW.AlignRight(prevInputBlock, shuffled, 16 - 1); + + // takes the XXXX 0000 part of the previous byte + Vector512 byte1High = Avx512BW.Shuffle(shuf1, Avx512BW.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f); + Vector512 byte1Low = Avx512BW.Shuffle(shuf2, prev1 & v0f); // takes the 0000 XXXX part of the previous part + + // takes the XXXX 0000 part of the current byte + Vector512 byte2High = Avx512BW.Shuffle(shuf3, Avx512BW.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); + Vector512 sc = byte1High & byte1Low & byte2High; + Vector512 prev2 = Avx512BW.AlignRight(prevInputBlock, shuffled, 16 - 2); + Vector512 prev3 = Avx512BW.AlignRight(prevInputBlock, shuffled, 16 - 3); + Vector512 isThirdByte = Avx512BW.SubtractSaturate(prev2, thirdByte); + Vector512 isFourthByte = Avx512BW.SubtractSaturate(prev3, fourthByte); + Vector512 must23 = isThirdByte | isFourthByte; + Vector512 must23As80 = must23 & v80; + Vector512 error = must23As80 ^ sc; + + if (Vector512.GreaterThan(error, Vector512.Zero).ExtractMostSignificantBits() != 0) + { + byte* invalidBytePointer; + if (processedLength == 0) + invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength); + else + invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + if (invalidBytePointer < pInputBuffer + processedLength) + RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref countBytes); + else + AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref countBytes); + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = (n4, countBytes); + return invalidBytePointer; + } + + 
prevIncomplete = Avx512BW.SubtractSaturate(currentBlock, maxValue); + countBytes += BitOperations.PopCount(byte2High.ExtractMostSignificantBits()); + // We use two instructions (SubtractSaturate and ExtractMostSignificantBits) to update n4, with one arithmetic operation. + n4 += BitOperations.PopCount(Avx512BW.SubtractSaturate(currentBlock, fourthByte).ExtractMostSignificantBits()); + } + } + + // We may still have an error. + bool hasIncomplete = Vector512.GreaterThan(prevIncomplete, Vector512.Zero).ExtractMostSignificantBits() != 0; + if (processedLength < inputLength || hasIncomplete) + { + byte* invalidBytePointer; + if (processedLength == 0 || !hasIncomplete) + invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength); + else + invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + if (invalidBytePointer != pInputBuffer + inputLength) + { + if (invalidBytePointer < pInputBuffer + processedLength) + RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref countBytes); + else + AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref countBytes); + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSimdAdjustments(n4, countBytes); + return invalidBytePointer; + } + AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref countBytes); + } + + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSimdAdjustments(n4, countBytes); + return pInputBuffer + inputLength; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void RemoveCounters(byte* start, byte* end, ref int n4, ref int countBytes) + { + for (byte* p = start; p < end; p++) + { + if ((*p & 0b11000000) == 0b10000000) + countBytes -= 1; + if ((*p & 0b11110000) == 0b11110000) + n4 -= 1; + } + } + + 
[MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void AddCounters(byte* start, byte* end, ref int n4, ref int countBytes) + { + for (byte* p = start; p < end; p++) + { + if ((*p & 0b11000000) == 0b10000000) + countBytes += 1; + if ((*p & 0b11110000) == 0b11110000) + n4 += 1; + } + } + + // We scan the input from buf to len, possibly going back howFarBack bytes, to find the end of + // a valid UTF-8 sequence. We return buf + len if the buffer is valid, otherwise we return the + // pointer to the first invalid byte. + private static byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len) + { + int extraLen = 0; + bool foundLeadingBytes = false; + + for (int i = 0; i <= howFarBack; i++) + { + byte candidateByte = buf[0 - i]; + foundLeadingBytes = (candidateByte & 0b11000000) != 0b10000000; + if (foundLeadingBytes) + { + buf -= i; + extraLen = i; + break; + } + } + + if (!foundLeadingBytes) + return buf - howFarBack; + + int pos = 0; + len += extraLen; + while (pos < len) + { + byte firstByte = buf[pos]; + while (firstByte < 0b10000000) + { + if (++pos == len) + return buf + len; + firstByte = buf[pos]; + } + + int nextPos; + uint codePoint; + if ((firstByte & 0b11100000) == 0b11000000) + { + nextPos = pos + 2; + if (nextPos > len) + return buf + pos; // Too short + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + return buf + pos; // Too short + + // range check + codePoint = ((uint)(firstByte & 0b00011111) << 6) | (uint)(buf[pos + 1] & 0b00111111); + if (codePoint < 0x80 || 0x7ff < codePoint) + return buf + pos; // Overlong + } + else if ((firstByte & 0b11110000) == 0b11100000) + { + nextPos = pos + 3; + if (nextPos > len) + return buf + pos; // Too short + + // range check + codePoint = ((uint)(firstByte & 0b00001111) << 12) | ((uint)(buf[pos + 1] & 0b00111111) << 6) | (uint)(buf[pos + 2] & 0b00111111); + // Either overlong or too large: + if (codePoint < 0x800 || 0xffff < codePoint || (0xd7ff < codePoint && codePoint < 
0xe000)) + return buf + pos; + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + return buf + pos; // Too short + if ((buf[pos + 2] & 0b11000000) != 0b10000000) + return buf + pos; // Too short + } + else if ((firstByte & 0b11111000) == 0b11110000) + { + nextPos = pos + 4; + if (nextPos > len) + return buf + pos; + if ((buf[pos + 1] & 0b11000000) != 0b10000000) + return buf + pos; + if ((buf[pos + 2] & 0b11000000) != 0b10000000) + return buf + pos; + if ((buf[pos + 3] & 0b11000000) != 0b10000000) + return buf + pos; + // range check + codePoint = ((uint)(firstByte & 0b00000111) << 18) | ((uint)(buf[pos + 1] & 0b00111111) << 12) | + ((uint)(buf[pos + 2] & 0b00111111) << 6) | (uint)(buf[pos + 3] & 0b00111111); + if (codePoint <= 0xffff || 0x10ffff < codePoint) + return buf + pos; + } + else + { + // we may have a continuation/too long error + return buf + pos; + } + pos = nextPos; + } + return buf + len; // no error + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static (int utfadjust, int scalaradjust) CalculateN2N3FinalSimdAdjustments(int n4, int countBytes) + { + int n3 = -2 * n4 + 2 * countBytes; + int n2 = n4 - 3 * countBytes; + int utfAdjust = -2 * n4 - 2 * n3 - n2; + int scalarAdjust = -n4; + return (utfAdjust, scalarAdjust); + } } } From c068c0b30fa6278392abefd4923ba1fd2c33d469 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Sat, 29 Jun 2024 21:44:05 +0200 Subject: [PATCH 2/4] change to avx2 --- .../Text/Unicode/Utf8Utility.Validation.cs | 298 +++++++++--------- 1 file changed, 144 insertions(+), 154 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index 43facd91bc93d..1a71c88be78fd 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -57,9 +57,9 @@ 
internal static unsafe partial class Utf8Utility goto ProcessInputOfLessThanDWordSize; } - if (Vector512.IsHardwareAccelerated && Avx512Vbmi.IsSupported && inputLength > 64) + if (Vector256.IsHardwareAccelerated && Avx2.IsSupported && inputLength > 32) { - return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); + return GetPointerToFirstInvalidByte_Avx2(pInputBuffer, inputLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment); } byte* pFinalPosWhereCanReadDWordFromInputBuffer = pInputBuffer + (uint)inputLength - sizeof(uint); @@ -764,11 +764,11 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit // Which is based on "Validating UTF-8 In Less Than One Instruction Per Byte" article available at https://arxiv.org/abs/2010.03090 // (c) John Keiser, Daniel Lemire - [CompExactlyDependsOn(typeof(Avx512Vbmi))] - private static byte* GetPointerToFirstInvalidByteAvx512(byte* pInputBuffer, int inputLength, + [CompExactlyDependsOn(typeof(Avx2))] + private static byte* GetPointerToFirstInvalidByte_Avx2(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment) { - Debug.Assert(inputLength > 64); + Debug.Assert(inputLength > 32); const byte tooShort = 1 << 0; const byte tooLong = 1 << 1; @@ -782,217 +782,207 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit const byte carry = tooShort | tooLong | twoConts; int processedLength = 0; - Vector512 prevInputBlock = Vector512.Zero; - Vector512 maxValue = Vector512.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, - 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); - Vector512 prevIncomplete = 
Avx512BW.SubtractSaturate(prevInputBlock, maxValue); - Vector512 shuf1 = Vector512.Create(tooLong, tooLong, tooLong, tooLong, - tooLong, tooLong, tooLong, tooLong, - twoConts, twoConts, twoConts, twoConts, - tooShort | overlong2, - tooShort, - tooShort | overlong3 | surrogate, - tooShort | tooLarge | tooLarge1000 | overlong4, - tooLong, tooLong, tooLong, tooLong, - tooLong, tooLong, tooLong, tooLong, - twoConts, twoConts, twoConts, twoConts, - tooShort | overlong2, - tooShort, - tooShort | overlong3 | surrogate, - tooShort | tooLarge | tooLarge1000 | overlong4, - tooLong, tooLong, tooLong, tooLong, - tooLong, tooLong, tooLong, tooLong, - twoConts, twoConts, twoConts, twoConts, - tooShort | overlong2, - tooShort, - tooShort | overlong3 | surrogate, - tooShort | tooLarge | tooLarge1000 | overlong4, - tooLong, tooLong, tooLong, tooLong, - tooLong, tooLong, tooLong, tooLong, - twoConts, twoConts, twoConts, twoConts, - tooShort | overlong2, - tooShort, - tooShort | overlong3 | surrogate, - tooShort | tooLarge | tooLarge1000 | overlong4); - Vector512 shuf2 = Vector512.Create( - carry | overlong3 | overlong2 | overlong4, - carry | overlong2, - carry, carry, - carry | tooLarge, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000 | surrogate, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | overlong3 | overlong2 | overlong4, - carry | overlong2, - carry, carry, - carry | tooLarge, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000 | surrogate, - 
carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | overlong3 | overlong2 | overlong4, - carry | overlong2, - carry, carry, - carry | tooLarge, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000 | surrogate, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | overlong3 | overlong2 | overlong4, - carry | overlong2, - carry, carry, - carry | tooLarge, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000 | surrogate, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000); - Vector512 shuf3 = Vector512.Create(tooShort, tooShort, tooShort, tooShort, - tooShort, tooShort, tooShort, tooShort, - tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, - tooLong | overlong2 | twoConts | overlong3 | tooLarge, - tooLong | overlong2 | twoConts | surrogate | tooLarge, - tooLong | overlong2 | twoConts | surrogate | tooLarge, - tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, - tooShort, tooShort, tooShort, tooShort, - tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, - tooLong | overlong2 | twoConts | overlong3 | tooLarge, - tooLong | overlong2 | twoConts | surrogate | tooLarge, - tooLong | overlong2 | twoConts | surrogate | tooLarge, - tooShort, tooShort, tooShort, tooShort, - tooShort, tooShort, tooShort, tooShort, - tooShort, tooShort, tooShort, tooShort, - tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, - tooLong | overlong2 | 
twoConts | overlong3 | tooLarge, - tooLong | overlong2 | twoConts | surrogate | tooLarge, - tooLong | overlong2 | twoConts | surrogate | tooLarge, - tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, - tooShort, tooShort, tooShort, tooShort, - tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, - tooLong | overlong2 | twoConts | overlong3 | tooLarge, - tooLong | overlong2 | twoConts | surrogate | tooLarge, - tooLong | overlong2 | twoConts | surrogate | tooLarge, - tooShort, tooShort, tooShort, tooShort); - Vector512 thirdByte = Vector512.Create((byte)(0b11100000u - 0x80)); - Vector512 fourthByte = Vector512.Create((byte)(0b11110000u - 0x80)); - Vector512 v0f = Vector512.Create((byte)0x0F); - Vector512 v80 = Vector512.Create((byte)0x80); + Vector256 prevInputBlock = Vector256.Zero; + Vector256 maxValue = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, + 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); + Vector256 prevIncomplete = Avx2.SubtractSaturate(prevInputBlock, maxValue); + Vector256 shuf1 = Vector256.Create(tooLong, tooLong, tooLong, tooLong, + tooLong, tooLong, tooLong, tooLong, + twoConts, twoConts, twoConts, twoConts, + tooShort | overlong2, + tooShort, + tooShort | overlong3 | surrogate, + tooShort | tooLarge | tooLarge1000 | overlong4, + tooLong, tooLong, tooLong, tooLong, + tooLong, tooLong, tooLong, tooLong, + twoConts, twoConts, twoConts, twoConts, + tooShort | overlong2, + tooShort, + tooShort | overlong3 | surrogate, + tooShort | tooLarge | tooLarge1000 | overlong4); + Vector256 shuf2 = Vector256.Create(carry | overlong3 | overlong2 | overlong4, + carry | overlong2, + carry, carry, + carry | tooLarge, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | 
tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000 | surrogate, + carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, + carry | overlong3 | overlong2 | overlong4, + carry | overlong2, + carry, carry, + carry | tooLarge, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000 | surrogate, + carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000); + Vector256 shuf3 = Vector256.Create(tooShort, tooShort, tooShort, tooShort, + tooShort, tooShort, tooShort, tooShort, + tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, + tooLong | overlong2 | twoConts | overlong3 | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, + tooShort, tooShort, tooShort, tooShort, + tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, + tooLong | overlong2 | twoConts | overlong3 | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooShort, tooShort, tooShort, tooShort); + Vector256 thirdByte = Vector256.Create((byte)(0b11100000u - 0x80)); + Vector256 fourthByte = Vector256.Create((byte)(0b11110000u - 0x80)); + Vector256 v0f = Vector256.Create((byte)0x0F); + Vector256 v80 = Vector256.Create((byte)0x80); + // So we want to count the number of 4-byte sequences, + // the number of 3-byte sequences, and + // the number of 2-byte sequences. + // We can do it indirectly. We know how many bytes in total + // we have (length). 
Let us assume that the length covers + // only complete sequences (we need to adjust otherwise). + // We have that + // length = 4 * n4 + 3 * n3 + 2 * n2 + n1 + // where n1 is the number of 1-byte sequences (ASCII), + // n2 is the number of 2-byte sequences, n3 is the number + // of 3-byte sequences, and n4 is the number of 4-byte sequences. + // + // Let ncon be the number of continuation bytes, then we have + // length = n4 + n3 + n2 + ncon + n1 + // + // We can solve for n2 and n3 in terms of the other variables: + // n3 = n1 - 2 * n4 + 2 * ncon - length + // n2 = -2 * n1 + n4 - 3 * ncon + 2 * length + // Thus we only need to count the number of continuation bytes, + // the number of ASCII bytes and the number of 4-byte sequences. + // But we need even less because we compute + // utfadjust = -2 * n4 - 2 * n3 - n2 + // so n1 and length cancel out in the end. Thus we only need to compute + // n3' = - 2 * n4 + 2 * ncon + // n2' = n4 - 3 * ncon + // The *block* here is what begins at processedLength and ends + // at processedLength/16*16 or when an error occurs. // The block goes from processedLength to processedLength/16*16. int countBytes = 0; // number of continuation bytes in the block int n4 = 0; // number of 4-byte sequences that start in this block - for (; processedLength + 64 <= inputLength; processedLength += 64) + for (; processedLength + 32 <= inputLength; processedLength += 32) { - Vector512 currentBlock = Vector512.Load(pInputBuffer + processedLength); - ulong mask = currentBlock.ExtractMostSignificantBits(); + Vector256 currentBlock = Vector256.Load(pInputBuffer + processedLength); + int mask = (int)currentBlock.ExtractMostSignificantBits(); if (mask == 0) { // We have an ASCII block, no need to process it, but // we need to check if the previous block was incomplete. 
- if (Vector512.GreaterThan(prevIncomplete, Vector512.Zero).ExtractMostSignificantBits() != 0) + if (!Avx.TestZ(prevIncomplete, prevIncomplete)) { - byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(16 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(32 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); // So the code is correct up to invalidBytePointer if (invalidBytePointer < pInputBuffer + processedLength) + { RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref countBytes); + } else + { AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref countBytes); + } (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSimdAdjustments(n4, countBytes); return invalidBytePointer; } - - prevIncomplete = Vector512.Zero; + prevIncomplete = Vector256.Zero; // Often, we have a lot of ASCII characters in a row. - int localAsciiRun = 64; + int localAsciiRun = 32; if (processedLength + localAsciiRun + 64 <= inputLength) { for (; processedLength + localAsciiRun + 64 <= inputLength; localAsciiRun += 64) { - Vector512 block = Vector512.Load(pInputBuffer + processedLength + localAsciiRun); - if (block.ExtractMostSignificantBits() != 0) + Vector256 block1 = Vector256.Load(pInputBuffer + processedLength + localAsciiRun); + Vector256 block2 = Vector256.Load(pInputBuffer + processedLength + localAsciiRun + 32); + if ((block1 | block2).ExtractMostSignificantBits() != 0) + { break; + } } - processedLength += localAsciiRun - 64; + processedLength += localAsciiRun - 32; } + } else // Contains non-ASCII characters, we need to do non-trivial processing { // Use SubtractSaturate to effectively compare if bytes in block are greater than markers. 
- Vector512 moveMask = Vector512.Create(28, 29, 30, 31, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11); - Vector512 shuffled = Avx512F.PermuteVar16x32x2(currentBlock.AsInt32(), moveMask, prevInputBlock.AsInt32()).AsByte(); + Vector256 shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21); prevInputBlock = currentBlock; - - Vector512 prev1 = Avx512BW.AlignRight(prevInputBlock, shuffled, 16 - 1); - - // takes the XXXX 0000 part of the previous byte - Vector512 byte1High = Avx512BW.Shuffle(shuf1, Avx512BW.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f); - Vector512 byte1Low = Avx512BW.Shuffle(shuf2, prev1 & v0f); // takes the 0000 XXXX part of the previous part - - // takes the XXXX 0000 part of the current byte - Vector512 byte2High = Avx512BW.Shuffle(shuf3, Avx512BW.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); - Vector512 sc = byte1High & byte1Low & byte2High; - Vector512 prev2 = Avx512BW.AlignRight(prevInputBlock, shuffled, 16 - 2); - Vector512 prev3 = Avx512BW.AlignRight(prevInputBlock, shuffled, 16 - 3); - Vector512 isThirdByte = Avx512BW.SubtractSaturate(prev2, thirdByte); - Vector512 isFourthByte = Avx512BW.SubtractSaturate(prev3, fourthByte); - Vector512 must23 = isThirdByte | isFourthByte; - Vector512 must23As80 = must23 & v80; - Vector512 error = must23As80 ^ sc; - - if (Vector512.GreaterThan(error, Vector512.Zero).ExtractMostSignificantBits() != 0) + Vector256 prev1 = Avx2.AlignRight(prevInputBlock, shuffled, 16 - 1); + Vector256 byte1High = Avx2.Shuffle(shuf1, Avx2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);// takes the XXXX 0000 part of the previous byte + Vector256 byte1Low = Avx2.Shuffle(shuf2, prev1 & v0f); // takes the 0000 XXXX part of the previous part + Vector256 byte2High = Avx2.Shuffle(shuf3, Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); // takes the XXXX 0000 part of the current byte + Vector256 sc = byte1High & byte1Low & byte2High; + Vector256 prev2 = Avx2.AlignRight(prevInputBlock, 
shuffled, 16 - 2); + Vector256 prev3 = Avx2.AlignRight(prevInputBlock, shuffled, 16 - 3); + Vector256 isThirdByte = Avx2.SubtractSaturate(prev2, thirdByte); + Vector256 isFourthByte = Avx2.SubtractSaturate(prev3, fourthByte); + Vector256 must23 = isThirdByte | isFourthByte; + Vector256 must23As80 = must23 & v80; + Vector256 error = must23As80 ^ sc; + + if (!Avx.TestZ(error, error)) { - byte* invalidBytePointer; - if (processedLength == 0) - invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength); - else - invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + byte* invalidBytePointer = processedLength == 0 ? + SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength) : + SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + if (invalidBytePointer < pInputBuffer + processedLength) + { RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref countBytes); + } else + { AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref countBytes); - (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = (n4, countBytes); + } + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSimdAdjustments(n4, countBytes); return invalidBytePointer; } - prevIncomplete = Avx512BW.SubtractSaturate(currentBlock, maxValue); + prevIncomplete = Avx2.SubtractSaturate(currentBlock, maxValue); countBytes += BitOperations.PopCount(byte2High.ExtractMostSignificantBits()); - // We use two instructions (SubtractSaturate and ExtractMostSignificantBits) to update n4, with one arithmetic operation. - n4 += BitOperations.PopCount(Avx512BW.SubtractSaturate(currentBlock, fourthByte).ExtractMostSignificantBits()); + // We use two instructions (SubtractSaturate and MoveMask) to update n4, with one arithmetic operation. 
+ n4 += BitOperations.PopCount(Avx2.SubtractSaturate(currentBlock, fourthByte).ExtractMostSignificantBits()); } } - // We may still have an error. - bool hasIncomplete = Vector512.GreaterThan(prevIncomplete, Vector512.Zero).ExtractMostSignificantBits() != 0; + bool hasIncomplete = !Avx.TestZ(prevIncomplete, prevIncomplete); if (processedLength < inputLength || hasIncomplete) { byte* invalidBytePointer; if (processedLength == 0 || !hasIncomplete) + { invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength); + } else + { invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + } if (invalidBytePointer != pInputBuffer + inputLength) { if (invalidBytePointer < pInputBuffer + processedLength) + { RemoveCounters(invalidBytePointer, pInputBuffer + processedLength, ref n4, ref countBytes); + } else + { AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref countBytes); + } (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSimdAdjustments(n4, countBytes); return invalidBytePointer; } AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref countBytes); } - (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSimdAdjustments(n4, countBytes); return pInputBuffer + inputLength; } From 83e78920b4a12041fe3140b80f41e52f748adee3 Mon Sep 17 00:00:00 2001 From: EgorBo Date: Tue, 2 Jul 2024 19:53:57 +0200 Subject: [PATCH 3/4] Code clean up --- .../Text/Unicode/Utf8Utility.Validation.cs | 235 +++++++++++------- 1 file changed, 152 insertions(+), 83 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index 1a71c88be78fd..d70b870a9b9db 100644 --- 
a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -787,60 +787,66 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit Vector256 maxValue = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0b11110000 - 1, 0b11100000 - 1, 0b11000000 - 1); + Vector256 prevIncomplete = Avx2.SubtractSaturate(prevInputBlock, maxValue); - Vector256 shuf1 = Vector256.Create(tooLong, tooLong, tooLong, tooLong, - tooLong, tooLong, tooLong, tooLong, - twoConts, twoConts, twoConts, twoConts, - tooShort | overlong2, - tooShort, - tooShort | overlong3 | surrogate, - tooShort | tooLarge | tooLarge1000 | overlong4, - tooLong, tooLong, tooLong, tooLong, - tooLong, tooLong, tooLong, tooLong, - twoConts, twoConts, twoConts, twoConts, - tooShort | overlong2, - tooShort, - tooShort | overlong3 | surrogate, - tooShort | tooLarge | tooLarge1000 | overlong4); - Vector256 shuf2 = Vector256.Create(carry | overlong3 | overlong2 | overlong4, - carry | overlong2, - carry, carry, - carry | tooLarge, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000 | surrogate, - carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, - carry | overlong3 | overlong2 | overlong4, - carry | overlong2, - carry, carry, - carry | tooLarge, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, 
- carry | tooLarge | tooLarge1000 | surrogate, - carry | tooLarge | tooLarge1000, - carry | tooLarge | tooLarge1000); - Vector256 shuf3 = Vector256.Create(tooShort, tooShort, tooShort, tooShort, - tooShort, tooShort, tooShort, tooShort, - tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, - tooLong | overlong2 | twoConts | overlong3 | tooLarge, - tooLong | overlong2 | twoConts | surrogate | tooLarge, - tooLong | overlong2 | twoConts | surrogate | tooLarge, - tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, - tooShort, tooShort, tooShort, tooShort, - tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, - tooLong | overlong2 | twoConts | overlong3 | tooLarge, - tooLong | overlong2 | twoConts | surrogate | tooLarge, - tooLong | overlong2 | twoConts | surrogate | tooLarge, - tooShort, tooShort, tooShort, tooShort); + Vector256 shuf1 = Vector256.Create( + tooLong, tooLong, tooLong, tooLong, tooLong, tooLong, tooLong, tooLong, + twoConts, twoConts, twoConts, twoConts, + tooShort | overlong2, + tooShort, + tooShort | overlong3 | surrogate, + tooShort | tooLarge | tooLarge1000 | overlong4, + tooLong, tooLong, tooLong, tooLong, tooLong, tooLong, tooLong, tooLong, + twoConts, twoConts, twoConts, twoConts, + tooShort | overlong2, + tooShort, + tooShort | overlong3 | surrogate, + tooShort | tooLarge | tooLarge1000 | overlong4); + + Vector256 shuf2 = Vector256.Create( + carry | overlong3 | overlong2 | overlong4, + carry | overlong2, + carry, carry, + carry | tooLarge, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000 | surrogate, + carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, + carry | overlong3 | overlong2 | overlong4, + carry | 
overlong2, + carry, carry, + carry | tooLarge, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000, carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000 | surrogate, + carry | tooLarge | tooLarge1000, + carry | tooLarge | tooLarge1000); + + Vector256 shuf3 = Vector256.Create( + tooShort, tooShort, tooShort, tooShort, + tooShort, tooShort, tooShort, tooShort, + tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, + tooLong | overlong2 | twoConts | overlong3 | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, tooShort, + tooShort, tooShort, tooShort, tooShort, + tooLong | overlong2 | twoConts | overlong3 | tooLarge1000 | overlong4, + tooLong | overlong2 | twoConts | overlong3 | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooLong | overlong2 | twoConts | surrogate | tooLarge, + tooShort, tooShort, tooShort, tooShort); + Vector256 thirdByte = Vector256.Create((byte)(0b11100000u - 0x80)); Vector256 fourthByte = Vector256.Create((byte)(0b11110000u - 0x80)); Vector256 v0f = Vector256.Create((byte)0x0F); Vector256 v80 = Vector256.Create((byte)0x80); + // So we want to count the number of 4-byte sequences, // the number of 4-byte sequences, 3-byte sequences, and // the number of 2-byte sequences. @@ -869,9 +875,10 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit // The *block* here is what begins at processedLength and ends // at processedLength/16*16 or when an error occurs. // The block goes from processedLength to processedLength/16*16. 
+ int countBytes = 0; // number of continuation bytes in the block int n4 = 0; // number of 4-byte sequences that start in this block - for (; processedLength + 32 <= inputLength; processedLength += 32) + for (; processedLength + Vector256.Count <= inputLength; processedLength += Vector256.Count) { Vector256 currentBlock = Vector256.Load(pInputBuffer + processedLength); int mask = (int)currentBlock.ExtractMostSignificantBits(); @@ -881,7 +888,8 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit // we need to check if the previous block was incomplete. if (!Avx.TestZ(prevIncomplete, prevIncomplete)) { - byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(32 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + byte* invalidBytePointer = SimpleRewindAndValidateWithErrors(Vector256.Count - 3, + pInputBuffer + processedLength - 3, inputLength - processedLength + 3); // So the code is correct up to invalidBytePointer if (invalidBytePointer < pInputBuffer + processedLength) { @@ -891,25 +899,26 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit { AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref countBytes); } + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSimdAdjustments(n4, countBytes); return invalidBytePointer; } prevIncomplete = Vector256.Zero; // Often, we have a lot of ASCII characters in a row. 
- int localAsciiRun = 32; - if (processedLength + localAsciiRun + 64 <= inputLength) + int localAsciiRun = Vector256.Count; + if (processedLength + localAsciiRun + (Vector256.Count * 2) <= inputLength) { - for (; processedLength + localAsciiRun + 64 <= inputLength; localAsciiRun += 64) + for (; processedLength + localAsciiRun + (Vector256.Count * 2) <= inputLength; localAsciiRun += (Vector256.Count * 2)) { Vector256 block1 = Vector256.Load(pInputBuffer + processedLength + localAsciiRun); - Vector256 block2 = Vector256.Load(pInputBuffer + processedLength + localAsciiRun + 32); + Vector256 block2 = Vector256.Load(pInputBuffer + processedLength + localAsciiRun + Vector256.Count); if ((block1 | block2).ExtractMostSignificantBits() != 0) { break; } } - processedLength += localAsciiRun - 32; + processedLength += localAsciiRun - Vector256.Count; } } @@ -919,9 +928,15 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit Vector256 shuffled = Avx2.Permute2x128(prevInputBlock, currentBlock, 0x21); prevInputBlock = currentBlock; Vector256 prev1 = Avx2.AlignRight(prevInputBlock, shuffled, 16 - 1); - Vector256 byte1High = Avx2.Shuffle(shuf1, Avx2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f);// takes the XXXX 0000 part of the previous byte - Vector256 byte1Low = Avx2.Shuffle(shuf2, prev1 & v0f); // takes the 0000 XXXX part of the previous part - Vector256 byte2High = Avx2.Shuffle(shuf3, Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); // takes the XXXX 0000 part of the current byte + + // Takes the XXXX 0000 part of the previous byte + Vector256 byte1High = Avx2.Shuffle(shuf1, Avx2.ShiftRightLogical(prev1.AsUInt16(), 4).AsByte() & v0f); + + // Takes the 0000 XXXX part of the previous part + Vector256 byte1Low = Avx2.Shuffle(shuf2, prev1 & v0f); + + // Takes the XXXX 0000 part of the current byte + Vector256 byte2High = Avx2.Shuffle(shuf3, Avx2.ShiftRightLogical(currentBlock.AsUInt16(), 4).AsByte() & v0f); Vector256 sc = byte1High & 
byte1Low & byte2High; Vector256 prev2 = Avx2.AlignRight(prevInputBlock, shuffled, 16 - 2); Vector256 prev3 = Avx2.AlignRight(prevInputBlock, shuffled, 16 - 3); @@ -945,6 +960,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit { AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref countBytes); } + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSimdAdjustments(n4, countBytes); return invalidBytePointer; } @@ -962,11 +978,13 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit byte* invalidBytePointer; if (processedLength == 0 || !hasIncomplete) { - invalidBytePointer = SimpleRewindAndValidateWithErrors(0, pInputBuffer + processedLength, inputLength - processedLength); + invalidBytePointer = SimpleRewindAndValidateWithErrors( + 0, pInputBuffer + processedLength, inputLength - processedLength); } else { - invalidBytePointer = SimpleRewindAndValidateWithErrors(3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); + invalidBytePointer = SimpleRewindAndValidateWithErrors( + 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3); } if (invalidBytePointer != pInputBuffer + inputLength) { @@ -978,11 +996,13 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit { AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref countBytes); } + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSimdAdjustments(n4, countBytes); return invalidBytePointer; } AddCounters(pInputBuffer + processedLength, invalidBytePointer, ref n4, ref countBytes); } + (utf16CodeUnitCountAdjustment, scalarCountAdjustment) = CalculateN2N3FinalSimdAdjustments(n4, countBytes); return pInputBuffer + inputLength; } @@ -993,9 +1013,13 @@ private static void RemoveCounters(byte* start, byte* end, ref int n4, ref int c for (byte* p = start; p < end; p++) { if ((*p & 0b11000000) == 0b10000000) + { countBytes -= 1; + } if ((*p & 
0b11110000) == 0b11110000) + { n4 -= 1; + } } } @@ -1005,17 +1029,22 @@ private static void AddCounters(byte* start, byte* end, ref int n4, ref int coun for (byte* p = start; p < end; p++) { if ((*p & 0b11000000) == 0b10000000) + { countBytes += 1; + } if ((*p & 0b11110000) == 0b11110000) + { n4 += 1; + } } } - // We scan the input from buf to len, possibly going back howFarBack bytes, to find the end of - // a valid UTF-8 sequence. We return buf + len if the buffer is valid, otherwise we return the - // pointer to the first invalid byte. private static byte* SimpleRewindAndValidateWithErrors(int howFarBack, byte* buf, int len) { + // We scan the input from buf to len, possibly going back howFarBack bytes, to find the end of + // a valid UTF-8 sequence. We return buf + len if the buffer is valid, otherwise we return the + // pointer to the first invalid byte. + int extraLen = 0; bool foundLeadingBytes = false; @@ -1032,7 +1061,9 @@ private static void AddCounters(byte* start, byte* end, ref int n4, ref int coun } if (!foundLeadingBytes) + { return buf - howFarBack; + } int pos = 0; len += extraLen; @@ -1042,7 +1073,9 @@ private static void AddCounters(byte* start, byte* end, ref int n4, ref int coun while (firstByte < 0b10000000) { if (++pos == len) + { return buf + len; + } firstByte = buf[pos]; } @@ -1052,56 +1085,92 @@ private static void AddCounters(byte* start, byte* end, ref int n4, ref int coun { nextPos = pos + 2; if (nextPos > len) - return buf + pos; // Too short + { + // Too short + return buf + pos; + } if ((buf[pos + 1] & 0b11000000) != 0b10000000) - return buf + pos; // Too short + { + // Too short + return buf + pos; + } - // range check - codePoint = ((uint)(firstByte & 0b00011111) << 6) | (uint)(buf[pos + 1] & 0b00111111); - if (codePoint < 0x80 || 0x7ff < codePoint) - return buf + pos; // Overlong + // Range check + codePoint = ((uint)(firstByte & 0b00011111) << 6) | + ((uint)(buf[pos + 1] & 0b00111111)); + if (codePoint is < 0x80 or > 0x7ff) + { 
+ // Overlong + return buf + pos; + } } else if ((firstByte & 0b11110000) == 0b11100000) { nextPos = pos + 3; if (nextPos > len) - return buf + pos; // Too short - - // range check - codePoint = ((uint)(firstByte & 0b00001111) << 12) | ((uint)(buf[pos + 1] & 0b00111111) << 6) | (uint)(buf[pos + 2] & 0b00111111); + { + // Too short + return buf + pos; + } + // Range check + codePoint = ((uint)(firstByte & 0b00001111) << 12) | + ((uint)(buf[pos + 1] & 0b00111111) << 6) | + ((uint)(buf[pos + 2] & 0b00111111)); // Either overlong or too large: - if (codePoint < 0x800 || 0xffff < codePoint || (0xd7ff < codePoint && codePoint < 0xe000)) + if (codePoint is < 0x800 or > 0xffff or > 0xd7ff and < 0xe000) + { return buf + pos; + } if ((buf[pos + 1] & 0b11000000) != 0b10000000) - return buf + pos; // Too short + { + // Too short + return buf + pos; + } if ((buf[pos + 2] & 0b11000000) != 0b10000000) - return buf + pos; // Too short + { + // Too short + return buf + pos; + } } else if ((firstByte & 0b11111000) == 0b11110000) { nextPos = pos + 4; if (nextPos > len) + { return buf + pos; + } if ((buf[pos + 1] & 0b11000000) != 0b10000000) + { return buf + pos; + } if ((buf[pos + 2] & 0b11000000) != 0b10000000) + { return buf + pos; + } if ((buf[pos + 3] & 0b11000000) != 0b10000000) + { return buf + pos; - // range check - codePoint = ((uint)(firstByte & 0b00000111) << 18) | ((uint)(buf[pos + 1] & 0b00111111) << 12) | - ((uint)(buf[pos + 2] & 0b00111111) << 6) | (uint)(buf[pos + 3] & 0b00111111); - if (codePoint <= 0xffff || 0x10ffff < codePoint) + } + + // Range check + codePoint = ((uint)(firstByte & 0b00000111) << 18) | + ((uint)(buf[pos + 1] & 0b00111111) << 12) | + ((uint)(buf[pos + 2] & 0b00111111) << 6) | + ((uint)(buf[pos + 3] & 0b00111111)); + if (codePoint is <= 0xffff or > 0x10ffff) + { return buf + pos; + } } else { - // we may have a continuation/too long error + // We may have a continuation/too long error return buf + pos; } pos = nextPos; } - return buf + len; // no 
error + // No error + return buf + len; } [MethodImpl(MethodImplOptions.AggressiveInlining)] From 6aea9c33b7ec71a9dc67a94fc7100419f8ecd793 Mon Sep 17 00:00:00 2001 From: Egor Bogatov Date: Tue, 2 Jul 2024 20:41:49 +0200 Subject: [PATCH 4/4] Apply suggestions from code review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Günther Foidl --- .../src/System/Text/Unicode/Utf8Utility.Validation.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs index d70b870a9b9db..edf2d8e607e32 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Text/Unicode/Utf8Utility.Validation.cs @@ -878,7 +878,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit int countBytes = 0; // number of continuation bytes in the block int n4 = 0; // number of 4-byte sequences that start in this block - for (; processedLength + Vector256.Count <= inputLength; processedLength += Vector256.Count) + for (; processedLength <= inputLength - Vector256.Count; processedLength += Vector256.Count) { Vector256 currentBlock = Vector256.Load(pInputBuffer + processedLength); int mask = (int)currentBlock.ExtractMostSignificantBits(); @@ -909,7 +909,7 @@ private static ulong GetNonAsciiBytes(Vector128 value, Vector128 bit int localAsciiRun = Vector256.Count; if (processedLength + localAsciiRun + (Vector256.Count * 2) <= inputLength) { - for (; processedLength + localAsciiRun + (Vector256.Count * 2) <= inputLength; localAsciiRun += (Vector256.Count * 2)) + for (; localAsciiRun <= inputLength - processedLength - (Vector256.Count * 2); localAsciiRun += (Vector256.Count * 2)) { Vector256 block1 = Vector256.Load(pInputBuffer + processedLength + 
localAsciiRun); Vector256 block2 = Vector256.Load(pInputBuffer + processedLength + localAsciiRun + Vector256.Count);