Delete redundant vectorization code paths #49846

Closed
stephentoub opened this issue Mar 19, 2021 · 11 comments

@stephentoub
Member

We have a non-trivial number of implementations that duplicate loops across multiple hardware instruction sets plus a generalized Vector path, e.g.

if (Avx2.IsSupported)
{
    if (offset < (nuint)(uint)length)
    {
        if ((((nuint)(uint)Unsafe.AsPointer(ref searchSpace) + offset) & (nuint)(Vector256<byte>.Count - 1)) != 0)
        {
            // Not currently aligned to Vector256 (is aligned to Vector128); this can cause a problem for searches
            // with no upper bound e.g. String.strlen.
            // Start with a check on Vector128 to align to Vector256, before moving to processing Vector256.
            // This ensures we do not fault across memory pages while searching for an end of string.
            Vector128<byte> values = Vector128.Create(value);
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            // Same method as below
            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search));
            if (matches == 0)
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
            }
            else
            {
                // Find bitflag offset of first match and add to current offset
                return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
            }
        }
        lengthToExamine = GetByteVector256SpanLength(offset, length);
        if (lengthToExamine > offset)
        {
            Vector256<byte> values = Vector256.Create(value);
            do
            {
                Vector256<byte> search = LoadVector256(ref searchSpace, offset);
                int matches = Avx2.MoveMask(Avx2.CompareEqual(values, search));
                // Note that MoveMask has converted the equal vector elements into a set of bit flags,
                // So the bit position in 'matches' corresponds to the element offset.
                if (matches == 0)
                {
                    // Zero flags set so no matches
                    offset += (nuint)Vector256<byte>.Count;
                    continue;
                }
                // Find bitflag offset of first match and add to current offset
                return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
            } while (lengthToExamine > offset);
        }
        lengthToExamine = GetByteVector128SpanLength(offset, length);
        if (lengthToExamine > offset)
        {
            Vector128<byte> values = Vector128.Create(value);
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            // Same method as above
            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search));
            if (matches == 0)
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
            }
            else
            {
                // Find bitflag offset of first match and add to current offset
                return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
            }
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
else if (Sse2.IsSupported)
{
    if (offset < (nuint)(uint)length)
    {
        lengthToExamine = GetByteVector128SpanLength(offset, length);
        Vector128<byte> values = Vector128.Create(value);
        while (lengthToExamine > offset)
        {
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            // Same method as above
            int matches = Sse2.MoveMask(Sse2.CompareEqual(values, search));
            if (matches == 0)
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
                continue;
            }
            // Find bitflag offset of first match and add to current offset
            return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
else if (AdvSimd.Arm64.IsSupported)
{
    if (offset < (nuint)(uint)length)
    {
        lengthToExamine = GetByteVector128SpanLength(offset, length);
        // Mask to help find the first lane in compareResult that is set.
        // MSB 0x10 corresponds to 1st lane, 0x01 corresponds to 0th lane and so forth.
        Vector128<byte> mask = Vector128.Create((ushort)0x1001).AsByte();
        int matchedLane = 0;
        Vector128<byte> values = Vector128.Create(value);
        while (lengthToExamine > offset)
        {
            Vector128<byte> search = LoadVector128(ref searchSpace, offset);
            Vector128<byte> compareResult = AdvSimd.CompareEqual(values, search);
            if (!TryFindFirstMatchedLane(mask, compareResult, ref matchedLane))
            {
                // Zero flags set so no matches
                offset += (nuint)Vector128<byte>.Count;
                continue;
            }
            return (int)(offset + (uint)matchedLane);
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}
else if (Vector.IsHardwareAccelerated)
{
    if (offset < (nuint)(uint)length)
    {
        lengthToExamine = GetByteVectorSpanLength(offset, length);
        Vector<byte> values = new Vector<byte>(value);
        while (lengthToExamine > offset)
        {
            var matches = Vector.Equals(values, LoadVector(ref searchSpace, offset));
            if (Vector<byte>.Zero.Equals(matches))
            {
                offset += (nuint)Vector<byte>.Count;
                continue;
            }
            // Find offset of first match and add to current offset
            return (int)offset + LocateFirstFoundByte(matches);
        }
        if (offset < (nuint)(uint)length)
        {
            lengthToExamine = ((nuint)(uint)length - offset);
            goto SequentialScan;
        }
    }
}

has a structure that's approximately:

if (Avx2.IsSupported)
{
    ... // use Avx2
}
else if (Sse2.IsSupported)
{
    ... // use Sse2
}
else if (AdvSimd.Arm64.IsSupported)
{
    ... // use AdvSimd
}
else if (Vector.IsHardwareAccelerated)
{
    ... // use Vector / Vector<T>
}

We have others like:

if (Avx2.IsSupported)
{
    // JIT does not support code hoisting for SIMD yet
    Vector256<byte> zero = Vector256<byte>.Zero;
    fixed (bool* ptr = values)
    {
        for (; (i + Vector256ByteCount) <= (uint)values.Length; i += Vector256ByteCount)
        {
            Vector256<byte> vector = Avx.LoadVector256((byte*)ptr + i);
            Vector256<byte> isFalse = Avx2.CompareEqual(vector, zero);
            int result = Avx2.MoveMask(isFalse);
            m_array[i / 32u] = ~result;
        }
    }
}
else if (Sse2.IsSupported)
{
    // JIT does not support code hoisting for SIMD yet
    Vector128<byte> zero = Vector128<byte>.Zero;
    fixed (bool* ptr = values)
    {
        for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u)
        {
            Vector128<byte> lowerVector = Sse2.LoadVector128((byte*)ptr + i);
            Vector128<byte> lowerIsFalse = Sse2.CompareEqual(lowerVector, zero);
            int lowerPackedIsFalse = Sse2.MoveMask(lowerIsFalse);
            Vector128<byte> upperVector = Sse2.LoadVector128((byte*)ptr + i + Vector128<byte>.Count);
            Vector128<byte> upperIsFalse = Sse2.CompareEqual(upperVector, zero);
            int upperPackedIsFalse = Sse2.MoveMask(upperIsFalse);
            m_array[i / 32u] = ~((upperPackedIsFalse << 16) | lowerPackedIsFalse);
        }
    }
}
else if (AdvSimd.Arm64.IsSupported)
{
    // JIT does not support code hoisting for SIMD yet
    // However comparison against zero can be replaced to cmeq against zero (vceqzq_s8)
    // See dotnet/runtime#33972 for details
    Vector128<byte> zero = Vector128<byte>.Zero;
    fixed (bool* ptr = values)
    {
        for (; (i + Vector128ByteCount * 2u) <= (uint)values.Length; i += Vector128ByteCount * 2u)
        {
            // Same logic as SSE2 path, however we lack MoveMask (equivalent) instruction
            // As a workaround, mask out the relevant bit after comparison
            // and combine by ORing all of them together (In this case, adding all of them does the same thing)
            Vector128<byte> lowerVector = AdvSimd.LoadVector128((byte*)ptr + i);
            Vector128<byte> lowerIsFalse = AdvSimd.CompareEqual(lowerVector, zero);
            Vector128<byte> bitsExtracted1 = AdvSimd.And(lowerIsFalse, s_bitMask128);
            bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
            bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
            bitsExtracted1 = AdvSimd.Arm64.AddPairwise(bitsExtracted1, bitsExtracted1);
            Vector128<short> lowerPackedIsFalse = bitsExtracted1.AsInt16();
            Vector128<byte> upperVector = AdvSimd.LoadVector128((byte*)ptr + i + Vector128<byte>.Count);
            Vector128<byte> upperIsFalse = AdvSimd.CompareEqual(upperVector, zero);
            Vector128<byte> bitsExtracted2 = AdvSimd.And(upperIsFalse, s_bitMask128);
            bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
            bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
            bitsExtracted2 = AdvSimd.Arm64.AddPairwise(bitsExtracted2, bitsExtracted2);
            Vector128<short> upperPackedIsFalse = bitsExtracted2.AsInt16();
            int result = AdvSimd.Arm64.ZipLow(lowerPackedIsFalse, upperPackedIsFalse).AsInt32().ToScalar();
            if (!BitConverter.IsLittleEndian)
            {
                result = BinaryPrimitives.ReverseEndianness(result);
            }
            m_array[i / 32u] = ~result;
        }
    }
}

that similarly have paths for Avx2, Sse2, and AdvSimd but without a Vector path.

This is a lot of complicated code being duplicated, and it should no longer be necessary. In most cases, the path using Vector<T> should produce code identical to what we'd otherwise write manually with hardware intrinsics, automatically picking 32-byte or 16-byte widths based on what the hardware supports. So, for any of the cases that have Avx2/Sse2/AdvSimd/Vector paths, we should theoretically be able to simply delete the Avx2/Sse2/AdvSimd code paths and leave only the Vector code path: if Avx2 would be supported, Vector would similarly use 32-byte widths, and if Avx2 wouldn't be supported but Sse2/AdvSimd would, Vector would similarly use 16-byte widths. And for the cases that today use Avx2/Sse2/AdvSimd without a Vector path, we should be able to add a Vector path and then delete the others, for the same reason.
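To make that concrete, below is a minimal, self-contained sketch of the single-path shape being proposed. It is not the actual dotnet/runtime change; the method name, scalar tail, and per-lane scan are illustrative only. The idea is that one Vector<T> loop covers the Avx2/Sse2/AdvSimd cases above, because Vector<byte> is 32 bytes wide where AVX2 is available and 16 bytes wide on SSE2/AdvSimd-only hardware (requires System and System.Numerics).

// Minimal sketch of a Vector<T>-only byte search (illustrative, not the runtime's code).
public static int IndexOf(ReadOnlySpan<byte> span, byte value)
{
    int i = 0;

    if (Vector.IsHardwareAccelerated && span.Length >= Vector<byte>.Count)
    {
        Vector<byte> values = new Vector<byte>(value);
        int lastVectorStart = span.Length - Vector<byte>.Count;

        for (; i <= lastVectorStart; i += Vector<byte>.Count)
        {
            // Width-agnostic load and compare: 32 bytes per iteration on AVX2 hardware,
            // 16 bytes on SSE2/AdvSimd-only hardware.
            Vector<byte> search = new Vector<byte>(span.Slice(i, Vector<byte>.Count));
            Vector<byte> matches = Vector.Equals(values, search);
            if (matches != Vector<byte>.Zero)
            {
                // Find the first matching lane within this vector.
                for (int j = 0; j < Vector<byte>.Count; j++)
                {
                    if (matches[j] != 0)
                    {
                        return i + j;
                    }
                }
            }
        }
    }

    // Scalar tail for the remaining < Vector<byte>.Count bytes.
    for (; i < span.Length; i++)
    {
        if (span[i] == value)
        {
            return i;
        }
    }

    return -1;
}

The inner per-lane scan is the part the discussion below focuses on: without a MoveMask-style helper on Vector<T>, locating the first match falls back to element-by-element checks.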

For paths that don't have an Avx2 option, and instead just have paths for Sse2/AdvSimd/Vector, it's less clear whether we could delete the Sse2/AdvSimd implementations, as it's possible there could be performance differences (for better or worse) if the Vector path ended up using 32-byte widths.

cc: @tannergooding, @benaadams

dotnet-issue-labeler bot added the untriaged label Mar 19, 2021
stephentoub added this to the 6.0.0 milestone Mar 19, 2021
@ghost

ghost commented Mar 19, 2021

Tagging subscribers to this area: @tannergooding
See info in area-owners.md if you want to be subscribed.


dotnet deleted a comment from dotnet-issue-labeler bot Mar 19, 2021
@benaadams
Member

They aren't completely the same: for example, the Avx2 path is more optimized than the Vector path (e.g. for determining the "found" index from the vector), and it uses both 16-byte and 32-byte vectors, so acceleration starts at 16 bytes (8 chars), whereas the Vector path would always start at 32 bytes (16 chars).

@stephentoub
Member Author

What is the additional optimization in the BitArray example?

@benaadams
Member

benaadams commented Mar 19, 2021

> What is the additional optimization in the BitArray example?

Looks like it needs work 😅

As an aside, I was considering whether a source generator for all the SpanHelpers methods would be interesting, since they are all variations on a theme (as is the one in System.Text.Json, #41097 (comment)). Additional variants like IndexNotEquals were rejected (#28795 (comment)) due to the possibility of "infinite" APIs, but that could be covered with a generator.

Don't know if that would be better or worse?

Would be less code to look after, but also easier to generate more variants so more asm...

@tannergooding
Member

tannergooding commented Mar 19, 2021

I think a generator for vectorized code seems like an overall interesting idea but also a complex one. The main issue I see is that we have 3-4 code paths that are all nearly identical. They often only differ in one or two places when they do differ at all.

The options for normalizing these are:


Vector<T> is nice in that it is generic and allows you to write a single path supporting multiple platforms.

  • It is supported on full framework.
  • As of .NET 5, most ops generate identical code to the hardware intrinsics, as that is how they are internally implemented.

It is not so nice in that it is variable sized, so it can be difficult to know the benefit accurately.

  • If the inputs are small, you are adding overhead for the vector handling and may frequently not use it.
  • If the input is large, using Vector may be beneficial, but this can depend on the alignment of the data, the operations being done, etc.
  • Because it is variable sized, certain operations can't be easily represented (shuffling/permute is one example).
  • Larger vector sizes may impact things like battery life negatively.
  • If we onboard additional vector sizes in the future (such as Vector512, or any of the SVE sizes), these issues may become more problematic.

Vector128<T> is nice in that it is currently the cross-platform shared vector size.

  • It is fixed size and so conceptually allows more xplat ops.
  • It is trivial to fall back to the relevant hardware intrinsics where shared xplat functionality isn't possible.
  • It generally doesn't incur overhead or downclocking and so is "battery friendly".
  • If data is misaligned, cache-line splits only occur every 4 reads, rather than every 2, which CPUs tend to handle better.

It is not nice in that it may leave perf on the table when inputs are particularly large, but I feel this is algorithm specific and not the "default/expected" case.
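As a rough illustration of the "fall back to the relevant hardware intrinsics" bullet, a hypothetical Vector128<byte> helper might look like the following (the name is illustrative, not a real or proposed API; requires System.Numerics, System.Runtime.Intrinsics and its .X86/.Arm namespaces). The fixed 128-bit size means the overall shape is shared, and only the pieces with no cross-platform operation at the time (compare, lane extraction) differ per ISA; the ARM64 branch uses a simple per-lane scan here rather than the AddPairwise bit-mask trick shown earlier.

// Hypothetical sketch: shared Vector128<byte> shape with per-ISA fallbacks.
static int IndexOfFirstEqualLane(Vector128<byte> values, Vector128<byte> search)
{
    if (Sse2.IsSupported)
    {
        int mask = Sse2.MoveMask(Sse2.CompareEqual(values, search));
        return mask == 0 ? -1 : BitOperations.TrailingZeroCount(mask);
    }

    if (AdvSimd.IsSupported)
    {
        Vector128<byte> equal = AdvSimd.CompareEqual(values, search);

        // ARM64 has no MoveMask equivalent; a per-lane scan keeps this sketch simple.
        for (int i = 0; i < Vector128<byte>.Count; i++)
        {
            if (equal.GetElement(i) != 0)
            {
                return i;
            }
        }

        return -1;
    }

    return -1; // software fallback elided
}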

@benaadams
Member

benaadams commented Mar 19, 2021

> The options for normalizing these are

For the span helpers, while the "not here" checks are mostly the same (essentially .Equals), identifying a found item is very different: the intrinsic paths use the single MoveMask instruction, whose result is used both for the equality check and, via TrailingZeroCount, to locate the value:

int matches = Avx2.MoveMask(Avx2.Or(Avx2.Or(matches0, matches1), matches2));
// Note that MoveMask has converted the equal vector elements into a set of bit flags,
// So the bit position in 'matches' corresponds to the element offset.
if (matches == 0)
{
    // Zero flags set so no matches
    offset += (nuint)Vector256<byte>.Count;
    continue;
}
// Find bitflag offset of first match and add to current offset
return (int)(offset + (uint)BitOperations.TrailingZeroCount(matches));

whereas the Vector<T> path uses a loop over each vector element, which is also why the JIT special-cases loop unrolling for i < Vector<ulong>.Count.

Find

if (Vector<byte>.Zero.Equals(matches))
{
    offset += (nuint)Vector<byte>.Count;
    continue;
}
// Find offset of first match and add to current offset
return (int)offset + LocateFirstFoundByte(matches);

Locate
private static int LocateFirstFoundByte(Vector<byte> match)
{
    var vector64 = Vector.AsVectorUInt64(match);
    ulong candidate = 0;
    int i = 0;
    // Pattern unrolled by jit https://github.com/dotnet/coreclr/pull/8001
    for (; i < Vector<ulong>.Count; i++)
    {
        candidate = vector64[i];
        if (candidate != 0)
        {
            break;
        }
    }
    // Single LEA instruction with jitted const (using function result)
    return i * 8 + LocateFirstFoundByte(candidate);
}

Isolate
private static int LocateFirstFoundByte(ulong match)
    => BitOperations.TrailingZeroCount(match) >> 3;

However, accessing each element of the vector individually to check it isn't great... Perhaps we need to extend the APIs on the platform-independent vectors?
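For instance (purely hypothetical, not a real or proposed API), a width-agnostic "first matched byte" helper could generalize the LocateFirstFoundByte pattern above, scanning the match vector 64 bits at a time so it works for any Vector<T> size:

// Hypothetical API shape (illustrative only): avoids a fixed-width bitmask like MoveMask
// by scanning the match vector one ulong lane at a time, so it works at any vector width.
public static int IndexOfFirstMatchedByte(Vector<byte> match)
{
    Vector<ulong> vector64 = Vector.AsVectorUInt64(match);
    for (int i = 0; i < Vector<ulong>.Count; i++)
    {
        ulong candidate = vector64[i];
        if (candidate != 0)
        {
            // 8 bytes per ulong lane; TrailingZeroCount >> 3 isolates the byte within it.
            return i * 8 + (BitOperations.TrailingZeroCount(candidate) >> 3);
        }
    }
    return -1; // no lane matched
}

This avoids touching every byte lane, but it still isn't a single MoveMask-style instruction, which is where the sizing concerns below come in.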

@tannergooding
Member

> Perhaps we need to extend the APIs on the platform-independent vectors?

The primary blocker here is that Vector<T> is variable sized. We cannot easily expose concepts like MoveMask because that may break future extensions (e.g. SVE, which can go up to 2048 bits; a bit-per-lane mask for a 2048-bit Vector<byte> would need 256 bits, which no longer fits in a ulong).
Even just thinking of AVX-512 support, it becomes very limiting in what concepts you can expose for Vector<T>, because you start requiring a long or more to represent per-element masks.

@benaadams
Member

benaadams commented Mar 29, 2021

> It generally doesn't incur overhead or downclocking

Looks like Ice Lake wasn't downclocking much and Rocket Lake isn't for AVX-512 even when used on all cores? https://travisdowns.github.io/blog/2020/08/19/icl-avx512-freq.html#rocket-lake

@tannergooding
Member

I don't think we can rely on Ice Lake not downclocking as the basis for using or not using AVX-512. AVX-512 is relatively new still and it will likely be a while before it is commonplace enough to determine what the correct baseline is.

256-bit AVX instructions had similar issues when first released, and they have improved over time in a similar fashion.

tannergooding removed the untriaged label Jun 17, 2021
tannergooding modified the milestones: 6.0.0 → Future Jun 17, 2021
@tannergooding
Member

This is something that will likely not make 6.0.0 but will be more easily resolvable once #49397 is in.

@stephentoub
Member Author

Closing as we're exploring other directions.

github-actions bot locked and limited conversation to collaborators Jan 26, 2024