diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index b760301167..475d64bc4f 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -7,6 +7,7 @@ using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.PixelFormats; namespace SixLabors.ImageSharp { @@ -22,6 +23,20 @@ public static class HwIntrinsics private static ReadOnlySpan ShuffleMaskSlice4Nx16 => new byte[] { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80 }; + private static ReadOnlySpan ShuffleMaskShiftAlpha => + new byte[] + { + 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15, + 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15 + }; + + public static ReadOnlySpan PermuteMaskShiftAlpha8x32 => + new byte[] + { + 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, + 5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 + }; + /// /// Shuffle single-precision (32-bit) floating-point elements in /// using the control and store the results in . 
@@ -789,6 +804,138 @@ internal static void NormalizedFloatToByteSaturate( } } } + + internal static void PackFromRgbPlanesAvx2Reduce( + ref ReadOnlySpan redChannel, + ref ReadOnlySpan greenChannel, + ref ReadOnlySpan blueChannel, + ref Span destination) + { + ref Vector256 rBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(redChannel)); + ref Vector256 gBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(greenChannel)); + ref Vector256 bBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(blueChannel)); + ref byte dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(destination)); + + int count = redChannel.Length / Vector256.Count; + + ref byte control1Bytes = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskEvenOdd8x32); + Vector256 control1 = Unsafe.As>(ref control1Bytes); + + ref byte control2Bytes = ref MemoryMarshal.GetReference(PermuteMaskShiftAlpha8x32); + Vector256 control2 = Unsafe.As>(ref control2Bytes); + + Vector256 a = Vector256.Create((byte)255); + + Vector256 shuffleAlpha = Unsafe.As>(ref MemoryMarshal.GetReference(ShuffleMaskShiftAlpha)); + + for (int i = 0; i < count; i++) + { + Vector256 r0 = Unsafe.Add(ref rBase, i); + Vector256 g0 = Unsafe.Add(ref gBase, i); + Vector256 b0 = Unsafe.Add(ref bBase, i); + + r0 = Avx2.PermuteVar8x32(r0.AsUInt32(), control1).AsByte(); + g0 = Avx2.PermuteVar8x32(g0.AsUInt32(), control1).AsByte(); + b0 = Avx2.PermuteVar8x32(b0.AsUInt32(), control1).AsByte(); + + Vector256 rg = Avx2.UnpackLow(r0, g0); + Vector256 b1 = Avx2.UnpackLow(b0, a); + + Vector256 rgb1 = Avx2.UnpackLow(rg.AsUInt16(), b1.AsUInt16()).AsByte(); + Vector256 rgb2 = Avx2.UnpackHigh(rg.AsUInt16(), b1.AsUInt16()).AsByte(); + + rg = Avx2.UnpackHigh(r0, g0); + b1 = Avx2.UnpackHigh(b0, a); + + Vector256 rgb3 = Avx2.UnpackLow(rg.AsUInt16(), b1.AsUInt16()).AsByte(); + Vector256 rgb4 = Avx2.UnpackHigh(rg.AsUInt16(), b1.AsUInt16()).AsByte(); + + rgb1 = Avx2.Shuffle(rgb1, shuffleAlpha); + rgb2 = Avx2.Shuffle(rgb2, shuffleAlpha); + rgb3 = 
Avx2.Shuffle(rgb3, shuffleAlpha); + rgb4 = Avx2.Shuffle(rgb4, shuffleAlpha); + + rgb1 = Avx2.PermuteVar8x32(rgb1.AsUInt32(), control2).AsByte(); + rgb2 = Avx2.PermuteVar8x32(rgb2.AsUInt32(), control2).AsByte(); + rgb3 = Avx2.PermuteVar8x32(rgb3.AsUInt32(), control2).AsByte(); + rgb4 = Avx2.PermuteVar8x32(rgb4.AsUInt32(), control2).AsByte(); + + ref byte d1 = ref Unsafe.Add(ref dBase, 24 * 4 * i); + ref byte d2 = ref Unsafe.Add(ref d1, 24); + ref byte d3 = ref Unsafe.Add(ref d2, 24); + ref byte d4 = ref Unsafe.Add(ref d3, 24); + + Unsafe.As>(ref d1) = rgb1; + Unsafe.As>(ref d2) = rgb2; + Unsafe.As>(ref d3) = rgb3; + Unsafe.As>(ref d4) = rgb4; + } + + int slice = count * Vector256.Count; + redChannel = redChannel.Slice(slice); + greenChannel = greenChannel.Slice(slice); + blueChannel = blueChannel.Slice(slice); + destination = destination.Slice(slice); + } + + internal static void PackFromRgbPlanesAvx2Reduce( + ref ReadOnlySpan redChannel, + ref ReadOnlySpan greenChannel, + ref ReadOnlySpan blueChannel, + ref Span destination) + { + ref Vector256 rBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(redChannel)); + ref Vector256 gBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(greenChannel)); + ref Vector256 bBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(blueChannel)); + ref Vector256 dBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); + + int count = redChannel.Length / Vector256.Count; + + ref byte control1Bytes = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskEvenOdd8x32); + Vector256 control1 = Unsafe.As>(ref control1Bytes); + + ref byte control2Bytes = ref MemoryMarshal.GetReference(PermuteMaskShiftAlpha8x32); + Vector256 control2 = Unsafe.As>(ref control2Bytes); + + Vector256 a = Vector256.Create((byte)255); + + Vector256 shuffleAlpha = Unsafe.As>(ref MemoryMarshal.GetReference(ShuffleMaskShiftAlpha)); + + for (int i = 0; i < count; i++) + { + Vector256 r0 = Unsafe.Add(ref rBase, i); + Vector256 g0 = Unsafe.Add(ref 
gBase, i); + Vector256 b0 = Unsafe.Add(ref bBase, i); + + r0 = Avx2.PermuteVar8x32(r0.AsUInt32(), control1).AsByte(); + g0 = Avx2.PermuteVar8x32(g0.AsUInt32(), control1).AsByte(); + b0 = Avx2.PermuteVar8x32(b0.AsUInt32(), control1).AsByte(); + + Vector256 rg = Avx2.UnpackLow(r0, g0); + Vector256 b1 = Avx2.UnpackLow(b0, a); + + Vector256 rgb1 = Avx2.UnpackLow(rg.AsUInt16(), b1.AsUInt16()).AsByte(); + Vector256 rgb2 = Avx2.UnpackHigh(rg.AsUInt16(), b1.AsUInt16()).AsByte(); + + rg = Avx2.UnpackHigh(r0, g0); + b1 = Avx2.UnpackHigh(b0, a); + + Vector256 rgb3 = Avx2.UnpackLow(rg.AsUInt16(), b1.AsUInt16()).AsByte(); + Vector256 rgb4 = Avx2.UnpackHigh(rg.AsUInt16(), b1.AsUInt16()).AsByte(); + + ref Vector256 d0 = ref Unsafe.Add(ref dBase, i * 4); + d0 = rgb1; + Unsafe.Add(ref d0, 1) = rgb2; + Unsafe.Add(ref d0, 2) = rgb3; + Unsafe.Add(ref d0, 3) = rgb4; + } + + int slice = count * Vector256.Count; + redChannel = redChannel.Slice(slice); + greenChannel = greenChannel.Slice(slice); + blueChannel = blueChannel.Slice(slice); + destination = destination.Slice(slice); + } } } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Pack.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Pack.cs new file mode 100644 index 0000000000..fe02bd0072 --- /dev/null +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Pack.cs @@ -0,0 +1,206 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using SixLabors.ImageSharp.PixelFormats; + +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif + +namespace SixLabors.ImageSharp +{ + internal static partial class SimdUtils + { + [MethodImpl(InliningOptions.ShortMethod)] + internal static void PackFromRgbPlanes( + Configuration configuration, + ReadOnlySpan redChannel, + ReadOnlySpan greenChannel, + ReadOnlySpan blueChannel, + Span destination) + { + DebugGuard.IsTrue(greenChannel.Length == redChannel.Length, nameof(greenChannel), "Channels must be of same size!"); + DebugGuard.IsTrue(blueChannel.Length == redChannel.Length, nameof(blueChannel), "Channels must be of same size!"); + DebugGuard.IsTrue(destination.Length > redChannel.Length + 2, nameof(destination), "'destination' must contain a padding of 3 elements!"); + +#if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + HwIntrinsics.PackFromRgbPlanesAvx2Reduce(ref redChannel, ref greenChannel, ref blueChannel, ref destination); + } + else +#endif + { + PackFromRgbPlanesScalarBatchedReduce(ref redChannel, ref greenChannel, ref blueChannel, ref destination); + } + + PackFromRgbPlanesRemainder(redChannel, greenChannel, blueChannel, destination); + } + + [MethodImpl(InliningOptions.ShortMethod)] + internal static void PackFromRgbPlanes( + Configuration configuration, + ReadOnlySpan redChannel, + ReadOnlySpan greenChannel, + ReadOnlySpan blueChannel, + Span destination) + { + DebugGuard.IsTrue(greenChannel.Length == redChannel.Length, nameof(greenChannel), "Channels must be of same size!"); + DebugGuard.IsTrue(blueChannel.Length == redChannel.Length, nameof(blueChannel), "Channels must be of same size!"); + DebugGuard.IsTrue(destination.Length > redChannel.Length, nameof(destination), "'destination' span should not be shorter than the source channels!"); + +#if SUPPORTS_RUNTIME_INTRINSICS + if 
(Avx2.IsSupported) + { + HwIntrinsics.PackFromRgbPlanesAvx2Reduce(ref redChannel, ref greenChannel, ref blueChannel, ref destination); + } + else +#endif + { + PackFromRgbPlanesScalarBatchedReduce(ref redChannel, ref greenChannel, ref blueChannel, ref destination); + } + + PackFromRgbPlanesRemainder(redChannel, greenChannel, blueChannel, destination); + } + + private static void PackFromRgbPlanesScalarBatchedReduce( + ref ReadOnlySpan redChannel, + ref ReadOnlySpan greenChannel, + ref ReadOnlySpan blueChannel, + ref Span destination) + { + ref ByteTuple4 r = ref Unsafe.As(ref MemoryMarshal.GetReference(redChannel)); + ref ByteTuple4 g = ref Unsafe.As(ref MemoryMarshal.GetReference(greenChannel)); + ref ByteTuple4 b = ref Unsafe.As(ref MemoryMarshal.GetReference(blueChannel)); + ref Rgb24 rgb = ref MemoryMarshal.GetReference(destination); + + int count = redChannel.Length / 4; + for (int i = 0; i < count; i++) + { + ref Rgb24 d0 = ref Unsafe.Add(ref rgb, i * 4); + ref Rgb24 d1 = ref Unsafe.Add(ref d0, 1); + ref Rgb24 d2 = ref Unsafe.Add(ref d0, 2); + ref Rgb24 d3 = ref Unsafe.Add(ref d0, 3); + + ref ByteTuple4 rr = ref Unsafe.Add(ref r, i); + ref ByteTuple4 gg = ref Unsafe.Add(ref g, i); + ref ByteTuple4 bb = ref Unsafe.Add(ref b, i); + + d0.R = rr.V0; + d0.G = gg.V0; + d0.B = bb.V0; + + d1.R = rr.V1; + d1.G = gg.V1; + d1.B = bb.V1; + + d2.R = rr.V2; + d2.G = gg.V2; + d2.B = bb.V2; + + d3.R = rr.V3; + d3.G = gg.V3; + d3.B = bb.V3; + } + + int finished = count * 4; + redChannel = redChannel.Slice(finished); + greenChannel = greenChannel.Slice(finished); + blueChannel = blueChannel.Slice(finished); + destination = destination.Slice(finished); + } + + private static void PackFromRgbPlanesScalarBatchedReduce( + ref ReadOnlySpan redChannel, + ref ReadOnlySpan greenChannel, + ref ReadOnlySpan blueChannel, + ref Span destination) + { + ref ByteTuple4 r = ref Unsafe.As(ref MemoryMarshal.GetReference(redChannel)); + ref ByteTuple4 g = ref Unsafe.As(ref 
MemoryMarshal.GetReference(greenChannel)); + ref ByteTuple4 b = ref Unsafe.As(ref MemoryMarshal.GetReference(blueChannel)); + ref Rgba32 rgb = ref MemoryMarshal.GetReference(destination); + + int count = redChannel.Length / 4; + destination.Fill(new Rgba32(0, 0, 0, 255)); + for (int i = 0; i < count; i++) + { + ref Rgba32 d0 = ref Unsafe.Add(ref rgb, i * 4); + ref Rgba32 d1 = ref Unsafe.Add(ref d0, 1); + ref Rgba32 d2 = ref Unsafe.Add(ref d0, 2); + ref Rgba32 d3 = ref Unsafe.Add(ref d0, 3); + + ref ByteTuple4 rr = ref Unsafe.Add(ref r, i); + ref ByteTuple4 gg = ref Unsafe.Add(ref g, i); + ref ByteTuple4 bb = ref Unsafe.Add(ref b, i); + + d0.R = rr.V0; + d0.G = gg.V0; + d0.B = bb.V0; + + d1.R = rr.V1; + d1.G = gg.V1; + d1.B = bb.V1; + + d2.R = rr.V2; + d2.G = gg.V2; + d2.B = bb.V2; + + d3.R = rr.V3; + d3.G = gg.V3; + d3.B = bb.V3; + } + + int finished = count * 4; + redChannel = redChannel.Slice(finished); + greenChannel = greenChannel.Slice(finished); + blueChannel = blueChannel.Slice(finished); + destination = destination.Slice(finished); + } + + private static void PackFromRgbPlanesRemainder( + ReadOnlySpan redChannel, + ReadOnlySpan greenChannel, + ReadOnlySpan blueChannel, + Span destination) + { + ref byte r = ref MemoryMarshal.GetReference(redChannel); + ref byte g = ref MemoryMarshal.GetReference(greenChannel); + ref byte b = ref MemoryMarshal.GetReference(blueChannel); + ref Rgb24 rgb = ref MemoryMarshal.GetReference(destination); + + for (int i = 0; i < redChannel.Length; i++) // 'destination' is padded: iterate the source length so the channels are never read past their end + { + ref Rgb24 d = ref Unsafe.Add(ref rgb, i); + d.R = Unsafe.Add(ref r, i); + d.G = Unsafe.Add(ref g, i); + d.B = Unsafe.Add(ref b, i); + } + } + + private static void PackFromRgbPlanesRemainder( + ReadOnlySpan redChannel, + ReadOnlySpan greenChannel, + ReadOnlySpan blueChannel, + Span destination) + { + ref byte r = ref MemoryMarshal.GetReference(redChannel); + ref byte g = ref MemoryMarshal.GetReference(greenChannel); + ref byte b = ref MemoryMarshal.GetReference(blueChannel); + 
ref Rgba32 rgba = ref MemoryMarshal.GetReference(destination); + + for (int i = 0; i < redChannel.Length; i++) // 'destination' may be longer than the channels: iterate the source length so they are never read past their end + { + ref Rgba32 d = ref Unsafe.Add(ref rgba, i); + d.R = Unsafe.Add(ref r, i); + d.G = Unsafe.Add(ref g, i); + d.B = Unsafe.Add(ref b, i); + d.A = 255; + } + } + } +} \ No newline at end of file diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index aaf6d405cf..6d82cfad01 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -6,6 +6,7 @@ using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using SixLabors.ImageSharp.PixelFormats; #if SUPPORTS_RUNTIME_INTRINSICS using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; @@ -220,5 +221,13 @@ private static void VerifySpanInput(ReadOnlySpan source, Span dest, nameof(source), $"length should be divisible by {shouldBeDivisibleBy}!"); } + + private struct ByteTuple4 + { + public byte V0; + public byte V1; + public byte V2; + public byte V3; + } } } diff --git a/src/ImageSharp/ImageSharp.csproj b/src/ImageSharp/ImageSharp.csproj index 1d7fb2958b..a90aaf715a 100644 --- a/src/ImageSharp/ImageSharp.csproj +++ b/src/ImageSharp/ImageSharp.csproj @@ -24,16 +24,16 @@ - + - + - + - - + + diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgb24.PixelOperations.cs b/src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgb24.PixelOperations.cs index 73b656f363..f345f58bcd 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgb24.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgb24.PixelOperations.cs @@ -21,6 +21,23 @@ internal partial class PixelOperations : PixelOperations /// public override PixelTypeInfo GetPixelTypeInfo() => LazyInfo.Value; + + /// + internal override void PackFromRgbPlanes( + Configuration configuration, + ReadOnlySpan redChannel, + 
ReadOnlySpan greenChannel, + ReadOnlySpan blueChannel, + Span destination) + { + Guard.NotNull(configuration, nameof(configuration)); + int count = redChannel.Length; + Guard.IsTrue(greenChannel.Length == count, nameof(greenChannel), "Channels must be of same size!"); + Guard.IsTrue(blueChannel.Length == count, nameof(blueChannel), "Channels must be of same size!"); + Guard.IsTrue(destination.Length > count + 2, nameof(destination), "'destination' must contain a padding of 3 elements!"); + + SimdUtils.PackFromRgbPlanes(configuration, redChannel, greenChannel, blueChannel, destination); + } } } } diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgba32.PixelOperations.cs index d8322e37d4..9633059774 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgba32.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgba32.PixelOperations.cs @@ -56,6 +56,23 @@ public override void FromVector4Destructive( MemoryMarshal.Cast(sourceVectors), MemoryMarshal.Cast(destinationPixels)); } + + /// + internal override void PackFromRgbPlanes( + Configuration configuration, + ReadOnlySpan redChannel, + ReadOnlySpan greenChannel, + ReadOnlySpan blueChannel, + Span destination) + { + Guard.NotNull(configuration, nameof(configuration)); + int count = redChannel.Length; + Guard.IsTrue(greenChannel.Length == count, nameof(greenChannel), "Channels must be of same size!"); + Guard.IsTrue(blueChannel.Length == count, nameof(blueChannel), "Channels must be of same size!"); + Guard.IsTrue(destination.Length > count, nameof(destination), "'destination' span should not be shorter than the source channels!"); + + SimdUtils.PackFromRgbPlanes(configuration, redChannel, greenChannel, blueChannel, destination); + } } } } diff --git a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs 
b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs index dbe06702d9..c5450538e4 100644 --- a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs +++ b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs @@ -4,6 +4,8 @@ using System; using System.Buffers; using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using SixLabors.ImageSharp.Formats; using SixLabors.ImageSharp.Memory; @@ -159,5 +161,45 @@ public virtual void To( PixelOperations.Instance.From(configuration, sourcePixels, destinationPixels); } + + /// + /// Bulk operation that packs 3 separate RGB channels to . + /// The destination must have a padding of 3. + /// + /// A to configure internal operations. + /// A to the red values. + /// A to the green values. + /// A to the blue values. + /// A to the destination pixels. + internal virtual void PackFromRgbPlanes( + Configuration configuration, + ReadOnlySpan redChannel, + ReadOnlySpan greenChannel, + ReadOnlySpan blueChannel, + Span destination) + { + Guard.NotNull(configuration, nameof(configuration)); + + int count = redChannel.Length; + Guard.IsTrue(greenChannel.Length == count, nameof(greenChannel), "Channels must be of same size!"); + Guard.IsTrue(blueChannel.Length == count, nameof(blueChannel), "Channels must be of same size!"); + Guard.IsTrue(destination.Length > count + 2, nameof(destination), "'destination' must contain a padding of 3 elements!"); + + Guard.DestinationShouldNotBeTooShort(redChannel, destination, nameof(destination)); + + Rgb24 rgb24 = default; + ref byte r = ref MemoryMarshal.GetReference(redChannel); + ref byte g = ref MemoryMarshal.GetReference(greenChannel); + ref byte b = ref MemoryMarshal.GetReference(blueChannel); + ref TPixel d = ref MemoryMarshal.GetReference(destination); + + for (int i = 0; i < count; i++) + { + rgb24.R = Unsafe.Add(ref r, i); + rgb24.G = Unsafe.Add(ref g, i); + rgb24.B = Unsafe.Add(ref b, i); + Unsafe.Add(ref d, i).FromRgb24(rgb24); + } + } } } 
diff --git a/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_PackFromRgbPlanes.cs b/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_PackFromRgbPlanes.cs new file mode 100644 index 0000000000..eade8e0c43 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_PackFromRgbPlanes.cs @@ -0,0 +1,286 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.PixelFormats; + +namespace SixLabors.ImageSharp.Benchmarks.General.PixelConversion +{ + public unsafe class PixelConversion_PackFromRgbPlanes + { + private byte[] rBuf; + private byte[] gBuf; + private byte[] bBuf; + private Rgb24[] rgbBuf; + private Rgba32[] rgbaBuf; + + private float[] rFloat; + private float[] gFloat; + private float[] bFloat; + + private float[] rgbaFloat; + + [Params(1024)] + public int Count { get; set; } + + [GlobalSetup] + public void Setup() + { + this.rBuf = new byte[this.Count]; + this.gBuf = new byte[this.Count]; + this.bBuf = new byte[this.Count]; + this.rgbBuf = new Rgb24[this.Count + 3]; // padded + this.rgbaBuf = new Rgba32[this.Count]; + + this.rFloat = new float[this.Count]; + this.gFloat = new float[this.Count]; + this.bFloat = new float[this.Count]; + + this.rgbaFloat = new float[this.Count * 4]; + } + + // [Benchmark] + public void Rgb24_Scalar_PerElement_Pinned() + { + fixed (byte* r = &this.rBuf[0]) + fixed (byte* g = &this.gBuf[0]) + fixed (byte* b = &this.bBuf[0]) + fixed (Rgb24* rgb = &this.rgbBuf[0]) + { + for (int i = 0; i < this.Count; i++) + { + Rgb24* d = rgb + i; + d->R = r[i]; + d->G = g[i]; + d->B = b[i]; + } + } + } + + [Benchmark] + public void Rgb24_Scalar_PerElement_Span() + { + Span r = this.rBuf; + 
Span g = this.gBuf; + Span b = this.bBuf; + Span rgb = this.rgbBuf; + + for (int i = 0; i < r.Length; i++) + { + ref Rgb24 d = ref rgb[i]; + d.R = r[i]; + d.G = g[i]; + d.B = b[i]; + } + } + + [Benchmark] + public void Rgb24_Scalar_PerElement_Unsafe() + { + ref byte r = ref this.rBuf[0]; + ref byte g = ref this.gBuf[0]; + ref byte b = ref this.bBuf[0]; + ref Rgb24 rgb = ref this.rgbBuf[0]; + + for (int i = 0; i < this.Count; i++) + { + ref Rgb24 d = ref Unsafe.Add(ref rgb, i); + d.R = Unsafe.Add(ref r, i); + d.G = Unsafe.Add(ref g, i); + d.B = Unsafe.Add(ref b, i); + } + } + + [Benchmark] + public void Rgb24_Scalar_PerElement_Batched8() + { + ref Byte8 r = ref Unsafe.As(ref this.rBuf[0]); + ref Byte8 g = ref Unsafe.As(ref this.gBuf[0]); + ref Byte8 b = ref Unsafe.As(ref this.bBuf[0]); + ref Rgb24 rgb = ref this.rgbBuf[0]; + + int count = this.Count / 8; + for (int i = 0; i < count; i++) + { + ref Rgb24 d0 = ref Unsafe.Add(ref rgb, i * 8); + ref Rgb24 d1 = ref Unsafe.Add(ref d0, 1); + ref Rgb24 d2 = ref Unsafe.Add(ref d0, 2); + ref Rgb24 d3 = ref Unsafe.Add(ref d0, 3); + ref Rgb24 d4 = ref Unsafe.Add(ref d0, 4); + ref Rgb24 d5 = ref Unsafe.Add(ref d0, 5); + ref Rgb24 d6 = ref Unsafe.Add(ref d0, 6); + ref Rgb24 d7 = ref Unsafe.Add(ref d0, 7); + + ref Byte8 rr = ref Unsafe.Add(ref r, i); + ref Byte8 gg = ref Unsafe.Add(ref g, i); + ref Byte8 bb = ref Unsafe.Add(ref b, i); + + d0.R = rr.V0; + d0.G = gg.V0; + d0.B = bb.V0; + + d1.R = rr.V1; + d1.G = gg.V1; + d1.B = bb.V1; + + d2.R = rr.V2; + d2.G = gg.V2; + d2.B = bb.V2; + + d3.R = rr.V3; + d3.G = gg.V3; + d3.B = bb.V3; + + d4.R = rr.V4; + d4.G = gg.V4; + d4.B = bb.V4; + + d5.R = rr.V5; + d5.G = gg.V5; + d5.B = bb.V5; + + d6.R = rr.V6; + d6.G = gg.V6; + d6.B = bb.V6; + + d7.R = rr.V7; + d7.G = gg.V7; + d7.B = bb.V7; + } + } + + [Benchmark] + public void Rgb24_Scalar_PerElement_Batched4() + { + ref Byte4 r = ref Unsafe.As(ref this.rBuf[0]); + ref Byte4 g = ref Unsafe.As(ref this.gBuf[0]); + ref Byte4 b = ref 
Unsafe.As(ref this.bBuf[0]); + ref Rgb24 rgb = ref this.rgbBuf[0]; + + int count = this.Count / 4; + for (int i = 0; i < count; i++) + { + ref Rgb24 d0 = ref Unsafe.Add(ref rgb, i * 4); + ref Rgb24 d1 = ref Unsafe.Add(ref d0, 1); + ref Rgb24 d2 = ref Unsafe.Add(ref d0, 2); + ref Rgb24 d3 = ref Unsafe.Add(ref d0, 3); + + ref Byte4 rr = ref Unsafe.Add(ref r, i); + ref Byte4 gg = ref Unsafe.Add(ref g, i); + ref Byte4 bb = ref Unsafe.Add(ref b, i); + + d0.R = rr.V0; + d0.G = gg.V0; + d0.B = bb.V0; + + d1.R = rr.V1; + d1.G = gg.V1; + d1.B = bb.V1; + + d2.R = rr.V2; + d2.G = gg.V2; + d2.B = bb.V2; + + d3.R = rr.V3; + d3.G = gg.V3; + d3.B = bb.V3; + } + } + +#if SUPPORTS_RUNTIME_INTRINSICS + [Benchmark(Baseline = true)] + public void Rgba32_Avx2_Float() + { + ref Vector256 rBase = ref Unsafe.As>(ref this.rFloat[0]); + ref Vector256 gBase = ref Unsafe.As>(ref this.gFloat[0]); + ref Vector256 bBase = ref Unsafe.As>(ref this.bFloat[0]); + ref Vector256 resultBase = ref Unsafe.As>(ref this.rgbaFloat[0]); + + int count = this.Count / Vector256.Count; + + ref byte control = ref MemoryMarshal.GetReference(SimdUtils.HwIntrinsics.PermuteMaskEvenOdd8x32); + Vector256 vcontrol = Unsafe.As>(ref control); + + var va = Vector256.Create(1F); + + for (int i = 0; i < count; i++) + { + Vector256 r = Unsafe.Add(ref rBase, i); + Vector256 g = Unsafe.Add(ref gBase, i); + Vector256 b = Unsafe.Add(ref bBase, i); + + r = Avx2.PermuteVar8x32(r, vcontrol); + g = Avx2.PermuteVar8x32(g, vcontrol); + b = Avx2.PermuteVar8x32(b, vcontrol); + + Vector256 vte = Avx.UnpackLow(r, b); + Vector256 vto = Avx.UnpackLow(g, va); + + ref Vector256 destination = ref Unsafe.Add(ref resultBase, i * 4); + + destination = Avx.UnpackLow(vte, vto); + Unsafe.Add(ref destination, 1) = Avx.UnpackHigh(vte, vto); + + vte = Avx.UnpackHigh(r, b); + vto = Avx.UnpackHigh(g, va); + + Unsafe.Add(ref destination, 2) = Avx.UnpackLow(vte, vto); + Unsafe.Add(ref destination, 3) = Avx.UnpackHigh(vte, vto); + } + } + + [Benchmark] + 
public void Rgb24_Avx2_Bytes() + { + ReadOnlySpan r = this.rBuf; + ReadOnlySpan g = this.gBuf; + ReadOnlySpan b = this.bBuf; + Span rgb = this.rgbBuf; + SimdUtils.HwIntrinsics.PackFromRgbPlanesAvx2Reduce(ref r, ref g, ref b, ref rgb); + } + + [Benchmark] + public void Rgba32_Avx2_Bytes() + { + ReadOnlySpan r = this.rBuf; + ReadOnlySpan g = this.gBuf; + ReadOnlySpan b = this.bBuf; + Span rgb = this.rgbaBuf; + SimdUtils.HwIntrinsics.PackFromRgbPlanesAvx2Reduce(ref r, ref g, ref b, ref rgb); + } +#endif + +#pragma warning disable SA1132 + private struct Byte8 + { + public byte V0, V1, V2, V3, V4, V5, V6, V7; + } + + private struct Byte4 + { + public byte V0, V1, V2, V3; + } +#pragma warning restore + + // Results @ Anton's PC, 2020 Dec 05 + // .NET Core 3.1.1 + // Intel Core i7-7700HQ CPU 2.80GHz (Kaby Lake), 1 CPU, 8 logical and 4 physical cores + // + // | Method | Count | Mean | Error | StdDev | Ratio | RatioSD | + // |--------------------------------- |------ |-----------:|---------:|---------:|------:|--------:| + // | Rgb24_Scalar_PerElement_Span | 1024 | 1,634.6 ns | 26.56 ns | 24.84 ns | 3.12 | 0.05 | + // | Rgb24_Scalar_PerElement_Unsafe | 1024 | 1,284.7 ns | 4.70 ns | 4.16 ns | 2.46 | 0.01 | + // | Rgb24_Scalar_PerElement_Batched8 | 1024 | 1,182.3 ns | 5.12 ns | 4.27 ns | 2.26 | 0.01 | + // | Rgb24_Scalar_PerElement_Batched4 | 1024 | 1,146.2 ns | 16.38 ns | 14.52 ns | 2.19 | 0.02 | + // | Rgba32_Avx2_Float | 1024 | 522.7 ns | 1.78 ns | 1.39 ns | 1.00 | 0.00 | + // | Rgb24_Avx2_Bytes | 1024 | 243.3 ns | 1.56 ns | 1.30 ns | 0.47 | 0.00 | + // | Rgba32_Avx2_Bytes | 1024 | 146.0 ns | 2.48 ns | 2.32 ns | 0.28 | 0.01 | + } +} \ No newline at end of file diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index ec09e43e57..1f680aa6cc 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -5,8 +5,10 @@ using System.Linq; using System.Numerics; 
using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; -using SixLabors.ImageSharp.Common.Tuples; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics.X86; +#endif +using SixLabors.ImageSharp.PixelFormats; using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; using Xunit.Abstractions; @@ -169,7 +171,7 @@ public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithNonRoundedDa public static readonly TheoryData ArbitraryArraySizes = new TheoryData { - 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 520, + 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, }; [Theory] @@ -336,90 +338,135 @@ public void BulkConvertNormalizedFloatToByteClampOverflows(int count) } } - private static void TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( - int count, - Action, - Memory> convert, - int seed = -1) + [Theory] + [MemberData(nameof(ArbitraryArraySizes))] + public void PackFromRgbPlanes_Rgb24(int count) { - seed = seed > 0 ? 
seed : count; - float[] source = new Random(seed).GenerateRandomFloatArray(count, -0.2f, 1.2f); - byte[] expected = source.Select(NormalizedFloatToByte).ToArray(); - var actual = new byte[count]; - - convert(source, actual); - - Assert.Equal(expected, actual); + TestPackFromRgbPlanes( + count, + (r, g, b, actual) => + SimdUtils.PackFromRgbPlanes(Configuration.Default, r, g, b, actual)); } - private static byte NormalizedFloatToByte(float f) => (byte)Math.Min(255f, Math.Max(0f, (f * 255f) + 0.5f)); - [Theory] - [InlineData(0)] - [InlineData(7)] - [InlineData(42)] - [InlineData(255)] - [InlineData(256)] - [InlineData(257)] - private void MagicConvertToByte(float value) + [MemberData(nameof(ArbitraryArraySizes))] + public void PackFromRgbPlanes_Rgba32(int count) { - byte actual = MagicConvert(value / 256f); - var expected = (byte)value; - - Assert.Equal(expected, actual); + TestPackFromRgbPlanes( + count, + (r, g, b, actual) => + SimdUtils.PackFromRgbPlanes(Configuration.Default, r, g, b, actual)); } +#if SUPPORTS_RUNTIME_INTRINSICS [Fact] - private void BulkConvertNormalizedFloatToByte_Step() + public void PackFromRgbPlanesAvx2Reduce_Rgb24() { - if (this.SkipOnNonAvx2()) + if (!Avx2.IsSupported) { return; } - float[] source = { 0, 7, 42, 255, 0.5f, 1.1f, 2.6f, 16f }; + byte[] r = Enumerable.Range(0, 32).Select(x => (byte)x).ToArray(); + byte[] g = Enumerable.Range(100, 32).Select(x => (byte)x).ToArray(); + byte[] b = Enumerable.Range(200, 32).Select(x => (byte)x).ToArray(); + const int padding = 4; + Rgb24[] d = new Rgb24[32 + padding]; - byte[] expected = source.Select(f => (byte)Math.Round(f)).ToArray(); + ReadOnlySpan rr = r.AsSpan(); + ReadOnlySpan gg = g.AsSpan(); + ReadOnlySpan bb = b.AsSpan(); + Span dd = d.AsSpan(); - source = source.Select(f => f / 255f).ToArray(); + SimdUtils.HwIntrinsics.PackFromRgbPlanesAvx2Reduce(ref rr, ref gg, ref bb, ref dd); - Span dest = stackalloc byte[8]; - - this.MagicConvert(source, dest); + for (int i = 0; i < 32; i++) + { + 
Assert.Equal(i, d[i].R); + Assert.Equal(i + 100, d[i].G); + Assert.Equal(i + 200, d[i].B); + } - Assert.True(dest.SequenceEqual(expected)); + Assert.Equal(0, rr.Length); + Assert.Equal(0, gg.Length); + Assert.Equal(0, bb.Length); + Assert.Equal(padding, dd.Length); } - private static byte MagicConvert(float x) + [Fact] + public void PackFromRgbPlanesAvx2Reduce_Rgba32() { - float f = 32768.0f + x; - uint i = Unsafe.As(ref f); - return (byte)i; - } + if (!Avx2.IsSupported) + { + return; + } - private void MagicConvert(Span source, Span dest) - { - var magick = new Vector(32768.0f); + byte[] r = Enumerable.Range(0, 32).Select(x => (byte)x).ToArray(); + byte[] g = Enumerable.Range(100, 32).Select(x => (byte)x).ToArray(); + byte[] b = Enumerable.Range(200, 32).Select(x => (byte)x).ToArray(); - var scale = new Vector(255f) / new Vector(256f); + Rgba32[] d = new Rgba32[32]; - Vector x = MemoryMarshal.Cast>(source)[0]; + ReadOnlySpan rr = r.AsSpan(); + ReadOnlySpan gg = g.AsSpan(); + ReadOnlySpan bb = b.AsSpan(); + Span dd = d.AsSpan(); + + SimdUtils.HwIntrinsics.PackFromRgbPlanesAvx2Reduce(ref rr, ref gg, ref bb, ref dd); + + for (int i = 0; i < 32; i++) + { + Assert.Equal(i, d[i].R); + Assert.Equal(i + 100, d[i].G); + Assert.Equal(i + 200, d[i].B); + Assert.Equal(255, d[i].A); + } - x = (x * scale) + magick; + Assert.Equal(0, rr.Length); + Assert.Equal(0, gg.Length); + Assert.Equal(0, bb.Length); + Assert.Equal(0, dd.Length); + } +#endif + + internal static void TestPackFromRgbPlanes(int count, Action packMethod) + where TPixel : unmanaged, IPixel + { + Random rnd = new Random(42); + byte[] r = rnd.GenerateRandomByteArray(count); + byte[] g = rnd.GenerateRandomByteArray(count); + byte[] b = rnd.GenerateRandomByteArray(count); + + TPixel[] expected = new TPixel[count]; + for (int i = 0; i < count; i++) + { + expected[i].FromRgb24(new Rgb24(r[i], g[i], b[i])); + } - Tuple8.OfUInt32 ii = default; + TPixel[] actual = new TPixel[count + 3]; // padding for Rgb24 AVX2 + 
packMethod(r, g, b, actual); - ref Vector iiRef = ref Unsafe.As>(ref ii); + Assert.True(expected.AsSpan().SequenceEqual(actual.AsSpan().Slice(0, count))); + } - iiRef = x; + private static void TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( + int count, + Action, + Memory> convert, + int seed = -1) + { + seed = seed > 0 ? seed : count; + float[] source = new Random(seed).GenerateRandomFloatArray(count, -0.2f, 1.2f); + byte[] expected = source.Select(NormalizedFloatToByte).ToArray(); + var actual = new byte[count]; - ref Tuple8.OfByte d = ref MemoryMarshal.Cast(dest)[0]; - d.LoadFrom(ref ii); + convert(source, actual); - this.Output.WriteLine(ii.ToString()); - this.Output.WriteLine(d.ToString()); + Assert.Equal(expected, actual); } + private static byte NormalizedFloatToByte(float f) => (byte)Math.Min(255f, Math.Max(0f, (f * 255f) + 0.5f)); + private static void AssertEvenRoundIsCorrect(Vector r, Vector v) { for (int i = 0; i < Vector.Count; i++) diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelOperations/PixelOperationsTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelOperations/PixelOperationsTests.cs index 8d74ccec40..39786a2177 100644 --- a/tests/ImageSharp.Tests/PixelFormats/PixelOperations/PixelOperationsTests.cs +++ b/tests/ImageSharp.Tests/PixelFormats/PixelOperations/PixelOperationsTests.cs @@ -10,6 +10,7 @@ using SixLabors.ImageSharp.ColorSpaces.Companding; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; +using SixLabors.ImageSharp.Tests.Common; using SixLabors.ImageSharp.Tests.TestUtilities; using Xunit; using Xunit.Abstractions; @@ -1002,6 +1003,19 @@ public void ToRgba64Bytes(int count) (s, d) => this.Operations.ToRgba64Bytes(this.Configuration, s, d.GetSpan(), count)); } + [Theory] + [MemberData(nameof(ArraySizesData))] + public void PackFromRgbPlanes(int count) + { + SimdUtilsTests.TestPackFromRgbPlanes( + count, + ( + r, + g, + b, + actual) => PixelOperations.Instance.PackFromRgbPlanes(this.Configuration, 
r, g, b, actual)); + } + public delegate void RefAction(ref T1 arg1); internal static Vector4[] CreateExpectedVector4Data(TPixel[] source, RefAction vectorModifier = null) @@ -1102,10 +1116,10 @@ internal static TPixel[] CreateScaledPixelTestData(int length, RefAction