From 410dae2295bd1d15a0e65a42327956fad24eaf3a Mon Sep 17 00:00:00 2001 From: Alex Peck Date: Sun, 12 Jan 2025 17:46:24 -0800 Subject: [PATCH] Unroll sketch increment (#653) * unroll * bench * unroll freq * comment * rem extra file --------- --- .../Lfu/CmSketchNoPin.cs | 3 + .../Lfu/SketchFrequency.cs | 18 +++++- .../Lfu/SketchIncrement.cs | 17 ++++- BitFaster.Caching/BitFaster.Caching.csproj | 2 +- BitFaster.Caching/Lfu/CmSketchCore.cs | 63 ++++++++++++------- 5 files changed, 75 insertions(+), 28 deletions(-) diff --git a/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs b/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs index 68428e06..fdc8629a 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/CmSketchNoPin.cs @@ -12,6 +12,9 @@ namespace BitFaster.Caching.Benchmarks.Lfu { + // Block sketch implementation without: + // - Pinned buffer for vector code paths + // - Loop unroll for non-vector code paths internal class CmSketchNoPin where T : notnull where I : struct, IsaProbe diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs index ba206ddf..970c2d9f 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs @@ -24,7 +24,8 @@ public class SketchFrequency private CmSketchFlat flatStd; private CmSketchFlat flatAvx; - private CmSketchCore blockStd; + private CmSketchNoPin blockStdNoUnroll; + private CmSketchCore blockStdUnroll; private CmSketchNoPin blockAvxNoPin; private CmSketchCore blockAvx; @@ -37,7 +38,8 @@ public void Setup() flatStd = new CmSketchFlat(Size, EqualityComparer.Default); flatAvx = new CmSketchFlat(Size, EqualityComparer.Default); - blockStd = new CmSketchCore(Size, EqualityComparer.Default); + blockStdNoUnroll = new CmSketchNoPin(Size, EqualityComparer.Default); + blockStdUnroll = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); } @@ -67,7 +69,17 @@ public int FrequencyBlock() { int count = 0; for (int i = 0; i < iterations; i++) - count += blockStd.EstimateFrequency(i) > blockStd.EstimateFrequency(i + 1) ? 1 : 0; + count += blockStdNoUnroll.EstimateFrequency(i) > blockStdNoUnroll.EstimateFrequency(i + 1) ? 1 : 0; + + return count; + } + + [Benchmark(OperationsPerInvoke = iterations)] + public int FrequencyBlockUnroll() + { + int count = 0; + for (int i = 0; i < iterations; i++) + count += blockStdUnroll.EstimateFrequency(i) > blockStdUnroll.EstimateFrequency(i + 1) ? 1 : 0; return count; } diff --git a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs index e2fb1e02..0f868ed1 100644 --- a/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs +++ b/BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs @@ -23,7 +23,8 @@ public class SketchIncrement private CmSketchFlat flatStd; private CmSketchFlat flatAvx; - private CmSketchCore blockStd; + private CmSketchNoPin blockStdNoUnroll; + private CmSketchCore blockStdUnroll; private CmSketchNoPin blockAvxNoPin; private CmSketchCore blockAvx; @@ -37,7 +38,8 @@ public void Setup() flatStd = new CmSketchFlat(Size, EqualityComparer.Default); flatAvx = new CmSketchFlat(Size, EqualityComparer.Default); - blockStd = new CmSketchCore(Size, EqualityComparer.Default); + blockStdNoUnroll = new CmSketchNoPin(Size, EqualityComparer.Default); + blockStdUnroll = new CmSketchCore(Size, EqualityComparer.Default); blockAvxNoPin = new CmSketchNoPin(Size, EqualityComparer.Default); blockAvx = new CmSketchCore(Size, EqualityComparer.Default); } @@ -65,7 +67,16 @@ public void IncBlock() { for (int i = 0; i < iterations; i++) { - blockStd.Increment(i); + blockStdNoUnroll.Increment(i); + } + } + + [Benchmark(OperationsPerInvoke = iterations)] + public void IncBlockUnroll() + { + for (int i = 0; i < iterations; i++) + { + blockStdUnroll.Increment(i); } } diff --git a/BitFaster.Caching/BitFaster.Caching.csproj b/BitFaster.Caching/BitFaster.Caching.csproj index 20405be8..4860e2e0 100644 --- a/BitFaster.Caching/BitFaster.Caching.csproj +++ b/BitFaster.Caching/BitFaster.Caching.csproj @@ -2,7 +2,7 @@ netstandard2.0;netcoreapp3.1;net6.0 - 10.0 + 11.0 Alex Peck BitFaster.Caching diff --git a/BitFaster.Caching/Lfu/CmSketchCore.cs b/BitFaster.Caching/Lfu/CmSketchCore.cs index 46f516b7..9764c51b 100644 --- a/BitFaster.Caching/Lfu/CmSketchCore.cs +++ b/BitFaster.Caching/Lfu/CmSketchCore.cs @@ -8,6 +8,7 @@ #if !NETSTANDARD2_0 using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; + #endif #if NET6_0_OR_GREATER @@ -169,41 +170,61 @@ private void EnsureCapacity(long maximumSize) private unsafe int EstimateFrequencyStd(T value) { - var count = stackalloc int[4]; int blockHash = Spread(comparer.GetHashCode(value)); int counterHash = Rehash(blockHash); int block = (blockHash & blockMask) << 3; - for (int i = 0; i < 4; i++) - { - int h = (int)((uint)counterHash >> (i << 3)); - int index = (h >> 1) & 15; - int offset = h & 1; - count[i] = (int)(((ulong)table[block + offset + (i << 1)] >> (index << 2)) & 0xfL); - } - return Math.Min(Math.Min(count[0], count[1]), Math.Min(count[2], count[3])); + // Loop unrolling improves throughput + int h0 = counterHash; + int h1 = counterHash >>> 8; + int h2 = counterHash >>> 16; + int h3 = counterHash >>> 24; + + int index0 = (h0 >>> 1) & 15; + int index1 = (h1 >>> 1) & 15; + int index2 = (h2 >>> 1) & 15; + int index3 = (h3 >>> 1) & 15; + + int slot0 = block + (h0 & 1); + int slot1 = block + (h1 & 1) + 2; + int slot2 = block + (h2 & 1) + 4; + int slot3 = block + (h3 & 1) + 6; + + int count0 = (int)((table[slot0] >>> (index0 << 2)) & 0xfL); + int count1 = (int)((table[slot1] >>> (index1 << 2)) & 0xfL); + int count2 = (int)((table[slot2] >>> (index2 << 2)) & 0xfL); + int count3 = (int)((table[slot3] >>> (index3 << 2)) & 0xfL); + + return Math.Min(Math.Min(count0, count1), Math.Min(count2, count3)); } private unsafe void IncrementStd(T value) { - var index = stackalloc int[8]; int blockHash = Spread(comparer.GetHashCode(value)); int counterHash = Rehash(blockHash); int block = (blockHash & blockMask) << 3; - for (int i = 0; i < 4; i++) - { - int h = (int)((uint)counterHash >> (i << 3)); - index[i] = (h >> 1) & 15; - int offset = h & 1; - index[i + 4] = block + offset + (i << 1); - } + // Loop unrolling improves throughput + int h0 = counterHash; + int h1 = counterHash >>> 8; + int h2 = counterHash >>> 16; + int h3 = counterHash >>> 24; + + int index0 = (h0 >>> 1) & 15; + int index1 = (h1 >>> 1) & 15; + int index2 = (h2 >>> 1) & 15; + int index3 = (h3 >>> 1) & 15; + + int slot0 = block + (h0 & 1); + int slot1 = block + (h1 & 1) + 2; + int slot2 = block + (h2 & 1) + 4; + int slot3 = block + (h3 & 1) + 6; bool added = - IncrementAt(index[4], index[0]) - | IncrementAt(index[5], index[1]) - | IncrementAt(index[6], index[2]) - | IncrementAt(index[7], index[3]); + IncrementAt(slot0, index0) + | IncrementAt(slot1, index1) + | IncrementAt(slot2, index2) + | IncrementAt(slot3, index3); if (added && (++size == sampleSize)) {