Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement LFU sketch using arm64 intrinsics #595

Draft
wants to merge 11 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions BitFaster.Caching.Benchmarks/Lfu/SketchFrequency.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
namespace BitFaster.Caching.Benchmarks.Lfu
{
[SimpleJob(RuntimeMoniker.Net60)]
[SimpleJob(RuntimeMoniker.Net80)]
[MemoryDiagnoser(displayGenColumns: false)]
[HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")]
[ColumnChart(Title ="Sketch Frequency ({JOB})")]
Expand All @@ -22,7 +23,7 @@ public class SketchFrequency
private CmSketchCore<int, DisableHardwareIntrinsics> blockStd;
private CmSketchCore<int, DetectIsa> blockAvx;

[Params(32_768, 524_288, 8_388_608, 134_217_728)]
[Params(1024, 32_768, 524_288, 8_388_608, 134_217_728)]
public int Size { get; set; }

[GlobalSetup]
Expand All @@ -45,7 +46,7 @@ public int FrequencyFlat()
return count;
}

[Benchmark(OperationsPerInvoke = iterations)]
// [Benchmark(OperationsPerInvoke = iterations)]
public int FrequencyFlatAvx()
{
int count = 0;
Expand Down
7 changes: 5 additions & 2 deletions BitFaster.Caching.Benchmarks/Lfu/SketchIncrement.cs
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@

using System.Collections.Generic;
using Benchly;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Jobs;
using BitFaster.Caching.Lfu;

namespace BitFaster.Caching.Benchmarks.Lfu
{
[SimpleJob(RuntimeMoniker.Net60)]
[SimpleJob(RuntimeMoniker.Net80)]
[MemoryDiagnoser(displayGenColumns: false)]
[HideColumns("Job", "Median", "RatioSD", "Alloc Ratio")]
[ColumnChart(Title = "Sketch Increment ({JOB})")]
public class SketchIncrement
{
const int iterations = 1_048_576;
Expand All @@ -19,7 +22,7 @@ public class SketchIncrement
private CmSketchCore<int, DisableHardwareIntrinsics> blockStd;
private CmSketchCore<int, DetectIsa> blockAvx;

[Params(32_768, 524_288, 8_388_608, 134_217_728)]
[Params(1024, 32_768, 524_288, 8_388_608, 134_217_728)]
public int Size { get; set; }

[GlobalSetup]
Expand All @@ -41,7 +44,7 @@ public void IncFlat()
}
}

[Benchmark(OperationsPerInvoke = iterations)]
//[Benchmark(OperationsPerInvoke = iterations)]
public void IncFlatAvx()
{
for (int i = 0; i < iterations; i++)
Expand Down
10 changes: 10 additions & 0 deletions BitFaster.Caching.UnitTests/Intrinsics.cs
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#if NETCOREAPP3_1_OR_GREATER
using System.Runtime.Intrinsics.X86;
#endif
#if NET6_0_OR_GREATER
using System.Runtime.Intrinsics.Arm;
#endif

using Xunit;

namespace BitFaster.Caching.UnitTests
Expand All @@ -10,8 +14,14 @@ public static class Intrinsics
public static void SkipAvxIfNotSupported<I>()
{
#if NETCOREAPP3_1_OR_GREATER
#if NET6_0_OR_GREATER
// when we are trying to test Avx2/Arm64, skip the test if it's not supported
Skip.If(typeof(I) == typeof(DetectIsa) && !(Avx2.IsSupported || AdvSimd.Arm64.IsSupported));
#else
// when we are trying to test Avx2, skip the test if it's not supported
Skip.If(typeof(I) == typeof(DetectIsa) && !Avx2.IsSupported);
#endif

#else
Skip.If(true);
#endif
Expand Down
15 changes: 12 additions & 3 deletions BitFaster.Caching.UnitTests/Lfu/CmSketchTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,12 @@

namespace BitFaster.Caching.UnitTests.Lfu
{
// Runs the shared sketch tests with hardware intrinsics (AVX2 or ARM64 AdvSimd) when either is supported
public class CMSketchIntrinsicsTests : CmSketchTestBase<DetectIsa>
{
}

// Runs the shared sketch tests with AVX2/ARM64 intrinsics disabled
public class CmSketchTests : CmSketchTestBase<DisableHardwareIntrinsics>
{
}
Expand All @@ -29,14 +29,23 @@ public CmSketchTestBase()
// Verifies that the sketch under test (intrinsics selected by I) produces exactly the same
// size and per-key frequency estimates as a sketch with hardware intrinsics disabled.
public void Repro()
{
    // Used both as the sketch capacity and as the exclusive upper bound of the key range.
    const int capacity = 1_048_576;

    sketch = new CmSketchCore<int, I>(capacity, EqualityComparer<int>.Default);
    var baseline = new CmSketchCore<int, DisableHardwareIntrinsics>(capacity, EqualityComparer<int>.Default);

    // Feed both sketches the identical sequence: every key divisible by 3.
    for (int i = 0; i < capacity; i++)
    {
        if (i % 3 == 0)
        {
            sketch.Increment(i);
            baseline.Increment(i);
        }
    }

    // The sketch under test is the subject and the baseline the expectation, so a failure
    // message reports the intrinsic result as the actual value.
    sketch.Size.Should().Be(baseline.Size);

    for (int i = 0; i < capacity; i++)
    {
        sketch.EstimateFrequency(i).Should().Be(baseline.EstimateFrequency(i));
    }
}


Expand Down
28 changes: 25 additions & 3 deletions BitFaster.Caching/Intrinsics.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
using System.Runtime.Intrinsics.X86;
#endif

#if NET6_0_OR_GREATER
using System.Runtime.Intrinsics.Arm;
#endif

namespace BitFaster.Caching
{
/// <summary>
Expand All @@ -12,7 +16,14 @@ public interface IsaProbe
/// <summary>
/// Gets a value indicating whether AVX2 is supported.
/// </summary>
bool IsAvx2Supported { get; }
bool IsAvx2Supported { get; }

#if NET6_0_OR_GREATER
/// <summary>
/// Gets a value indicating whether Arm64 is supported.
/// </summary>
bool IsArm64Supported { get => false; }
#endif
}

/// <summary>
Expand All @@ -25,7 +36,15 @@ public interface IsaProbe
public bool IsAvx2Supported => false;
#else
/// <inheritdoc/>
public bool IsAvx2Supported => Avx2.IsSupported;
public bool IsAvx2Supported => Avx2.IsSupported;
#endif

#if NET6_0_OR_GREATER
/// <inheritdoc/>
public bool IsArm64Supported => AdvSimd.Arm64.IsSupported;
#else
/// <inheritdoc/>
public bool IsArm64Supported => false;
#endif
}

Expand All @@ -35,6 +54,9 @@ public interface IsaProbe
public readonly struct DisableHardwareIntrinsics : IsaProbe
{
    /// <inheritdoc/>
    public bool IsAvx2Supported => false;

    /// <inheritdoc/>
    public bool IsArm64Supported => false;
}
}
106 changes: 106 additions & 0 deletions BitFaster.Caching/Lfu/CmSketchCore.cs
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;


#if !NETSTANDARD2_0
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
#endif

#if NET6_0_OR_GREATER
using System.Runtime.Intrinsics.Arm;
#endif

namespace BitFaster.Caching.Lfu
{
/// <summary>
Expand Down Expand Up @@ -76,6 +81,12 @@ public int EstimateFrequency(T value)
{
return EstimateFrequencyAvx(value);
}
#if NET6_0_OR_GREATER
else if (isa.IsArm64Supported)
{
return EstimateFrequencyArm(value);
}
#endif
else
{
return EstimateFrequencyStd(value);
Expand All @@ -99,6 +110,12 @@ public void Increment(T value)
{
IncrementAvx(value);
}
#if NET6_0_OR_GREATER
else if (isa.IsArm64Supported)
{
IncrementArm(value);
}
#endif
else
{
IncrementStd(value);
Expand Down Expand Up @@ -329,5 +346,94 @@ private unsafe void IncrementAvx(T value)
}
}
#endif

#if NET6_0_OR_GREATER
/// <summary>
/// Increments the four counters selected by the hash of <paramref name="value"/> using Arm64 AdvSimd intrinsics.
/// </summary>
/// <remarks>
/// Each counter is 4 bits wide, sixteen to a table entry. A counter that already holds 15 is left unchanged.
/// If at least one counter was incremented, size is advanced and Reset() is called when it reaches sampleSize.
/// </remarks>
/// <param name="value">The value whose counters are incremented.</param>
[MethodImpl(MethodImplOptions.AggressiveOptimization | MethodImplOptions.AggressiveInlining)]
private unsafe void IncrementArm(T value)
{
    int blockHash = Spread(comparer.GetHashCode(value));
    int counterHash = Rehash(blockHash);

    // First table index of the 8-entry block selected by the hash.
    int block = (blockHash & blockMask) << 3;

    // ShiftArithmetic shifts each lane by a signed count; the negative counts shift right,
    // giving counterHash >> 0, >> 8, >> 16 and >> 24 in the four lanes.
    Vector128<int> h = AdvSimd.ShiftArithmetic(Vector128.Create(counterHash), Vector128.Create(0, -8, -16, -24));

    // Per lane: bits 1..4 pick one of the sixteen counters inside a table entry...
    Vector128<int> index = AdvSimd.And(AdvSimd.ShiftRightLogical(h, 1), Vector128.Create(0xf));

    // ...and bit 0 picks the even or odd entry of the lane's pair (0/1, 2/3, 4/5, 6/7) within the block.
    Vector128<int> blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6));

    fixed (long* tablePtr = table)
    {
        int t0 = AdvSimd.Extract(blockOffset, 0);
        int t1 = AdvSimd.Extract(blockOffset, 1);
        int t2 = AdvSimd.Extract(blockOffset, 2);
        int t3 = AdvSimd.Extract(blockOffset, 3);

        // Gather the four table entries into two 2 x 64-bit vectors.
        Vector128<long> tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + t0), AdvSimd.LoadVector64(tablePtr + t1));
        Vector128<long> tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + t2), AdvSimd.LoadVector64(tablePtr + t3));

        // Counter index -> bit offset (4 bits per counter).
        index = AdvSimd.ShiftLeftLogicalSaturate(index, 2);

        // Place each 32-bit bit offset in the low half of a 64-bit lane (int lanes 0 and 2)
        // so it can be used as a per-lane 64-bit shift count.
        Vector128<int> longOffA = AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128<int>.Zero, 0, index, 0), 2, index, 1);
        Vector128<int> longOffB = AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128<int>.Zero, 0, index, 2), 2, index, 3);

        // 0xf shifted to each counter's position.
        Vector128<long> fifteen = Vector128.Create(0xfL);
        Vector128<long> maskA = AdvSimd.ShiftArithmetic(fifteen, longOffA.AsInt64());
        Vector128<long> maskB = AdvSimd.ShiftArithmetic(fifteen, longOffB.AsInt64());

        // All ones in lanes whose counter is not yet 15, zero in lanes whose counter is saturated.
        Vector128<long> maskedA = AdvSimd.Not(AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorA, maskA), maskA));
        Vector128<long> maskedB = AdvSimd.Not(AdvSimd.Arm64.CompareEqual(AdvSimd.And(tableVectorB, maskB), maskB));

        // 1 shifted to each counter's position, or 0 when that counter is saturated.
        var one = Vector128.Create(1L);
        Vector128<long> incA = AdvSimd.And(maskedA, AdvSimd.ShiftArithmetic(one, longOffA.AsInt64()));
        Vector128<long> incB = AdvSimd.And(maskedB, AdvSimd.ShiftArithmetic(one, longOffB.AsInt64()));

        tablePtr[t0] += AdvSimd.Extract(incA, 0);
        tablePtr[t1] += AdvSimd.Extract(incA, 1);
        tablePtr[t2] += AdvSimd.Extract(incB, 0);
        tablePtr[t3] += AdvSimd.Extract(incB, 1);

        // Horizontal max over both increment vectors: non-zero when any counter was incremented.
        var max = AdvSimd.Arm64.MaxAcross(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.MaxAcross(incA.AsInt32()), 1, AdvSimd.Arm64.MaxAcross(incB.AsInt32()), 0).AsInt16());

        if (max.ToScalar() != 0 && (++size == sampleSize))
        {
            Reset();
        }
    }
}

/// <summary>
/// Estimates the frequency of <paramref name="value"/> using Arm64 AdvSimd intrinsics.
/// </summary>
/// <param name="value">The value whose frequency is estimated.</param>
/// <returns>The minimum of the four 4-bit counters selected by the hash of the value.</returns>
[MethodImpl(MethodImplOptions.AggressiveOptimization | MethodImplOptions.AggressiveInlining)]
private unsafe int EstimateFrequencyArm(T value)
{
    int blockHash = Spread(comparer.GetHashCode(value));
    int counterHash = Rehash(blockHash);

    // First table index of the 8-entry block selected by the hash.
    int block = (blockHash & blockMask) << 3;

    // ShiftArithmetic shifts each lane by a signed count; the negative counts shift right,
    // giving counterHash >> 0, >> 8, >> 16 and >> 24 in the four lanes.
    Vector128<int> h = AdvSimd.ShiftArithmetic(Vector128.Create(counterHash), Vector128.Create(0, -8, -16, -24));

    // Per lane: bits 1..4 pick the counter inside a table entry, bit 0 picks the even or odd entry of the lane's pair.
    Vector128<int> index = AdvSimd.And(AdvSimd.ShiftRightLogical(h, 1), Vector128.Create(0xf));
    Vector128<int> blockOffset = AdvSimd.Add(AdvSimd.Add(Vector128.Create(block), AdvSimd.And(h, Vector128.Create(1))), Vector128.Create(0, 2, 4, 6));

    fixed (long* tablePtr = table)
    {
        // Gather the four table entries into two 2 x 64-bit vectors.
        Vector128<long> tableVectorA = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 0)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 1)));
        Vector128<long> tableVectorB = Vector128.Create(AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 2)), AdvSimd.LoadVector64(tablePtr + AdvSimd.Extract(blockOffset, 3)));

        // Counter index -> bit offset (4 bits per counter).
        index = AdvSimd.ShiftLeftLogicalSaturate(index, 2);

        // Widen the bit offsets to 64-bit lanes and negate them, so that ShiftArithmetic below
        // shifts right and brings each selected counter down to the low 4 bits.
        Vector128<int> indexA = AdvSimd.Negate(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128<int>.Zero, 0, index, 0), 2, index, 1));
        Vector128<int> indexB = AdvSimd.Negate(AdvSimd.Arm64.InsertSelectedScalar(AdvSimd.Arm64.InsertSelectedScalar(Vector128<int>.Zero, 0, index, 2), 2, index, 3));

        var fifteen = Vector128.Create(0xfL);
        Vector128<long> a = AdvSimd.And(AdvSimd.ShiftArithmetic(tableVectorA, indexA.AsInt64()), fifteen);
        Vector128<long> b = AdvSimd.And(AdvSimd.ShiftArithmetic(tableVectorB, indexB.AsInt64()), fifteen);

        // Pack the low 32 bits of each of the four 64-bit counts into a single vector:
        // the lookup copies bytes 0-3 and 8-11 of 'a' into the low half (the 0xFF indices are
        // out of range), then the extension copies the same bytes of 'b' into the high half
        // while leaving the low half untouched.
        // Before: < 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, A, B, C, D, E, F >
        // After: < 0, 1, 2, 3, 8, 9, A, B, 4, 5, 6, 7, C, D, E, F >
        // NOTE(review): on later versions of .NET, VectorTableLookup can also take two or more
        // 128-bit registers as the table (see dotnet/runtime#87126), which may allow the
        // lookup + extension pair to be merged into one call.
        var min = AdvSimd.Arm64.VectorTableLookup(a.AsByte(), Vector128.Create(0x0B0A090803020100, 0xFFFFFFFFFFFFFFFF).AsByte());
        min = AdvSimd.Arm64.VectorTableLookupExtension(min, b.AsByte(), Vector128.Create(0xFFFFFFFFFFFFFFFF, 0x0B0A090803020100).AsByte());

        // Horizontal minimum over the four packed counts.
        var min32 = AdvSimd.Arm64.MinAcross(min.AsInt32());

        return min32.ToScalar();
    }
}
#endif
}
}
Loading