Skip to content

Commit

Permalink
Improve .NET 8 stream decompression perf (#84)
Browse files Browse the repository at this point in the history
Motivation
----------
.NET 8 shows a significant performance regression when decompressing streams,
though not when decompressing blocks.

Modifications
-------------
Remove `AggressiveInlining` from some cold-path methods (`ReadChunkHeader`
and `ReadChunkCrc`), which may provide better enregistration. Also, on .NET 8
use an inline array to avoid allocating the scratch buffer on the heap.

Results
-------
The regression has been effectively eliminated.

BenchmarkDotNet v0.13.10, Windows 11 (10.0.22621.2861/22H2/2022Update/SunValley2)
12th Gen Intel Core i7-1270P, 1 CPU, 16 logical and 12 physical cores
.NET SDK 8.0.100
  [Host]     : .NET 8.0.0 (8.0.23.53103), X64 RyuJIT AVX2
  Job-EHUKKX : .NET Framework 4.8.1 (4.8.9181.0), X64 RyuJIT VectorSize=256
  Job-SIHOZS : .NET Framework 4.8.1 (4.8.9181.0), X64 RyuJIT VectorSize=256
  Job-VAGBBP : .NET 6.0.25 (6.0.2523.51912), X64 RyuJIT AVX2
  Job-ZACCID : .NET 6.0.25 (6.0.2523.51912), X64 RyuJIT AVX2
  Job-ACXPKE : .NET 8.0.0 (8.0.23.53103), X64 RyuJIT AVX2
  Job-JFCQYT : .NET 8.0.0 (8.0.23.53103), X64 RyuJIT AVX2
  Job-CWWXAC : .NET 8.0.0 (8.0.23.53103), X64 RyuJIT AVX2
  Job-IDGNPX : .NET 8.0.0 (8.0.23.53103), X64 RyuJIT AVX2

| Method | Runtime | BuildConfiguration | PGO | Mean | Error | StdDev | Ratio | Rank |
|----------- |------------------- |------------------- |---- |---------:|--------:|--------:|------:|-----:|
| Decompress | .NET Framework 4.8 | Previous | N | 342.7 us | 1.73 us | 1.45 us | 1.00 | 2 |
| Decompress | .NET Framework 4.8 | Default | N | 334.7 us | 0.85 us | 0.80 us | 0.98 | 1 |
| | | | | | | | | |
| Decompress | .NET 6.0 | Previous | N | 168.5 us | 1.43 us | 1.34 us | 1.00 | 2 |
| Decompress | .NET 6.0 | Default | N | 164.8 us | 0.78 us | 0.73 us | 0.98 | 1 |
| | | | | | | | | |
| Decompress | .NET 8.0 | Previous | N | 199.9 us | 1.64 us | 1.53 us | 1.00 | 2 |
| Decompress | .NET 8.0 | Default | N | 160.7 us | 1.42 us | 1.33 us | 0.80 | 1 |
| | | | | | | | | |
| Decompress | .NET 8.0 | Previous | Y | 191.3 us | 1.05 us | 0.98 us | 1.00 | 2 |
| Decompress | .NET 8.0 | Default | Y | 160.9 us | 1.50 us | 1.40 us | 0.84 | 1 |

Note: Benchmarks include other improvements since 1.1.3
  • Loading branch information
brantburnett authored Jan 6, 2024
1 parent 9d07b42 commit 77834c8
Showing 1 changed file with 25 additions and 10 deletions.
35 changes: 25 additions & 10 deletions Snappier/Internal/SnappyStreamDecompressor.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
using System;
using System.Buffers.Binary;
using System.Diagnostics;
using System.IO;
using System.Runtime.CompilerServices;

namespace Snappier.Internal
{
Expand All @@ -13,11 +11,30 @@ internal sealed class SnappyStreamDecompressor : IDisposable
{
private const int ScratchBufferSize = 4;

// Scratch storage of ScratchBufferSize (4) bytes, used to hold a partially read
// 4-byte value (chunk header or chunk CRC) between calls.
// .NET 8+ builds declare it as an inline-array struct field; older targets fall
// back to a separately allocated byte[].
#if NET8_0_OR_GREATER
// The element field below is never referenced by name, so the unused-member (IDE0051),
// make-readonly (IDE0044) and never-assigned (CS0649) diagnostics are suppressed
// for this region.
#pragma warning disable IDE0051
#pragma warning disable IDE0044
#pragma warning disable CS0649 // Field is never assigned to, and will always have its default value
/// <summary>
/// Fixed-size buffer of <see cref="ScratchBufferSize"/> bytes declared as an inline array.
/// </summary>
[System.Runtime.CompilerServices.InlineArray(ScratchBufferSize)]
private struct ScratchBuffer
{
// Single element field required by InlineArray; the attribute's length argument
// determines how many consecutive elements the struct holds.
private byte _element0;
}

// NOTE(review): presumably left non-readonly so it can be exposed as a writable Span<byte> - confirm.
private ScratchBuffer _scratch;
#pragma warning restore CS0649 // Field is never assigned to, and will always have its default value
#pragma warning restore IDE0044
#pragma warning restore IDE0051
#else
// Fallback for targets without inline arrays: a ScratchBufferSize-byte array.
private readonly byte[] _scratch = new byte[ScratchBufferSize];
#endif

/// <summary>
/// Gets a <see cref="Span{T}"/> view over the bytes held in <c>_scratch</c>,
/// relying on the implicit conversion of the backing field to a span.
/// </summary>
private Span<byte> Scratch
{
    get { return _scratch; }
}

private SnappyDecompressor? _decompressor = new();

private ReadOnlyMemory<byte> _input;

private readonly byte[] _scratch = new byte[ScratchBufferSize];
private int _scratchLength;
private Constants.ChunkType _chunkType = Constants.ChunkType.Null;
private int _chunkSize;
Expand Down Expand Up @@ -196,14 +213,13 @@ public void SetInput(ReadOnlyMemory<byte> input)
_input = input;
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private uint ReadChunkHeader(ref ReadOnlySpan<byte> buffer)
{
if (_scratchLength > 0)
{
int bytesToCopyToScratch = 4 - _scratchLength;

Span<byte> scratch = _scratch.AsSpan();
Span<byte> scratch = Scratch;
buffer.Slice(0, bytesToCopyToScratch).CopyTo(scratch.Slice(_scratchLength));

buffer = buffer.Slice(bytesToCopyToScratch);
Expand All @@ -223,7 +239,7 @@ private uint ReadChunkHeader(ref ReadOnlySpan<byte> buffer)
{
// Insufficient data

buffer.CopyTo(_scratch);
buffer.CopyTo(Scratch);

_scratchLength = buffer.Length;
buffer = default;
Expand All @@ -240,9 +256,8 @@ private uint ReadChunkHeader(ref ReadOnlySpan<byte> buffer)

/// <summary>
/// Assuming that we're at the beginning of a chunk, reads the CRC. If partially read, stores the value in
/// _scratch for subsequent reads. Should not be called if chunkByteProcessed >= 4.
/// Scratch for subsequent reads. Should not be called if chunkByteProcessed >= 4.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private bool ReadChunkCrc(ref ReadOnlySpan<byte> input)
{
Debug.Assert(_chunkBytesProcessed < 4);
Expand All @@ -259,14 +274,14 @@ private bool ReadChunkCrc(ref ReadOnlySpan<byte> input)

// Copy to scratch
int crcBytesAvailable = Math.Min(input.Length, 4 - _chunkBytesProcessed);
input.Slice(0, crcBytesAvailable).CopyTo(_scratch.AsSpan(_scratchLength));
input.Slice(0, crcBytesAvailable).CopyTo(Scratch.Slice(_scratchLength));
_scratchLength += crcBytesAvailable;
input = input.Slice(crcBytesAvailable);
_chunkBytesProcessed += crcBytesAvailable;

if (_scratchLength >= 4)
{
_expectedChunkCrc = BinaryPrimitives.ReadUInt32LittleEndian(_scratch);
_expectedChunkCrc = BinaryPrimitives.ReadUInt32LittleEndian(Scratch);
_scratchLength = 0;
return true;
}
Expand Down

0 comments on commit 77834c8

Please sign in to comment.