Skip to content

Commit

Permalink
Using SSE2 Vector128 for RLE and data copy
Browse files: browse the repository at this point in the history
  • Loading branch information
neon-nyan committed Aug 13, 2023
1 parent ac47f40 commit 2715d9a
Showing 1 changed file with 21 additions and 15 deletions.
36 changes: 21 additions & 15 deletions Hi3Helper.SharpHDiffPatch/PatchCore/PatchCore.cs
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,7 +3,8 @@
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using ZstdNet;

namespace Hi3Helper.SharpHDiffPatch
Expand Down Expand Up @@ -40,7 +41,6 @@ internal class PatchCore
{
private const int _kSignTagBit = 1;
private const int _kByteRleType = 2;
private static int vecByteSize = Vector.IsHardwareAccelerated ? Vector<byte>.Count : 0;

internal static Stream GetBufferStreamFromOffset(CompressionMode compMode, Stream sourceStream,
long start, long length, long compLength, out long outLength, bool isBuffered)
Expand Down Expand Up @@ -320,7 +320,7 @@ private static void _TBytesRle_load_stream_decode_add(ref RLERefClipStruct rleLo
}
}

private static void _TBytesRle_load_stream_mem_add(ref RLERefClipStruct rleLoader, Stream outCache, ref long copyLength)
private static unsafe void _TBytesRle_load_stream_mem_add(ref RLERefClipStruct rleLoader, Stream outCache, ref long copyLength)
{
if (rleLoader.memSetLength != 0)
{
Expand All @@ -346,29 +346,35 @@ private static void _TBytesRle_load_stream_mem_add(ref RLERefClipStruct rleLoade
rleLoader.memSetLength -= memSetStep;
}

long decodeStep = rleLoader.memCopyLength > copyLength ? copyLength : rleLoader.memCopyLength;
int decodeStep = (int)(rleLoader.memCopyLength > copyLength ? copyLength : rleLoader.memCopyLength);
if (decodeStep == 0) return;

byte[] rleData = new byte[decodeStep];
byte[] oldData = new byte[decodeStep];
Span<byte> rleData = stackalloc byte[decodeStep];
Span<byte> oldData = stackalloc byte[decodeStep];
rleLoader.rleCodeClip.BaseStream.Read(rleData);

long lastPosCopy = outCache.Position;
outCache.Read(oldData);
outCache.Position = lastPosCopy;

long remaining = decodeStep % vecByteSize;
int iCopy;
for (iCopy = 0; iCopy < decodeStep - remaining; iCopy += vecByteSize)
fixed (byte* rlePtr = rleData)
fixed (byte* oldPtr = oldData)
{
var _srcVec = new Vector<byte>(oldData, iCopy);
var _toVec = new Vector<byte>(rleData, iCopy);
Vector.Add(_srcVec, _toVec).CopyTo(rleData, iCopy);
}
int offset;
long offsetRemained = decodeStep % Vector128<byte>.Count;
for (offset = 0; offset < decodeStep - offsetRemained; offset += Vector128<byte>.Count)
{
Vector128<byte> rleVector = Sse2.LoadVector128(rlePtr + offset);
Vector128<byte> oldVector = Sse2.LoadVector128(oldPtr + offset);
Vector128<byte> resultVector = Sse2.Add(rleVector, oldVector);

while (iCopy < decodeStep) rleData[iCopy] += oldData[iCopy++];
Sse2.Store(rlePtr + offset, resultVector);
}

outCache.Write(rleData);
while (offset < decodeStep) rleData[offset] += oldData[offset++];

outCache.Write(rleData);
}

rleLoader.memCopyLength -= decodeStep;
copyLength -= decodeStep;
Expand Down

0 comments on commit 2715d9a

Please sign in to comment.