From 2715d9a8bcfd0aae82ca64781824119ee200c4bb Mon Sep 17 00:00:00 2001 From: Kemal Setya Adhi Date: Sun, 13 Aug 2023 21:21:20 +0700 Subject: [PATCH] Using SSE2 Vector128 for RLE and data copy --- .../PatchCore/PatchCore.cs | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/Hi3Helper.SharpHDiffPatch/PatchCore/PatchCore.cs b/Hi3Helper.SharpHDiffPatch/PatchCore/PatchCore.cs index ac7e0c1..b570847 100644 --- a/Hi3Helper.SharpHDiffPatch/PatchCore/PatchCore.cs +++ b/Hi3Helper.SharpHDiffPatch/PatchCore/PatchCore.cs @@ -3,7 +3,8 @@ using System.Collections.Generic; using System.IO; using System.IO.Compression; -using System.Numerics; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; using ZstdNet; namespace Hi3Helper.SharpHDiffPatch @@ -40,7 +41,6 @@ internal class PatchCore { private const int _kSignTagBit = 1; private const int _kByteRleType = 2; - private static int vecByteSize = Vector.IsHardwareAccelerated ? Vector<byte>.Count : 0; internal static Stream GetBufferStreamFromOffset(CompressionMode compMode, Stream sourceStream, long start, long length, long compLength, out long outLength, bool isBuffered) @@ -320,7 +320,7 @@ private static void _TBytesRle_load_stream_decode_add(ref RLERefClipStruct rleLo } } - private static void _TBytesRle_load_stream_mem_add(ref RLERefClipStruct rleLoader, Stream outCache, ref long copyLength) + private static unsafe void _TBytesRle_load_stream_mem_add(ref RLERefClipStruct rleLoader, Stream outCache, ref long copyLength) { if (rleLoader.memSetLength != 0) { @@ -346,29 +346,35 @@ private static void _TBytesRle_load_stream_mem_add(ref RLERefClipStruct rleLoade rleLoader.memSetLength -= memSetStep; } - long decodeStep = rleLoader.memCopyLength > copyLength ? copyLength : rleLoader.memCopyLength; + int decodeStep = (int)(rleLoader.memCopyLength > copyLength ? 
copyLength : rleLoader.memCopyLength); if (decodeStep == 0) return; - byte[] rleData = new byte[decodeStep]; - byte[] oldData = new byte[decodeStep]; + Span<byte> rleData = stackalloc byte[decodeStep]; + Span<byte> oldData = stackalloc byte[decodeStep]; rleLoader.rleCodeClip.BaseStream.Read(rleData); long lastPosCopy = outCache.Position; outCache.Read(oldData); outCache.Position = lastPosCopy; - long remaining = decodeStep % vecByteSize; - int iCopy; - for (iCopy = 0; iCopy < decodeStep - remaining; iCopy += vecByteSize) + fixed (byte* rlePtr = rleData) + fixed (byte* oldPtr = oldData) { - var _srcVec = new Vector<byte>(oldData, iCopy); - var _toVec = new Vector<byte>(rleData, iCopy); - Vector.Add(_srcVec, _toVec).CopyTo(rleData, iCopy); - } + int offset; + long offsetRemained = decodeStep % Vector128<byte>.Count; + for (offset = 0; offset < decodeStep - offsetRemained; offset += Vector128<byte>.Count) + { + Vector128<byte> rleVector = Sse2.LoadVector128(rlePtr + offset); + Vector128<byte> oldVector = Sse2.LoadVector128(oldPtr + offset); + Vector128<byte> resultVector = Sse2.Add(rleVector, oldVector); - while (iCopy < decodeStep) rleData[iCopy] += oldData[iCopy++]; + Sse2.Store(rlePtr + offset, resultVector); + } - outCache.Write(rleData); + while (offset < decodeStep) rleData[offset] += oldData[offset++]; + + outCache.Write(rleData); + } rleLoader.memCopyLength -= decodeStep; copyLength -= decodeStep;