From 1a1d1684776e9c206f9bed54e7d1742627e76c2b Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Fri, 23 Oct 2020 18:48:52 +0200 Subject: [PATCH] Implement intersperse using SSE2 --- bench/BenchAll.hs | 4 ++++ cbits/fpstring.c | 20 +++++++++++++++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs index 40d76007f..79eb4d566 100644 --- a/bench/BenchAll.hs +++ b/bench/BenchAll.hs @@ -361,6 +361,10 @@ main = do , benchFE "floatHexFixed" $ fromIntegral >$< P.floatHexFixed , benchFE "doubleHexFixed" $ fromIntegral >$< P.doubleHexFixed ] + , bgroup "intersperse" + [ bench "intersperse" $ whnf (S.intersperse 32) byteStringData + , bench "intersperse (unaligned)" $ whnf (S.intersperse 32) (S.drop 1 byteStringData) + ] , bgroup "partition" [ bgroup "strict" diff --git a/cbits/fpstring.c b/cbits/fpstring.c index f82bc00e5..9beb00d99 100644 --- a/cbits/fpstring.c +++ b/cbits/fpstring.c @@ -30,6 +30,10 @@ */ #include "fpstring.h" +#if defined(__x86_64__) +#include <emmintrin.h> +#include <xmmintrin.h> +#endif /* copy a string in reverse */ void fps_reverse(unsigned char *q, unsigned char *p, size_t n) { @@ -44,7 +48,21 @@ void fps_intersperse(unsigned char *q, unsigned char *p, size_t n, unsigned char c) { - +#if defined(__x86_64__) + { + const __m128i separator = _mm_set1_epi8(c); + const unsigned char *const p_begin = p; + const unsigned char *const p_end = p_begin + n - 9; + while (p < p_end) { + const __m128i eight_src_bytes = _mm_loadl_epi64((__m128i *)p); + const __m128i sixteen_dst_bytes = _mm_unpacklo_epi8(eight_src_bytes, separator); + _mm_storeu_si128((__m128i *)q, sixteen_dst_bytes); + p += 8; + q += 16; + } + n -= p - p_begin; + } +#endif while (n > 1) { *q++ = *p++; *q++ = c;