From 77775c51ac485635f53ad39d8292361683efbeba Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Sat, 3 Oct 2020 14:58:49 +0200 Subject: [PATCH 1/4] Use SSE2 in the x86_64 C version of decodeUtf8 --- cbits/cbits.c | 52 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 15 deletions(-) diff --git a/cbits/cbits.c b/cbits/cbits.c index 46357011..fd724d46 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c @@ -9,8 +9,14 @@ #include <string.h> #include <stdint.h> #include <stdio.h> +#if defined(__x86_64__) +#include <emmintrin.h> +#include <xmmintrin.h> +#endif + #include "text_cbits.h" + void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff, size_t n) { @@ -157,24 +163,40 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, */ if (state == UTF8_ACCEPT) { +#if defined(__x86_64__) + const __m128i zeros = _mm_set1_epi32(0); + while (s < srcend - 8) { + const uint64_t hopefully_eight_ascii_chars = *((uint64_t *) s); + if ((hopefully_eight_ascii_chars & 0x8080808080808080LL) != 0LL) + break; + s += 8; + + /* Load 8 bytes of ASCII data */ + const __m128i eight_ascii_chars = _mm_cvtsi64_si128(hopefully_eight_ascii_chars); + /* Interleave with zeros */ + const __m128i eight_utf16_chars = _mm_unpacklo_epi8(eight_ascii_chars, zeros); + /* Store the resulting 8 bytes into destination */ + _mm_storeu_si128((__m128i *)d, eight_utf16_chars); + d += 8; + } +#else while (s < srcend - 4) { - codepoint = *((uint32_t *) s); - if ((codepoint & 0x80808080) != 0) - break; - s += 4; - - /* - * Tried 32-bit stores here, but the extra bit-twiddling - * slowed the code down. 
+ */ + *d++ = (uint16_t) (codepoint & 0xff); + *d++ = (uint16_t) ((codepoint >> 8) & 0xff); + *d++ = (uint16_t) ((codepoint >> 16) & 0xff); + *d++ = (uint16_t) ((codepoint >> 24) & 0xff); } +#endif last = s; - } + } /* end if (state == UTF8_ACCEPT) */ #endif if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) { From eaab373151b2d2e009c34089d1d37bbb70ac61dd Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Mon, 21 Sep 2020 23:31:30 +0200 Subject: [PATCH 2/4] Use SSE2 in the x86_64 C version of decodeLatin1 --- benchmarks/haskell/Benchmarks.hs | 1 + benchmarks/haskell/Benchmarks/DecodeUtf8.hs | 12 ++++++++++++ cbits/cbits.c | 19 ++++++++++++++++++- 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/benchmarks/haskell/Benchmarks.hs b/benchmarks/haskell/Benchmarks.hs index 91b429db..6c03f6f6 100644 --- a/benchmarks/haskell/Benchmarks.hs +++ b/benchmarks/haskell/Benchmarks.hs @@ -44,6 +44,7 @@ main = do , env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmark "ascii") , env (DecodeUtf8.initEnv (tf "russian.txt")) (DecodeUtf8.benchmark "russian") , env (DecodeUtf8.initEnv (tf "japanese.txt")) (DecodeUtf8.benchmark "japanese") + , env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmarkASCII) , EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯" , env (Equality.initEnv (tf "japanese.txt")) Equality.benchmark , FileRead.benchmark (tf "russian.txt") diff --git a/benchmarks/haskell/Benchmarks/DecodeUtf8.hs b/benchmarks/haskell/Benchmarks/DecodeUtf8.hs index e7a0d7a7..22418c6c 100644 --- a/benchmarks/haskell/Benchmarks/DecodeUtf8.hs +++ b/benchmarks/haskell/Benchmarks/DecodeUtf8.hs @@ -17,6 +17,7 @@ module Benchmarks.DecodeUtf8 ( initEnv , benchmark + , benchmarkASCII ) where import Foreign.C.Types @@ -62,6 +63,17 @@ benchmark kind ~(bs, lbs) = , bench "LazyInitLength" $ nf (TL.length . TL.init . 
TL.decodeUtf8) lbs ] +benchmarkASCII :: Env -> Benchmark +benchmarkASCII ~(bs, lbs) = + bgroup "DecodeASCII" + [ C.bench "strict decodeUtf8" $ nf T.decodeUtf8 bs + , C.bench "strict decodeLatin1" $ nf T.decodeLatin1 bs + , C.bench "strict decodeASCII" $ nf T.decodeASCII bs + , C.bench "lazy decodeUtf8" $ nf TL.decodeUtf8 lbs + , C.bench "lazy decodeLatin1" $ nf TL.decodeLatin1 lbs + , C.bench "lazy decodeASCII" $ nf TL.decodeASCII lbs + ] + iconv :: B.ByteString -> IO CInt iconv (PS fp off len) = withForeignPtr fp $ \ptr -> time_iconv (ptr `plusPtr` off) (fromIntegral len) diff --git a/cbits/cbits.c b/cbits/cbits.c index fd724d46..ca7a7aeb 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c @@ -16,7 +16,6 @@ #include "text_cbits.h" - void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff, size_t n) { @@ -88,6 +87,23 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src, while (p != srcend && (uintptr_t)p & 0x3) *dest++ = *p++; +#if defined(__x86_64__) + /* All the intrinsics used here are from SSE2, + * so every x86_64 CPU supports them. 
+ */ + const __m128i zeros = _mm_set1_epi32(0); + while (p < srcend - 7) { + /* Load 8 bytes of ASCII data */ + const __m128i ascii = _mm_cvtsi64_si128(*((const uint64_t *)p)); + /* Interleave with zeros */ + const __m128i utf16 = _mm_unpacklo_epi8(ascii, zeros); + /* Store the resulting 16 bytes into destination */ + _mm_storeu_si128((__m128i *)dest, utf16); + + dest += 8; + p += 8; + } +#else /* iterate over 32-bit aligned loads */ while (p < srcend - 3) { const uint32_t w = *((const uint32_t *)p); @@ -99,6 +115,7 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src, p += 4; } +#endif #endif /* handle unaligned suffix */ From 0ad08203d1a432ff0ff4b3c70f951dec31ae7bc9 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 7 Oct 2020 22:44:53 +0200 Subject: [PATCH 3/4] Use SSE2 in the x86_64 C version of encodeUtf8 --- benchmarks/haskell/Benchmarks.hs | 3 +- benchmarks/haskell/Benchmarks/EncodeUtf8.hs | 8 ++-- cbits/cbits.c | 41 ++++++++++++--------- 3 files changed, 30 insertions(+), 22 deletions(-) diff --git a/benchmarks/haskell/Benchmarks.hs b/benchmarks/haskell/Benchmarks.hs index 6c03f6f6..55707964 100644 --- a/benchmarks/haskell/Benchmarks.hs +++ b/benchmarks/haskell/Benchmarks.hs @@ -45,7 +45,8 @@ main = do , env (DecodeUtf8.initEnv (tf "russian.txt")) (DecodeUtf8.benchmark "russian") , env (DecodeUtf8.initEnv (tf "japanese.txt")) (DecodeUtf8.benchmark "japanese") , env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmarkASCII) - , EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯" + , EncodeUtf8.benchmark "non-ASCII" "επανάληψη 竺法蘭共譯" + , EncodeUtf8.benchmark "ASCII" "lorem ipsum" , env (Equality.initEnv (tf "japanese.txt")) Equality.benchmark , FileRead.benchmark (tf "russian.txt") , FoldLines.benchmark (tf "russian.txt") diff --git a/benchmarks/haskell/Benchmarks/EncodeUtf8.hs b/benchmarks/haskell/Benchmarks/EncodeUtf8.hs index 2faf339e..ff9a4d8e 100644 --- a/benchmarks/haskell/Benchmarks/EncodeUtf8.hs +++ b/benchmarks/haskell/Benchmarks/EncodeUtf8.hs 
@@ -18,11 +18,11 @@ import qualified Data.Text.Encoding as T import qualified Data.Text.Lazy as TL import qualified Data.Text.Lazy.Encoding as TL -benchmark :: String -> Benchmark -benchmark string = +benchmark :: String -> String -> Benchmark +benchmark name string = bgroup "EncodeUtf8" - [ bench "Text" $ whnf (B.length . T.encodeUtf8) text - , bench "LazyText" $ whnf (BL.length . TL.encodeUtf8) lazyText + [ bench ("Text (" ++ name ++ ")") $ whnf (B.length . T.encodeUtf8) text + , bench ("LazyText (" ++ name ++ ")") $ whnf (BL.length . TL.encodeUtf8) lazyText ] where -- The string in different formats diff --git a/cbits/cbits.c b/cbits/cbits.c index ca7a7aeb..f1bdc92c 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c @@ -276,29 +276,36 @@ _hs_text_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff, ascii: #if defined(__x86_64__) - while (srcend - src >= 4) { - uint64_t w = *((uint64_t *) src); + while (srcend - src >= 8) { + union { uint64_t halves[2]; __m128i whole; } eight_chars; + eight_chars.whole = _mm_loadu_si128((__m128i *) src); + const uint64_t w = eight_chars.halves[0]; if (w & 0xFF80FF80FF80FF80ULL) { if (!(w & 0x000000000000FF80ULL)) { - *dest++ = w & 0xFFFF; - src++; - if (!(w & 0x00000000FF800000ULL)) { - *dest++ = (w >> 16) & 0xFFFF; - src++; - if (!(w & 0x0000FF8000000000ULL)) { - *dest++ = (w >> 32) & 0xFFFF; - src++; - } - } + *dest++ = w & 0xFFFF; + src++; + if (!(w & 0x00000000FF800000ULL)) { + *dest++ = (w >> 16) & 0xFFFF; + src++; + if (!(w & 0x0000FF8000000000ULL)) { + *dest++ = (w >> 32) & 0xFFFF; + src++; + } + } } break; } - *dest++ = w & 0xFFFF; - *dest++ = (w >> 16) & 0xFFFF; - *dest++ = (w >> 32) & 0xFFFF; - *dest++ = w >> 48; - src += 4; + + if (eight_chars.halves[1] & 0xFF80FF80FF80FF80ULL) { + break; + } + + const __m128i eight_ascii_chars = _mm_packus_epi16(eight_chars.whole, eight_chars.whole); + _mm_storel_epi64((__m128i *)dest, eight_ascii_chars); + + dest += 8; + src += 8; } #endif From 
d94d2ef10ba0d834255ac535d9386655fefaccf6 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Tue, 20 Apr 2021 16:21:51 +0200 Subject: [PATCH 4/4] Update cbits/cbits.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Kubo Kováč <733205+kuk0@users.noreply.github.com> --- cbits/cbits.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cbits/cbits.c b/cbits/cbits.c index f1bdc92c..11b989e3 100644 --- a/cbits/cbits.c +++ b/cbits/cbits.c @@ -192,7 +192,7 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff, const __m128i eight_ascii_chars = _mm_cvtsi64_si128(hopefully_eight_ascii_chars); /* Interleave with zeros */ const __m128i eight_utf16_chars = _mm_unpacklo_epi8(eight_ascii_chars, zeros); - /* Store the resulting 8 bytes into destination */ + /* Store the resulting 16 bytes into destination */ _mm_storeu_si128((__m128i *)d, eight_utf16_chars); d += 8; }