From 77775c51ac485635f53ad39d8292361683efbeba Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <ethercrow@gmail.com>
Date: Sat, 3 Oct 2020 14:58:49 +0200
Subject: [PATCH 1/4] Use SSE2 in the x86_64 C version of decodeUtf8

---
 cbits/cbits.c | 52 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/cbits/cbits.c b/cbits/cbits.c
index 46357011..fd724d46 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -9,8 +9,14 @@
 #include <string.h>
 #include <stdint.h>
 #include <stdio.h>
+#if defined(__x86_64__)
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif
+
 #include "text_cbits.h"
 
+
 void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff,
 		     size_t n)
 {
@@ -157,24 +163,40 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
      */
 
     if (state == UTF8_ACCEPT) {
+#if defined(__x86_64__)
+      const __m128i zeros = _mm_set1_epi32(0);
+      while (s < srcend - 8) {
+        const uint64_t hopefully_eight_ascii_chars = *((uint64_t *) s);
+        if ((hopefully_eight_ascii_chars & 0x8080808080808080LL) != 0LL)
+          break;
+        s += 8;
+
+        /* Load 8 bytes of ASCII data */
+        const __m128i eight_ascii_chars = _mm_cvtsi64_si128(hopefully_eight_ascii_chars);
+        /* Interleave with zeros */
+        const __m128i eight_utf16_chars = _mm_unpacklo_epi8(eight_ascii_chars, zeros);
+        /* Store the resulting 8 bytes into destination */
+        _mm_storeu_si128((__m128i *)d, eight_utf16_chars);
+        d += 8;
+      }
+#else  
       while (s < srcend - 4) {
-	codepoint = *((uint32_t *) s);
-	if ((codepoint & 0x80808080) != 0)
-	  break;
-	s += 4;
-
-	/*
-	 * Tried 32-bit stores here, but the extra bit-twiddling
-	 * slowed the code down.
-	 */
-
-	*d++ = (uint16_t) (codepoint & 0xff);
-	*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
-	*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
-	*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
+        codepoint = *((uint32_t *) s);
+        if ((codepoint & 0x80808080) != 0)
+          break;
+        s += 4;
+        /*
+         * Tried 32-bit stores here, but the extra bit-twiddling
+         * slowed the code down.
+         */
+        *d++ = (uint16_t) (codepoint & 0xff);
+        *d++ = (uint16_t) ((codepoint >> 8) & 0xff);
+        *d++ = (uint16_t) ((codepoint >> 16) & 0xff);
+        *d++ = (uint16_t) ((codepoint >> 24) & 0xff);
       }
+#endif
       last = s;
-    }
+    } /* end if (state == UTF8_ACCEPT) */
 #endif
 
     if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {

From eaab373151b2d2e009c34089d1d37bbb70ac61dd Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <ethercrow@gmail.com>
Date: Mon, 21 Sep 2020 23:31:30 +0200
Subject: [PATCH 2/4] Use SSE2 in the x86_64 C version of decodeLatin1

---
 benchmarks/haskell/Benchmarks.hs            |  1 +
 benchmarks/haskell/Benchmarks/DecodeUtf8.hs | 12 ++++++++++++
 cbits/cbits.c                               | 19 ++++++++++++++++++-
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/benchmarks/haskell/Benchmarks.hs b/benchmarks/haskell/Benchmarks.hs
index 91b429db..6c03f6f6 100644
--- a/benchmarks/haskell/Benchmarks.hs
+++ b/benchmarks/haskell/Benchmarks.hs
@@ -44,6 +44,7 @@ main = do
         , env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmark "ascii")
         , env (DecodeUtf8.initEnv (tf "russian.txt")) (DecodeUtf8.benchmark  "russian")
         , env (DecodeUtf8.initEnv (tf "japanese.txt")) (DecodeUtf8.benchmark "japanese")
+        , env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmarkASCII)
         , EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯"
         , env (Equality.initEnv (tf "japanese.txt")) Equality.benchmark
         , FileRead.benchmark (tf "russian.txt")
diff --git a/benchmarks/haskell/Benchmarks/DecodeUtf8.hs b/benchmarks/haskell/Benchmarks/DecodeUtf8.hs
index e7a0d7a7..22418c6c 100644
--- a/benchmarks/haskell/Benchmarks/DecodeUtf8.hs
+++ b/benchmarks/haskell/Benchmarks/DecodeUtf8.hs
@@ -17,6 +17,7 @@
 module Benchmarks.DecodeUtf8
     ( initEnv
     , benchmark
+    , benchmarkASCII
     ) where
 
 import Foreign.C.Types
@@ -62,6 +63,17 @@ benchmark kind ~(bs, lbs) =
         , bench "LazyInitLength" $ nf (TL.length . TL.init . TL.decodeUtf8) lbs
         ]
 
+benchmarkASCII :: Env -> Benchmark
+benchmarkASCII ~(bs, lbs) =
+    bgroup "DecodeASCII"
+        [ C.bench "strict decodeUtf8" $ nf T.decodeUtf8 bs
+        , C.bench "strict decodeLatin1" $ nf T.decodeLatin1 bs
+        , C.bench "strict decodeASCII" $ nf T.decodeASCII bs
+        , C.bench "lazy decodeUtf8" $ nf TL.decodeUtf8 lbs
+        , C.bench "lazy decodeLatin1" $ nf TL.decodeLatin1 lbs
+        , C.bench "lazy decodeASCII" $ nf TL.decodeASCII lbs
+        ]
+
 iconv :: B.ByteString -> IO CInt
 iconv (PS fp off len) = withForeignPtr fp $ \ptr ->
                         time_iconv (ptr `plusPtr` off) (fromIntegral len)
diff --git a/cbits/cbits.c b/cbits/cbits.c
index fd724d46..ca7a7aeb 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -16,7 +16,6 @@
 
 #include "text_cbits.h"
 
-
 void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff,
 		     size_t n)
 {
@@ -88,6 +87,23 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
   while (p != srcend && (uintptr_t)p & 0x3)
     *dest++ = *p++;
 
+#if defined(__x86_64__)
+  /* All the intrinsics used here are from SSE2,
+   * so every x86_64 CPU supports them.
+   */
+  const __m128i zeros = _mm_set1_epi32(0);
+  while (p < srcend - 7) {
+    /* Load 8 bytes of Latin-1 data */
+    const __m128i ascii = _mm_cvtsi64_si128(*((const uint64_t *)p));
+    /* Interleave with zeros */
+    const __m128i utf16 = _mm_unpacklo_epi8(ascii, zeros);
+    /* Store the resulting 16 bytes into destination */
+    _mm_storeu_si128((__m128i *)dest, utf16);
+
+    dest += 8;
+    p += 8;
+  }
+#else
   /* iterate over 32-bit aligned loads */
   while (p < srcend - 3) {
     const uint32_t w = *((const uint32_t *)p);
@@ -99,6 +115,7 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
 
     p += 4;
   }
+#endif
 #endif
 
   /* handle unaligned suffix */

From 0ad08203d1a432ff0ff4b3c70f951dec31ae7bc9 Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <ethercrow@gmail.com>
Date: Wed, 7 Oct 2020 22:44:53 +0200
Subject: [PATCH 3/4] Use SSE2 in the x86_64 C version of encodeUtf8

---
 benchmarks/haskell/Benchmarks.hs            |  3 +-
 benchmarks/haskell/Benchmarks/EncodeUtf8.hs |  8 ++--
 cbits/cbits.c                               | 41 ++++++++++++---------
 3 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/benchmarks/haskell/Benchmarks.hs b/benchmarks/haskell/Benchmarks.hs
index 6c03f6f6..55707964 100644
--- a/benchmarks/haskell/Benchmarks.hs
+++ b/benchmarks/haskell/Benchmarks.hs
@@ -45,7 +45,8 @@ main = do
         , env (DecodeUtf8.initEnv (tf "russian.txt")) (DecodeUtf8.benchmark  "russian")
         , env (DecodeUtf8.initEnv (tf "japanese.txt")) (DecodeUtf8.benchmark "japanese")
         , env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmarkASCII)
-        , EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯"
+        , EncodeUtf8.benchmark "non-ASCII" "επανάληψη 竺法蘭共譯"
+        , EncodeUtf8.benchmark "ASCII" "lorem ipsum"
         , env (Equality.initEnv (tf "japanese.txt")) Equality.benchmark
         , FileRead.benchmark (tf "russian.txt")
         , FoldLines.benchmark (tf "russian.txt")
diff --git a/benchmarks/haskell/Benchmarks/EncodeUtf8.hs b/benchmarks/haskell/Benchmarks/EncodeUtf8.hs
index 2faf339e..ff9a4d8e 100644
--- a/benchmarks/haskell/Benchmarks/EncodeUtf8.hs
+++ b/benchmarks/haskell/Benchmarks/EncodeUtf8.hs
@@ -18,11 +18,11 @@ import qualified Data.Text.Encoding as T
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as TL
 
-benchmark :: String -> Benchmark
-benchmark string =
+benchmark :: String -> String -> Benchmark
+benchmark name string =
     bgroup "EncodeUtf8"
-        [ bench "Text"     $ whnf (B.length . T.encodeUtf8)   text
-        , bench "LazyText" $ whnf (BL.length . TL.encodeUtf8) lazyText
+        [ bench ("Text (" ++ name ++ ")")     $ whnf (B.length . T.encodeUtf8)   text
+        , bench ("LazyText (" ++ name ++ ")") $ whnf (BL.length . TL.encodeUtf8) lazyText
         ]
   where
     -- The string in different formats
diff --git a/cbits/cbits.c b/cbits/cbits.c
index ca7a7aeb..f1bdc92c 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -276,29 +276,36 @@ _hs_text_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff,
 
  ascii:
 #if defined(__x86_64__)
-  while (srcend - src >= 4) {
-    uint64_t w = *((uint64_t *) src);
+  while (srcend - src >= 8) {
+    union { uint64_t halves[2]; __m128i whole; } eight_chars;
+    eight_chars.whole = _mm_loadu_si128((__m128i *) src);
 
+    const uint64_t w = eight_chars.halves[0];
     if (w & 0xFF80FF80FF80FF80ULL) {
       if (!(w & 0x000000000000FF80ULL)) {
-	*dest++ = w & 0xFFFF;
-	src++;
-	if (!(w & 0x00000000FF800000ULL)) {
-	  *dest++ = (w >> 16) & 0xFFFF;
-	  src++;
-	  if (!(w & 0x0000FF8000000000ULL)) {
-	    *dest++ = (w >> 32) & 0xFFFF;
-	    src++;
-	  }
-	}
+        *dest++ = w & 0xFFFF;
+        src++;
+        if (!(w & 0x00000000FF800000ULL)) {
+          *dest++ = (w >> 16) & 0xFFFF;
+          src++;
+          if (!(w & 0x0000FF8000000000ULL)) {
+            *dest++ = (w >> 32) & 0xFFFF;
+            src++;
+          }
+        }
       }
       break;
     }
-    *dest++ = w & 0xFFFF;
-    *dest++ = (w >> 16) & 0xFFFF;
-    *dest++ = (w >> 32) & 0xFFFF;
-    *dest++ = w >> 48;
-    src += 4;
+
+    if (eight_chars.halves[1] & 0xFF80FF80FF80FF80ULL) {
+      break;
+    }
+
+    const __m128i eight_ascii_chars = _mm_packus_epi16(eight_chars.whole, eight_chars.whole);
+    _mm_storel_epi64((__m128i *)dest, eight_ascii_chars);
+
+    dest += 8;
+    src += 8;
   }
 #endif
 

From d94d2ef10ba0d834255ac535d9386655fefaccf6 Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <ethercrow@gmail.com>
Date: Tue, 20 Apr 2021 16:21:51 +0200
Subject: [PATCH 4/4] Fix byte count in decodeUtf8 SSE2 store comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Kubo Kováč <733205+kuk0@users.noreply.github.com>
---
 cbits/cbits.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cbits/cbits.c b/cbits/cbits.c
index f1bdc92c..11b989e3 100644
--- a/cbits/cbits.c
+++ b/cbits/cbits.c
@@ -192,7 +192,7 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
         const __m128i eight_ascii_chars = _mm_cvtsi64_si128(hopefully_eight_ascii_chars);
         /* Interleave with zeros */
         const __m128i eight_utf16_chars = _mm_unpacklo_epi8(eight_ascii_chars, zeros);
-        /* Store the resulting 8 bytes into destination */
+        /* Store the resulting 16 bytes into destination */
         _mm_storeu_si128((__m128i *)d, eight_utf16_chars);
         d += 8;
       }