SSE2 patches for encoding and decoding functions #302

Merged
merged 4 commits on Apr 25, 2021
4 changes: 3 additions & 1 deletion benchmarks/haskell/Benchmarks.hs
@@ -44,7 +44,9 @@ main = do
, env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmark "ascii")
, env (DecodeUtf8.initEnv (tf "russian.txt")) (DecodeUtf8.benchmark "russian")
, env (DecodeUtf8.initEnv (tf "japanese.txt")) (DecodeUtf8.benchmark "japanese")
, EncodeUtf8.benchmark "επανάληψη 竺法蘭共譯"
, env (DecodeUtf8.initEnv (tf "ascii.txt")) (DecodeUtf8.benchmarkASCII)
, EncodeUtf8.benchmark "non-ASCII" "επανάληψη 竺法蘭共譯"
, EncodeUtf8.benchmark "ASCII" "lorem ipsum"
, env (Equality.initEnv (tf "japanese.txt")) Equality.benchmark
, FileRead.benchmark (tf "russian.txt")
, FoldLines.benchmark (tf "russian.txt")
12 changes: 12 additions & 0 deletions benchmarks/haskell/Benchmarks/DecodeUtf8.hs
@@ -17,6 +17,7 @@
module Benchmarks.DecodeUtf8
( initEnv
, benchmark
, benchmarkASCII
) where

import Foreign.C.Types
@@ -62,6 +63,17 @@ benchmark kind ~(bs, lbs) =
, bench "LazyInitLength" $ nf (TL.length . TL.init . TL.decodeUtf8) lbs
]

benchmarkASCII :: Env -> Benchmark
benchmarkASCII ~(bs, lbs) =
bgroup "DecodeASCII"
[ C.bench "strict decodeUtf8" $ nf T.decodeUtf8 bs
, C.bench "strict decodeLatin1" $ nf T.decodeLatin1 bs
, C.bench "strict decodeASCII" $ nf T.decodeASCII bs
, C.bench "lazy decodeUtf8" $ nf TL.decodeUtf8 lbs
, C.bench "lazy decodeLatin1" $ nf TL.decodeLatin1 lbs
, C.bench "lazy decodeASCII" $ nf TL.decodeASCII lbs
]

iconv :: B.ByteString -> IO CInt
iconv (PS fp off len) = withForeignPtr fp $ \ptr ->
time_iconv (ptr `plusPtr` off) (fromIntegral len)
8 changes: 4 additions & 4 deletions benchmarks/haskell/Benchmarks/EncodeUtf8.hs
@@ -18,11 +18,11 @@ import qualified Data.Text.Encoding as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TL

benchmark :: String -> Benchmark
benchmark string =
benchmark :: String -> String -> Benchmark
benchmark name string =
bgroup "EncodeUtf8"
[ bench "Text" $ whnf (B.length . T.encodeUtf8) text
, bench "LazyText" $ whnf (BL.length . TL.encodeUtf8) lazyText
[ bench ("Text (" ++ name ++ ")") $ whnf (B.length . T.encodeUtf8) text
, bench ("LazyText (" ++ name ++ ")") $ whnf (BL.length . TL.encodeUtf8) lazyText
]
where
-- The string in different formats
110 changes: 78 additions & 32 deletions cbits/cbits.c
@@ -9,6 +9,11 @@
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#if defined(__x86_64__)
#include <emmintrin.h>
#include <xmmintrin.h>
#endif

#include "text_cbits.h"

void _hs_text_memcpy(void *dest, size_t doff, const void *src, size_t soff,
@@ -82,6 +87,23 @@ _hs_text_decode_latin1(uint16_t *dest, const uint8_t *src,
while (p != srcend && (uintptr_t)p & 0x3)
*dest++ = *p++;

#if defined(__x86_64__)
/* All the intrinsics used here are from SSE2,
* so every x86_64 CPU supports them.
*/
const __m128i zeros = _mm_set1_epi32(0);
while (p < srcend - 7) {
/* Load 8 bytes of Latin-1 data */
const __m128i ascii = _mm_cvtsi64_si128(*((const uint64_t *)p));
/* Interleave with zeros */
const __m128i utf16 = _mm_unpacklo_epi8(ascii, zeros);
/* Store the resulting 16 bytes into destination */
_mm_storeu_si128((__m128i *)dest, utf16);

dest += 8;
p += 8;
}
#else
/* iterate over 32-bit aligned loads */
while (p < srcend - 3) {
const uint32_t w = *((const uint32_t *)p);
@@ -93,6 +115,7 @@

p += 4;
}
#endif
#endif

/* handle unaligned suffix */
@@ -157,24 +180,40 @@ _hs_text_decode_utf8_int(uint16_t *const dest, size_t *destoff,
*/

if (state == UTF8_ACCEPT) {
#if defined(__x86_64__)
const __m128i zeros = _mm_set1_epi32(0);
while (s < srcend - 8) {
const uint64_t hopefully_eight_ascii_chars = *((uint64_t *) s);
if ((hopefully_eight_ascii_chars & 0x8080808080808080LL) != 0LL)
break;
s += 8;

/* Load 8 bytes of ASCII data */
const __m128i eight_ascii_chars = _mm_cvtsi64_si128(hopefully_eight_ascii_chars);
/* Interleave with zeros */
const __m128i eight_utf16_chars = _mm_unpacklo_epi8(eight_ascii_chars, zeros);
/* Store the resulting 16 bytes into destination */
_mm_storeu_si128((__m128i *)d, eight_utf16_chars);
d += 8;
}
#else
while (s < srcend - 4) {
codepoint = *((uint32_t *) s);
if ((codepoint & 0x80808080) != 0)
break;
s += 4;
/*
* Tried 32-bit stores here, but the extra bit-twiddling
* slowed the code down.
*/
*d++ = (uint16_t) (codepoint & 0xff);
*d++ = (uint16_t) ((codepoint >> 8) & 0xff);
*d++ = (uint16_t) ((codepoint >> 16) & 0xff);
*d++ = (uint16_t) ((codepoint >> 24) & 0xff);
}
#endif
last = s;
} /* end if (state == UTF8_ACCEPT) */
#endif

if (decode(&state, &codepoint, *s++) != UTF8_ACCEPT) {
@@ -237,29 +276,36 @@ _hs_text_encode_utf8(uint8_t **destp, const uint16_t *src, size_t srcoff,

ascii:
#if defined(__x86_64__)
while (srcend - src >= 4) {
uint64_t w = *((uint64_t *) src);
while (srcend - src >= 8) {
union { uint64_t halves[2]; __m128i whole; } eight_chars;
eight_chars.whole = _mm_loadu_si128((__m128i *) src);

const uint64_t w = eight_chars.halves[0];
if (w & 0xFF80FF80FF80FF80ULL) {
Contributor: Could we just break here, actually? Does it affect performance? I imagine we could compare a whole 128-bit register with a broadcast 0xFF80, and either break or _mm_packus_epi16 + _mm_storel_epi64.

Contributor Author: Yeah, I experimented with where exactly and after how many checks the break can go, and chose one of the best-performing combinations. Unfortunately I didn't record those combinations, and the results might differ on different CPUs.

Contributor: I mean, it looks asymmetric to check each pair of bytes in the first uint64 but not in the second one. I understand that doing the same for the second uint64 makes the code hairier, so maybe we can stop doing it for the first one as well? It would also let us avoid the union.

Contributor Author: I tried both symmetric variants and found both to be slower than this one.

My understanding is that

  1. if you do too many checks by looking at individual bytes, each iteration of the SIMD loop costs more;
  2. if you do too few checks (by not even looking at the bytes in the first half), the probability that a SIMD loop iteration does useful work drops, which also hurts overall performance.

This probability varies with the input data; the extreme cases are pure ASCII (the SIMD routine always works) and pure Chinese text (the SIMD routine never works). On our test data, inspecting the bytes in the first half was the sweet spot.
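For illustration, here is a minimal, hypothetical sketch of the fully symmetric variant discussed above: test all eight code units at once with SSE2 and, if every one is ASCII, pack with _mm_packus_epi16 and store with _mm_storel_epi64. The helper name and signature are invented for this example and are not part of the PR.

#include <emmintrin.h>  /* SSE2 intrinsics */
#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper: emit UTF-16 code units as bytes, eight at a time,
 * for as long as every unit in the current block is ASCII (< 0x80).
 * Returns the number of units consumed; the caller handles the rest. */
static size_t encode_ascii_prefix_sse2(uint8_t *dest, const uint16_t *src, size_t len)
{
  size_t i = 0;
  const __m128i non_ascii = _mm_set1_epi16((int16_t)0xFF80);
  while (i + 8 <= len) {
    const __m128i chars = _mm_loadu_si128((const __m128i *)(src + i));
    /* A unit is ASCII iff (unit & 0xFF80) == 0; test all eight lanes at once. */
    const __m128i hi = _mm_and_si128(chars, non_ascii);
    const __m128i ok = _mm_cmpeq_epi16(hi, _mm_setzero_si128());
    if (_mm_movemask_epi8(ok) != 0xFFFF)
      break;  /* some unit is >= 0x80: leave the rest to the scalar path */
    /* Narrow the eight 16-bit lanes to bytes and store the low 8 bytes. */
    const __m128i packed = _mm_packus_epi16(chars, chars);
    _mm_storel_epi64((__m128i *)(dest + i), packed);
    i += 8;
  }
  return i;
}

Whether this beats the asymmetric version in the PR depends, as the author notes, on how often an eight-unit block is entirely ASCII in the benchmark data.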

/* One of the first four units is non-ASCII: flush its leading ASCII
 * prefix one unit at a time, then fall back to the scalar path. */
if (!(w & 0x000000000000FF80ULL)) {
*dest++ = w & 0xFFFF;
src++;
if (!(w & 0x00000000FF800000ULL)) {
*dest++ = (w >> 16) & 0xFFFF;
src++;
if (!(w & 0x0000FF8000000000ULL)) {
*dest++ = (w >> 32) & 0xFFFF;
src++;
}
}
}
break;
}
*dest++ = w & 0xFFFF;
*dest++ = (w >> 16) & 0xFFFF;
*dest++ = (w >> 32) & 0xFFFF;
*dest++ = w >> 48;
src += 4;

if (eight_chars.halves[1] & 0xFF80FF80FF80FF80ULL) {
break;
Contributor: Here you can still pack and store the whole 0'th half using the pext instruction:
https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_pext_u64&expand=4330

Contributor Author (@ethercrow, Apr 20, 2021): This PR intentionally uses only SSE2 to be compatible with any 64-bit x86 CPU. Quite a few models don't support pext, or their implementation is not fast. From Wikipedia:

"AMD processors before Zen 3 that implement PDEP and PEXT do so in microcode, with a latency of 18 cycles rather than a single cycle. As a result, if the mask is known, it is often faster to use other instructions on AMD."
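For illustration only, a sketch of the pext-based packing the reviewer suggests. It relies on _pext_u64 from BMI2 rather than SSE2, which is exactly the dependency the PR avoids; the helper name is hypothetical.

#include <immintrin.h>  /* _pext_u64 requires BMI2 (e.g. compile with -mbmi2) */
#include <stdint.h>
#include <string.h>

/* Hypothetical helper: when control reaches the halves[1] check, the first
 * four UTF-16 units (halves[0]) are already known to be ASCII, so they could
 * be squeezed into four bytes with a single PEXT and stored before breaking
 * out of the SIMD loop. */
static inline uint8_t *flush_ascii_half_pext(uint8_t *dest, uint64_t half)
{
  /* Keep only the low byte of each 16-bit unit, compacted into the low 32 bits. */
  const uint32_t packed = (uint32_t)_pext_u64(half, 0x00FF00FF00FF00FFULL);
  memcpy(dest, &packed, sizeof packed);
  return dest + 4;
}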

}

/* All eight units are ASCII: narrow them to bytes and store 8 bytes at once */
const __m128i eight_ascii_chars = _mm_packus_epi16(eight_chars.whole, eight_chars.whole);
_mm_storel_epi64((__m128i *)dest, eight_ascii_chars);

dest += 8;
src += 8;
}
#endif
