Reimplement decodeASCII and decodeLatin1 to share C code

haskell · Aug 30, 2021 · b1226ff · b1226ff
1 parent 28a8cd0
commit b1226ff
Show file tree

Hide file tree

Showing 6 changed files with 103 additions and 34 deletions.
diff --git a/cbits/cbits.c b/cbits/cbits.c
@@ -55,26 +55,6 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
   return *state = utf8d[256 + *state + type];
 }
 
-size_t
-_hs_text_decode_latin1(uint8_t *dest, const uint8_t *src,
-                       const uint8_t *srcend)
-{
-  const uint8_t *dest0 = dest;
-  const uint8_t *p = src;
-
-  while (p != srcend){
-    uint8_t codepoint = *p++;
-    if(codepoint < 0x80){
-      *dest++ = (uint8_t)codepoint;
-    } else {
-      *dest++ = (uint8_t) (0xC0 + (codepoint >> 6));
-      *dest++ = (uint8_t) (0x80 + (codepoint & 0x3F));
-    }
-  }
-
-  return (dest - dest0);
-}
-
 /*
  * A best-effort decoder. Runs until it hits either end of input or
  * the start of an invalid byte sequence.

diff --git a/cbits/is_ascii.c b/cbits/is_ascii.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Andrew Lelechenko <[email protected]>
+ */
+
+#include <string.h>
+#include <stdint.h>
+#include <sys/types.h>
+#ifdef __x86_64__
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif
+#include <stdbool.h>
+
+/*
+  _hs_text_is_ascii scans the bytes in [src0, srcend) and returns the length
+  of the longest prefix in which every byte is < 0x80 (i.e. 7-bit ASCII).
+  The result equals srcend - src0 when the whole range is ASCII.
+*/
+const size_t _hs_text_is_ascii(const uint8_t *src0, const uint8_t *srcend){
+  const uint8_t *src = src0;
+
+#ifdef __x86_64__
+  // I experimented with larger vector registers,
+  // but did not notice any measurable speed up, so let's keep it simple.
+  // Compare remaining lengths rather than computing srcend - 15, which
+  // would form an out-of-bounds pointer (UB) for inputs under 15 bytes.
+  while (srcend - src >= 16){
+    __m128i w128 = _mm_loadu_si128((const __m128i *)src);
+    // Collect the top bit of each byte: set bits mark bytes >= 128.
+    uint16_t mask = _mm_movemask_epi8(w128);
+    if (mask) break;
+    src+= 16;
+  }
+#endif
+
+  // Same test, 8 bytes at a time; memcpy reads the word without
+  // alignment or aliasing concerns.
+  while (srcend - src >= 8){
+    uint64_t w64;
+    memcpy(&w64, src, sizeof(uint64_t));
+    if (w64 & 0x8080808080808080ULL) break;
+    src+= 8;
+  }
+
+  // Finish byte by byte, which also pinpoints the first non-ASCII byte
+  // inside a chunk rejected above.
+  while (src < srcend){
+    uint8_t leadByte = *src;
+    if(leadByte >= 0x80) break;
+    src++;
+  }
+
+  return src - src0;
+}
diff --git a/src/Data/Text/Array.hs b/src/Data/Text/Array.hs
@@ -31,6 +31,7 @@ module Data.Text.Array
     , shrinkM
     , copyM
     , copyI
+    , copyP
     , empty
     , equal
     , compare
@@ -250,6 +251,22 @@ copyI count@(I# count#) (MutableByteArray dst#) dstOff@(I# dstOff#) (ByteArray s
       s2# -> (# s2#, () #)
 {-# INLINE copyI #-}
 
+-- | Copy @count@ bytes from raw memory at the source pointer into the
+-- mutable array, starting at the given destination offset. Nothing is
+-- bounds-checked here; ASSERTS builds only reject a negative count.
+copyP :: MArray s               -- ^ Destination
+      -> Int                    -- ^ Destination offset
+      -> Ptr Word8              -- ^ Source
+      -> Int                    -- ^ Count
+      -> ST s ()
+copyP (MutableByteArray marr#) (I# off#) (Ptr addr#) n@(I# n#)
+#if defined(ASSERTS)
+  | n < 0 = error $
+    "copyP: count must be >= 0, but got " ++ show n
+#endif
+  | otherwise = ST $ \st0# ->
+    case copyAddrToByteArray# addr# marr# off# n# st0# of
+      st1# -> (# st1#, () #)
+{-# INLINE copyP #-}
+
 -- | Compare portions of two arrays for equality.  No bounds checking
 -- is performed.
 equal :: Array -> Int -> Array -> Int -> Int -> Bool

diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
@@ -64,8 +64,10 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)
 
 import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall))
 import Control.Monad.ST (runST)
+import Data.Bits (shiftR, (.&.))
 import Data.ByteString as B
 import qualified Data.ByteString.Internal as B
+import qualified Data.ByteString.Short.Internal as SBS
 import Data.Foldable (traverse_)
 import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
 import Data.Text.Internal (Text(..), safe, text)
@@ -75,10 +77,10 @@ import Data.Text.Internal.Unsafe.Char (unsafeWrite)
 import Data.Text.Show ()
 import Data.Text.Unsafe (unsafeDupablePerformIO)
 import Data.Word (Word8, Word32)
-import Foreign.C.Types (CSize)
+import Foreign.C.Types (CSize(..))
 import Foreign.Marshal.Utils (with)
 import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
-import Foreign.Storable (Storable, peek, poke)
+import Foreign.Storable (Storable, peek, poke, peekByteOff)
 import GHC.Exts (MutableByteArray#, byteArrayContents#, unsafeCoerce#)
 import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
 import qualified Data.ByteString.Builder as B
@@ -112,7 +114,13 @@ import GHC.Stack (HasCallStack)
 -- | /Deprecated/.  Decode a 'ByteString' containing 7-bit ASCII
 -- encoded text.
 decodeASCII :: ByteString -> Text
-decodeASCII = decodeUtf8
+decodeASCII bs = withBS bs $ \fp len -> case len of
+  0 -> Text A.empty 0 0
+  _ -> runST $ do
+    -- Length of the leading run of bytes below 0x80, as reported by C.
+    asciiLen <- unsafeIOToST $ unsafeWithForeignPtr fp $ \start ->
+      fmap cSizeToInt $ c_is_ascii start (start `plusPtr` len)
+    if asciiLen /= len
+    -- Partial by design: any non-ASCII byte is reported via 'error'.
+    then error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiLen
+    -- The input is all ASCII here, so its bytes are taken over unchanged.
+    else case SBS.toShort bs of
+      SBS.SBS arr -> return (Text (A.ByteArray arr) 0 len)
 {-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
 
 -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
@@ -124,13 +132,29 @@ decodeLatin1 ::
   HasCallStack =>
 #endif
   ByteString -> Text
-decodeLatin1 bs = withBS bs aux where
-  aux fp len = text a 0 actualLen
-   where
-    (a, actualLen) = A.run2 (A.new (2 * len) >>= unsafeIOToST . go)
-    go (A.MutableByteArray dest) = unsafeWithForeignPtr fp $ \src -> do
-      destLen <- c_decode_latin1 dest src (src `plusPtr` len)
-      return (A.MutableByteArray dest, destLen)
+decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
+  -- A Latin-1 byte expands to at most two UTF-8 bytes, so 2 * len is enough.
+  marr <- A.new (2 * len)
+  let go i o | i >= len = return o
+             | otherwise = do
+        -- How many bytes starting at input offset i are plain ASCII?
+        run <- unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+          fmap cSizeToInt $ c_is_ascii (src `plusPtr` i) (src `plusPtr` len)
+        case run of
+          -- Byte i is >= 0x80: emit its two-byte UTF-8 encoding.
+          0 -> do
+            b <- unsafeIOToST $ unsafeWithForeignPtr fp (`peekByteOff` i)
+            A.unsafeWrite marr o (0xC0 + (b `shiftR` 6))
+            A.unsafeWrite marr (o + 1) (0x80 + (b .&. 0x3F))
+            go (i + 1) (o + 2)
+          -- An ASCII run is already valid UTF-8: copy it over verbatim.
+          _ -> do
+            unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
+              unsafeSTToIO $ A.copyP marr o (src `plusPtr` i) run
+            go (i + run) (o + run)
+  total <- go 0 0
+  -- Trim the buffer down to the bytes actually written before freezing.
+  frozen <- A.resizeM marr total >>= A.unsafeFreeze
+  return $ Text frozen 0 total
+
+-- | C helper defined in @cbits/is_ascii.c@: given start and end pointers,
+-- returns the length of the leading run of bytes below 0x80.
+foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
+    :: Ptr Word8 -> Ptr Word8 -> IO CSize
 
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 --
@@ -538,6 +562,3 @@ foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_stat
     :: MutableByteArray# s -> Ptr CSize
     -> Ptr (Ptr Word8) -> Ptr Word8
     -> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)
-
-foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
-    :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO Int
diff --git a/src/Data/Text/Lazy/Encoding.hs b/src/Data/Text/Lazy/Encoding.hs
@@ -1,5 +1,8 @@
 {-# LANGUAGE BangPatterns,CPP #-}
 {-# LANGUAGE Trustworthy #-}
+
+{-# OPTIONS_GHC -fno-warn-deprecations #-}
+
 -- |
 -- Module      : Data.Text.Lazy.Encoding
 -- Copyright   : (c) 2009, 2010 Bryan O'Sullivan
@@ -80,7 +83,7 @@ import Data.Text.Unsafe (unsafeDupablePerformIO)
 -- | /Deprecated/.  Decode a 'ByteString' containing 7-bit ASCII
 -- encoded text.
 decodeASCII :: B.ByteString -> Text
-decodeASCII = decodeUtf8
+decodeASCII lbs = foldr prepend empty (B.toChunks lbs)
+  where
+    -- Decode one strict chunk with the strict 'TE.decodeASCII' and put the
+    -- result in front of the lazily decoded remainder.
+    prepend strictChunk = chunk (TE.decodeASCII strictChunk)
 {-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}
 
 -- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.

diff --git a/text.cabal b/text.cabal
@@ -65,6 +65,7 @@ flag developer
 
 library
   c-sources:    cbits/cbits.c
+                cbits/is_ascii.c
                 cbits/measure_off.c
                 cbits/reverse.c
                 cbits/utils.c