Skip to content

Commit

Permalink
Reimplement decodeASCII and decodeLatin1 to share C code
Browse files Browse the repository at this point in the history
  • Loading branch information
Bodigrim committed Aug 30, 2021
1 parent 28a8cd0 commit b1226ff
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 34 deletions.
20 changes: 0 additions & 20 deletions cbits/cbits.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,26 +55,6 @@ decode(uint32_t *state, uint32_t* codep, uint32_t byte) {
return *state = utf8d[256 + *state + type];
}

/*
 * Transcode the Latin-1 bytes in [src, srcend) to UTF-8, writing the
 * result to dest. Bytes below 0x80 are copied unchanged; every other
 * byte is expanded to a two-byte sequence. Returns the number of bytes
 * written. No bounds checking is done on dest, so the caller must
 * provide room for up to two output bytes per input byte.
 */
size_t
_hs_text_decode_latin1(uint8_t *dest, const uint8_t *src,
                       const uint8_t *srcend)
{
  uint8_t *out = dest;
  const uint8_t *in;

  for (in = src; in != srcend; in++) {
    const uint8_t byte = *in;
    if (byte >= 0x80) {
      /* Lead byte carries the top two bits, continuation byte the low six. */
      out[0] = (uint8_t) (0xC0 | (byte >> 6));
      out[1] = (uint8_t) (0x80 | (byte & 0x3F));
      out += 2;
    } else {
      *out++ = byte;
    }
  }

  return (size_t) (out - dest);
}

/*
* A best-effort decoder. Runs until it hits either end of input or
* the start of an invalid byte sequence.
Expand Down
47 changes: 47 additions & 0 deletions cbits/is_ascii.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright (c) 2021 Andrew Lelechenko <[email protected]>
*/

#include <string.h>
#include <stdint.h>
#include <sys/types.h>
#ifdef __x86_64__
#include <emmintrin.h>
#include <xmmintrin.h>
#endif
#include <stdbool.h>

/*
  _hs_text_is_ascii takes a UTF-8 encoded buffer [src0, srcend),
  and returns the length of the ASCII-compatible prefix, i.e. the
  number of leading bytes that are all < 0x80.
*/
const size_t _hs_text_is_ascii(const uint8_t *src0, const uint8_t *srcend){
  const uint8_t *src = src0;

  // The wide loops below test the number of remaining bytes instead of
  // comparing against `srcend - 15` / `srcend - 7`: for a buffer shorter
  // than the stride those pointers would lie before the start of the
  // buffer, and forming them is undefined behaviour in C.

#ifdef __x86_64__
  // I experimented with larger vector registers,
  // but did not notice any measurable speed up, so let's keep it simple.
  while (srcend - src >= 16){
    __m128i w128 = _mm_loadu_si128((const __m128i *)src);
    // Gather the top bit of every byte: a set bit marks a byte >= 128.
    uint16_t mask = _mm_movemask_epi8(w128);
    if (mask) break;
    src += 16;
  }
#endif

  // Check 8 bytes at a time; stop at the first word containing a byte
  // with its top bit set and let the byte loop below pinpoint it.
  while (srcend - src >= 8){
    uint64_t w64;
    // memcpy performs the load without requiring src to be aligned.
    memcpy(&w64, src, sizeof(uint64_t));
    if (w64 & 0x8080808080808080ULL) break;
    src += 8;
  }

  // Byte-wise tail: finishes short buffers and locates the exact
  // offending byte after one of the wide loops bailed out.
  while (src < srcend){
    if (*src >= 0x80) break;
    src++;
  }

  return src - src0;
}
17 changes: 17 additions & 0 deletions src/Data/Text/Array.hs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ module Data.Text.Array
, shrinkM
, copyM
, copyI
, copyP
, empty
, equal
, compare
Expand Down Expand Up @@ -250,6 +251,22 @@ copyI count@(I# count#) (MutableByteArray dst#) dstOff@(I# dstOff#) (ByteArray s
s2# -> (# s2#, () #)
{-# INLINE copyI #-}

-- | Copy a range of memory addressed by a raw pointer into a mutable
-- array. The only check performed is the @count >= 0@ assertion, and
-- only when the module is compiled with @ASSERTS@ defined.
copyP :: MArray s               -- ^ Destination
      -> Int                    -- ^ Destination offset
      -> Ptr Word8              -- ^ Source
      -> Int                    -- ^ Count
      -> ST s ()
copyP (MutableByteArray dst#) dstOff@(I# dstOff#) (Ptr src#) count@(I# count#)
#if defined(ASSERTS)
  | count < 0 = error $
    "copyP: count must be >= 0, but got " ++ show count
#endif
  | otherwise = ST copyBytes
  where
    -- Thread the state token through the copying primop.
    copyBytes st0# =
      case copyAddrToByteArray# src# dst# dstOff# count# st0# of
        st1# -> (# st1#, () #)
{-# INLINE copyP #-}

-- | Compare portions of two arrays for equality. No bounds checking
-- is performed.
equal :: Array -> Int -> Array -> Int -> Int -> Bool
Expand Down
47 changes: 34 additions & 13 deletions src/Data/Text/Encoding.hs
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,10 @@ import Control.Monad.ST.Unsafe (unsafeIOToST, unsafeSTToIO)

import Control.Exception (evaluate, try, throwIO, ErrorCall(ErrorCall))
import Control.Monad.ST (runST)
import Data.Bits (shiftR, (.&.))
import Data.ByteString as B
import qualified Data.ByteString.Internal as B
import qualified Data.ByteString.Short.Internal as SBS
import Data.Foldable (traverse_)
import Data.Text.Encoding.Error (OnDecodeError, UnicodeException, strictDecode, lenientDecode)
import Data.Text.Internal (Text(..), safe, text)
Expand All @@ -75,10 +77,10 @@ import Data.Text.Internal.Unsafe.Char (unsafeWrite)
import Data.Text.Show ()
import Data.Text.Unsafe (unsafeDupablePerformIO)
import Data.Word (Word8, Word32)
import Foreign.C.Types (CSize)
import Foreign.C.Types (CSize(..))
import Foreign.Marshal.Utils (with)
import Foreign.Ptr (Ptr, minusPtr, nullPtr, plusPtr)
import Foreign.Storable (Storable, peek, poke)
import Foreign.Storable (Storable, peek, poke, peekByteOff)
import GHC.Exts (MutableByteArray#, byteArrayContents#, unsafeCoerce#)
import GHC.ForeignPtr (ForeignPtr(..), ForeignPtrContents(PlainPtr))
import qualified Data.ByteString.Builder as B
Expand Down Expand Up @@ -112,7 +114,13 @@ import GHC.Stack (HasCallStack)
-- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
--
-- Calls 'error' if the input contains a byte that is not 7-bit ASCII;
-- the message reports the offset of the first such byte.
decodeASCII :: ByteString -> Text
decodeASCII bs = withBS bs $ \fp len -> if len == 0 then Text A.empty 0 0 else runST $ do
  -- Length of the longest prefix made up solely of bytes below 0x80.
  asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
    c_is_ascii src (src `plusPtr` len)
  if asciiPrefixLen == len
    -- Pure ASCII is already valid UTF-8, so the bytes are reused verbatim
    -- via the byte array produced by 'SBS.toShort'.
    then let !(SBS.SBS arr) = SBS.toShort bs in
      return (Text (A.ByteArray arr) 0 len)
    else error $ "decodeASCII: detected non-ASCII codepoint at " ++ show asciiPrefixLen
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}

-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
Expand All @@ -124,13 +132,29 @@ decodeLatin1 ::
HasCallStack =>
#endif
ByteString -> Text
-- Implementation note: runs of ASCII bytes (located by 'c_is_ascii') are
-- copied verbatim, and every other byte is expanded to a two-byte UTF-8
-- sequence.
decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
  -- Worst case: every input byte is non-ASCII and doubles in size.
  dst <- A.new (2 * len)
  -- Transcode the input from srcOff onwards, writing at dstOff, and
  -- return the destination offset reached at the end of the input.
  let inner srcOff dstOff = if srcOff >= len then return dstOff else do
        asciiPrefixLen <- fmap cSizeToInt $ unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
          c_is_ascii (src `plusPtr` srcOff) (src `plusPtr` len)
        if asciiPrefixLen == 0
          then do
            -- The byte at srcOff is >= 0x80: emit a lead byte holding its
            -- top two bits and a continuation byte holding the low six.
            byte <- unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> peekByteOff src srcOff
            A.unsafeWrite dst dstOff (0xC0 + (byte `shiftR` 6))
            A.unsafeWrite dst (dstOff + 1) (0x80 + (byte .&. 0x3F))
            inner (srcOff + 1) (dstOff + 2)
          else do
            -- An ASCII run is identical in Latin-1 and UTF-8: bulk copy it.
            unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
              unsafeSTToIO $ A.copyP dst dstOff (src `plusPtr` srcOff) asciiPrefixLen
            inner (srcOff + asciiPrefixLen) (dstOff + asciiPrefixLen)

  actualLen <- inner 0 0
  -- Resize the over-allocated buffer to the length actually written.
  dst' <- A.resizeM dst actualLen
  arr <- A.unsafeFreeze dst'
  return $ Text arr 0 actualLen

-- | Binding to the C routine @_hs_text_is_ascii@. It is given the start
-- and end pointers of a buffer and returns the length of the buffer's
-- ASCII prefix.
foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
    :: Ptr Word8 -> Ptr Word8 -> IO CSize

-- | Decode a 'ByteString' containing UTF-8 encoded text.
--
Expand Down Expand Up @@ -538,6 +562,3 @@ foreign import ccall unsafe "_hs_text_decode_utf8_state" c_decode_utf8_with_stat
:: MutableByteArray# s -> Ptr CSize
-> Ptr (Ptr Word8) -> Ptr Word8
-> Ptr CodePoint -> Ptr DecoderState -> IO (Ptr Word8)

-- | Binding to the C routine @_hs_text_decode_latin1@: destination
-- buffer, start of the source, end of the source.
-- NOTE(review): the C definition returns @size_t@ while this import
-- declares the result as 'Int' -- confirm the two always share a
-- representation on supported platforms, or import the result as 'CSize'.
foreign import ccall unsafe "_hs_text_decode_latin1" c_decode_latin1
    :: MutableByteArray# s -> Ptr Word8 -> Ptr Word8 -> IO Int
5 changes: 4 additions & 1 deletion src/Data/Text/Lazy/Encoding.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
{-# LANGUAGE BangPatterns,CPP #-}
{-# LANGUAGE Trustworthy #-}

{-# OPTIONS_GHC -fno-warn-deprecations #-}

-- |
-- Module : Data.Text.Lazy.Encoding
-- Copyright : (c) 2009, 2010 Bryan O'Sullivan
Expand Down Expand Up @@ -80,7 +83,7 @@ import Data.Text.Unsafe (unsafeDupablePerformIO)
-- | /Deprecated/. Decode a 'ByteString' containing 7-bit ASCII
-- encoded text.
--
-- Every strict chunk of the input is decoded on its own with the strict
-- 'TE.decodeASCII', and the results are assembled with 'chunk'.
decodeASCII :: B.ByteString -> Text
decodeASCII = foldr (chunk . TE.decodeASCII) empty . B.toChunks
{-# DEPRECATED decodeASCII "Use decodeUtf8 instead" #-}

-- | Decode a 'ByteString' containing Latin-1 (aka ISO-8859-1) encoded text.
Expand Down
1 change: 1 addition & 0 deletions text.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ flag developer

library
c-sources: cbits/cbits.c
cbits/is_ascii.c
cbits/measure_off.c
cbits/reverse.c
cbits/utils.c
Expand Down

0 comments on commit b1226ff

Please sign in to comment.