From 5620537c4a6950c3fd7485a68680296060bbff8b Mon Sep 17 00:00:00 2001 From: Andrew Martin Date: Tue, 27 Dec 2022 10:10:54 -0500 Subject: [PATCH] Expose UTF-8 validation functions from internal module This makes it possible for users to validate that a ByteArray is a well formed UTF-8 sequence. This works both with and without the SIMDUTF flag, falling back to a pure-Haskell implementation when SIMDUTF is off. Pick up a dependency on data-array-byte to be able to refer to the lifted ByteArray type on older versions of GHC. Use data-array-byte dep for emulated workflow. --- .github/workflows/emulated.yml | 5 +- cbits/validate_utf8.cpp | 5 + src/Data/Text/Encoding.hs | 42 +------ src/Data/Text/Internal/PrimCompat.hs | 8 +- src/Data/Text/Internal/Validate.hs | 149 ++++++++++++++++++++++++ src/Data/Text/Internal/Validate/Simd.hs | 43 +++++++ text.cabal | 6 + 7 files changed, 212 insertions(+), 46 deletions(-) create mode 100644 src/Data/Text/Internal/Validate.hs create mode 100644 src/Data/Text/Internal/Validate/Simd.hs diff --git a/.github/workflows/emulated.yml b/.github/workflows/emulated.yml index 29ba6253..f9d0386a 100644 --- a/.github/workflows/emulated.yml +++ b/.github/workflows/emulated.yml @@ -30,8 +30,9 @@ jobs: githubToken: ${{ github.token }} install: | apt-get update -y - apt-get install -y ghc libghc-tasty-quickcheck-dev libghc-tasty-hunit-dev + apt-get install -y curl ghc libghc-tasty-quickcheck-dev libghc-tasty-hunit-dev run: | + curl -s https://hackage.haskell.org/package/data-array-byte-0.1/data-array-byte-0.1.tar.gz | tar xz ghc --version - ghc --make -isrc:tests -o Main cbits/*.c tests/Tests.hs +RTS -s + ghc --make -isrc:tests:data-array-byte-0.1 -o Main cbits/*.c tests/Tests.hs +RTS -s ./Main +RTS -s diff --git a/cbits/validate_utf8.cpp b/cbits/validate_utf8.cpp index 73ab5f25..9fada135 100644 --- a/cbits/validate_utf8.cpp +++ b/cbits/validate_utf8.cpp @@ -4,3 +4,8 @@ extern "C" int _hs_text_is_valid_utf8(const char* str, size_t len){ return simdutf::validate_utf8(str, len); } + +extern "C" +int _hs_text_is_valid_utf8_offset(const char* str, size_t off, size_t len){ + return simdutf::validate_utf8(str + off, len); +} diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs index 54f16722..cf53b43a 100644 --- a/src/Data/Text/Encoding.hs +++ b/src/Data/Text/Encoding.hs @@ -104,12 +104,7 @@ import Data.Text.Internal.ByteStringCompat import GHC.Stack (HasCallStack) #endif -#ifdef SIMDUTF -import Foreign.C.Types (CInt(..)) -#elif !MIN_VERSION_bytestring(0,11,2) -import qualified Data.ByteString.Unsafe as B -import Data.Text.Internal.Encoding.Utf8 (CodePoint(..)) -#endif +import Data.Text.Internal.Validate (isValidUtf8ByteString) -- $strict -- @@ -187,32 +182,6 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii :: Ptr Word8 -> Ptr Word8 -> IO CSize -isValidBS :: ByteString -> Bool -#ifdef SIMDUTF -isValidBS bs = withBS bs $ \fp len -> unsafeDupablePerformIO $ - unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 ptr (fromIntegral len) -#else -#if MIN_VERSION_bytestring(0,11,2) -isValidBS = B.isValidUtf8 -#else -isValidBS bs = start 0 - where - start ix - | ix >= B.length bs = True - | otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of - Accept{} -> start (ix + 1) - Reject{} -> False - Incomplete st _ -> step (ix + 1) st - step ix st - | ix >= B.length bs = False - -- We do not use decoded code point, so passing a dummy value to save an argument. - | otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of - Accept{} -> start (ix + 1) - Reject{} -> False - Incomplete st' _ -> step (ix + 1) st' -#endif -#endif - -- | Decode a 'ByteString' containing UTF-8 encoded text. -- -- Surrogate code points in replacement character returned by 'OnDecodeError' @@ -223,7 +192,7 @@ decodeUtf8With :: #endif OnDecodeError -> ByteString -> Text decodeUtf8With onErr bs - | isValidBS bs = + | isValidUtf8ByteString bs = let !(SBS.SBS arr) = SBS.toShort bs in (Text (A.ByteArray arr) 0 (B.length bs)) | B.null undecoded = txt @@ -287,7 +256,7 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do , srcOff < len1 + guessUtf8Boundary , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2) - , isValidBS bs = do + , isValidUtf8ByteString bs = do withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src -> unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff) inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff)) @@ -615,8 +584,3 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt)) cSizeToInt :: CSize -> Int cSizeToInt = fromIntegral - -#ifdef SIMDUTF -foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8 - :: Ptr Word8 -> CSize -> IO CInt -#endif diff --git a/src/Data/Text/Internal/PrimCompat.hs b/src/Data/Text/Internal/PrimCompat.hs index 2af87c0a..0153c288 100644 --- a/src/Data/Text/Internal/PrimCompat.hs +++ b/src/Data/Text/Internal/PrimCompat.hs @@ -13,13 +13,12 @@ module Data.Text.Internal.PrimCompat ) where #if MIN_VERSION_base(4,16,0) - -import GHC.Base - +import GHC.Exts (wordToWord8#,word8ToWord#,wordToWord16#,word16ToWord#,wordToWord32#,word32ToWord#) #else - import GHC.Prim (Word#) +#endif +#if !(MIN_VERSION_base(4,16,0)) wordToWord8#, word8ToWord# :: Word# -> Word# wordToWord16#, word16ToWord# :: Word# -> Word# wordToWord32#, word32ToWord# :: Word# -> Word# @@ -33,5 +32,4 @@ wordToWord32# w = w {-# INLINE word16ToWord# #-} {-# INLINE wordToWord32# #-} {-# INLINE word32ToWord# #-} - #endif diff --git a/src/Data/Text/Internal/Validate.hs b/src/Data/Text/Internal/Validate.hs new file mode 100644 index 00000000..6a31147e --- /dev/null +++ b/src/Data/Text/Internal/Validate.hs @@ -0,0 +1,149 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnliftedFFITypes #-} + +-- | Test whether or not a sequence of bytes is a valid UTF-8 byte sequence. +-- In the GHC Haskell ecosystem, there are several representations of byte +-- sequences. The only one that the stable @text@ API concerns itself with is +-- 'ByteString'. Part of bytestring-to-text decoding is 'isValidUtf8ByteString', +-- a high-performance UTF-8 validation routine written in C++ with fallbacks +-- for various platforms. The C++ code backing this routine is nontrivial, +-- so in the interest of reuse, this module additionally exports functions +-- for working with the GC-managed @ByteArray@ type. These @ByteArray@ +-- functions are not used anywhere else in @text@. They are for the benefit +-- of library and application authors who do not use 'ByteString' but still +-- need to interoperate with @text@. +module Data.Text.Internal.Validate + ( + -- * ByteString + isValidUtf8ByteString + -- * ByteArray + -- + -- | Is the slice of a byte array a valid UTF-8 byte sequence? These + -- functions all accept an offset and a length. + , isValidUtf8ByteArray + , isValidUtf8ByteArrayUnpinned + , isValidUtf8ByteArrayPinned + ) where + +import Data.Array.Byte (ByteArray(ByteArray)) +import GHC.Exts (ByteArray#) +import Data.ByteString (ByteString) +import GHC.Exts (isTrue#,isByteArrayPinned#) + +#ifdef SIMDUTF +import Data.Text.Unsafe (unsafeDupablePerformIO) +import Data.Text.Internal.ByteStringCompat (withBS) +import Data.Text.Internal.Unsafe (unsafeWithForeignPtr) +import Data.Text.Internal.Validate.Simd (c_is_valid_utf8_bytearray_safe,c_is_valid_utf8_bytearray_unsafe,c_is_valid_utf8_ptr_unsafe) +#else +import Data.Text.Internal.Encoding.Utf8 (CodePoint(..),DecoderResult(..),utf8DecodeStart,utf8DecodeContinue) +import GHC.Exts (Int(I#),indexWord8Array#) +import GHC.Word (Word8(W8#)) +import qualified Data.ByteString as B +#if !MIN_VERSION_bytestring(0,11,2) +import qualified Data.ByteString.Unsafe as B +#endif +#endif + +-- | Is the ByteString a valid UTF-8 byte sequence? +isValidUtf8ByteString :: ByteString -> Bool +#ifdef SIMDUTF +isValidUtf8ByteString bs = withBS bs $ \fp len -> unsafeDupablePerformIO $ + unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8_ptr_unsafe ptr (fromIntegral len) +#else +#if MIN_VERSION_bytestring(0,11,2) +isValidUtf8ByteString = B.isValidUtf8 +#else +isValidUtf8ByteString bs = start 0 + where + start ix + | ix >= B.length bs = True + | otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of + Accept{} -> start (ix + 1) + Reject{} -> False + Incomplete st _ -> step (ix + 1) st + step ix st + | ix >= B.length bs = False + -- We do not use decoded code point, so passing a dummy value to save an argument. + | otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of + Accept{} -> start (ix + 1) + Reject{} -> False + Incomplete st' _ -> step (ix + 1) st' +#endif +#endif + +-- | For pinned byte arrays larger than 128KiB, this switches to the safe FFI +-- so that it does not prevent GC. This threshold (128KiB) was chosen +-- somewhat arbitrarily and may change in the future. +isValidUtf8ByteArray :: + ByteArray -- ^ Bytes + -> Int -- ^ Offset + -> Int -- ^ Length + -> Bool +isValidUtf8ByteArray b@(ByteArray b#) !off !len + | len >= 131072 -- 128KiB + , isTrue# (isByteArrayPinned# b#) + = isValidUtf8ByteArrayPinned b off len + | otherwise = isValidUtf8ByteArrayUnpinned b off len + +-- | This uses the @unsafe@ FFI. GC waits for all @unsafe@ FFI calls +-- to complete before starting. Consequently, an @unsafe@ FFI call does not +-- run concurrently with GC and is not interrupted by GC. Since relocation +-- cannot happen concurrently with an @unsafe@ FFI call, it is safe +-- to call this function with an unpinned byte array argument. +-- It is also safe to call this with a pinned @ByteArray@ argument. +isValidUtf8ByteArrayUnpinned :: + ByteArray -- ^ Bytes + -> Int -- ^ Offset + -> Int -- ^ Length + -> Bool +#ifdef SIMDUTF +isValidUtf8ByteArrayUnpinned (ByteArray bs) !off !len = + unsafeDupablePerformIO $ (/= 0) <$> c_is_valid_utf8_bytearray_unsafe bs (fromIntegral off) (fromIntegral len) +#else +isValidUtf8ByteArrayUnpinned (ByteArray bs) = isValidUtf8ByteArrayHaskell# bs +#endif + +-- | This uses the @safe@ FFI. GC may run concurrently with @safe@ +-- FFI calls. Consequently, unpinned objects may be relocated while a +-- @safe@ FFI call is executing. The byte array argument /must/ be pinned, +-- and the calling context is responsible for enforcing this. If the +-- byte array is not pinned, this function's behavior is undefined. +isValidUtf8ByteArrayPinned :: + ByteArray -- ^ Bytes + -> Int -- ^ Offset + -> Int -- ^ Length + -> Bool +#ifdef SIMDUTF +isValidUtf8ByteArrayPinned (ByteArray bs) !off !len = + unsafeDupablePerformIO $ (/= 0) <$> c_is_valid_utf8_bytearray_safe bs (fromIntegral off) (fromIntegral len) +#else +isValidUtf8ByteArrayPinned (ByteArray bs) = isValidUtf8ByteArrayHaskell# bs +#endif + +#ifndef SIMDUTF +isValidUtf8ByteArrayHaskell# :: + ByteArray# -- ^ Bytes + -> Int -- ^ Offset + -> Int -- ^ Length + -> Bool +isValidUtf8ByteArrayHaskell# b !off !len = start off + where + indexWord8 :: ByteArray# -> Int -> Word8 + indexWord8 !x (I# i) = W8# (indexWord8Array# x i) + start ix + | ix >= len = True + | otherwise = case utf8DecodeStart (indexWord8 b ix) of + Accept{} -> start (ix + 1) + Reject{} -> False + Incomplete st _ -> step (ix + 1) st + step ix st + | ix >= len = False + -- We do not use decoded code point, so passing a dummy value to save an argument. + | otherwise = case utf8DecodeContinue (indexWord8 b ix) st (CodePoint 0) of + Accept{} -> start (ix + 1) + Reject{} -> False + Incomplete st' _ -> step (ix + 1) st' +#endif diff --git a/src/Data/Text/Internal/Validate/Simd.hs b/src/Data/Text/Internal/Validate/Simd.hs new file mode 100644 index 00000000..e095049b --- /dev/null +++ b/src/Data/Text/Internal/Validate/Simd.hs @@ -0,0 +1,43 @@ +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE CPP #-} +{-# LANGUAGE MagicHash #-} +{-# LANGUAGE UnliftedFFITypes #-} + +-- | Validate that a byte sequence is UTF-8-encoded text. All of these +-- functions return zero when the byte sequence is not UTF-8-encoded text, +-- and they return an unspecified non-zero value when the byte sequence +-- is UTF-8-encoded text. +-- +-- Variants are provided for both @ByteArray#@ and @Ptr@. Additionally, +-- variants are provided that use both the @safe@ and @unsafe@ FFI. +-- +-- If compiling with SIMDUTF turned off, this module exports nothing. +module Data.Text.Internal.Validate.Simd + ( c_is_valid_utf8_ptr_unsafe + , c_is_valid_utf8_ptr_safe + , c_is_valid_utf8_bytearray_unsafe + , c_is_valid_utf8_bytearray_safe + ) where + +import Data.Word (Word8) +import Foreign.C.Types (CSize(..),CInt(..)) +import GHC.Exts (Ptr,ByteArray#) + +foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8_ptr_unsafe + :: Ptr Word8 -- ^ Bytes + -> CSize -- ^ Length + -> IO CInt +foreign import ccall safe "_hs_text_is_valid_utf8" c_is_valid_utf8_ptr_safe + :: Ptr Word8 -- ^ Bytes + -> CSize -- ^ Length + -> IO CInt +foreign import ccall unsafe "_hs_text_is_valid_utf8_offset" c_is_valid_utf8_bytearray_unsafe + :: ByteArray# -- ^ Bytes + -> CSize -- ^ Offset into bytes + -> CSize -- ^ Length + -> IO CInt +foreign import ccall safe "_hs_text_is_valid_utf8_offset" c_is_valid_utf8_bytearray_safe + :: ByteArray# -- ^ Bytes + -> CSize -- ^ Offset into bytes + -> CSize -- ^ Length + -> IO CInt diff --git a/text.cabal b/text.cabal index d3176579..904e237c 100644 --- a/text.cabal +++ b/text.cabal @@ -85,6 +85,7 @@ library hs-source-dirs: src if flag(simdutf) + exposed-modules: Data.Text.Internal.Validate.Simd include-dirs: simdutf cxx-sources: simdutf/simdutf.cpp cbits/validate_utf8.cpp @@ -136,6 +137,10 @@ library if flag(simdutf) && os(netbsd) && impl(ghc < 9.4) build-depends: base < 0 + -- Before GHC 9.4, Data.Array.Byte is not in base + if impl(ghc < 9.4) + build-depends: data-array-byte >= 0.1.0.1 + exposed-modules: Data.Text Data.Text.Array @@ -170,6 +175,7 @@ library Data.Text.Internal.Search Data.Text.Internal.Unsafe Data.Text.Internal.Unsafe.Char + Data.Text.Internal.Validate Data.Text.Lazy Data.Text.Lazy.Builder Data.Text.Lazy.Builder.Int