Expose UTF-8 validation functions from internal module

This makes it possible for users to validate that a ByteArray is a well formed UTF-8 sequence. This works both with and without the SIMDUTF flag, falling back to a pure-Haskell implementation when SIMDUTF is off. Pick up a dependency on data-array-byte to be able to refer to the lifted ByteArray type on older versions of GHC. Use data-array-byte dep for emulated workflow.
haskell · Apr 18, 2023 · 8af5f44 · 8af5f44
1 parent 3f26df8
commit 8af5f44
Show file tree

Hide file tree

Showing 5 changed files with 206 additions and 5 deletions.
diff --git a/cbits/validate_utf8.cpp b/cbits/validate_utf8.cpp
@@ -4,3 +4,8 @@ extern "C"
 int _hs_text_is_valid_utf8(const char* str, size_t len){
   return simdutf::validate_utf8(str, len);
 }
+
+extern "C"
+int _hs_text_is_valid_utf8_offset(const char* str, size_t off, size_t len){
+  return simdutf::validate_utf8(str + off, len);
+}
diff --git a/src/Data/Text/Internal/PrimCompat.hs b/src/Data/Text/Internal/PrimCompat.hs
@@ -13,13 +13,12 @@ module Data.Text.Internal.PrimCompat
   ) where
 
 #if MIN_VERSION_base(4,16,0)
-
-import GHC.Base
-
+import GHC.Exts (wordToWord8#,word8ToWord#,wordToWord16#,word16ToWord#,wordToWord32#,word32ToWord#)
 #else
-
 import GHC.Prim (Word#)
+#endif
 
+#if !(MIN_VERSION_base(4,16,0))
 wordToWord8#,  word8ToWord#  :: Word# -> Word#
 wordToWord16#, word16ToWord# :: Word# -> Word#
 wordToWord32#, word32ToWord# :: Word# -> Word#
@@ -33,5 +32,4 @@ wordToWord32# w = w
 {-# INLINE word16ToWord# #-}
 {-# INLINE wordToWord32# #-}
 {-# INLINE word32ToWord# #-}
-
 #endif
diff --git a/src/Data/Text/Internal/Validate.hs b/src/Data/Text/Internal/Validate.hs
@@ -0,0 +1,149 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnliftedFFITypes #-}
+
+-- | Test whether or not a sequence of bytes is a valid UTF-8 byte sequence.
+-- In the GHC Haskell ecosystem, there are several representations of byte
+-- sequences. The only one that the stable @text@ API concerns itself with is
+-- 'ByteString'. Part of bytestring-to-text decoding is 'isValidUtf8ByteString',
+-- a high-performance UTF-8 validation routine written in C++ with fallbacks
+-- for various platforms. The C++ code backing this routine is nontrivial,
+-- so in the interest of reuse, this module additionally exports functions
+-- for working with the GC-managed @ByteArray@ type. These @ByteArray@
+-- functions are not used anywhere else in @text@. They are for the benefit
+-- of library and application authors who do not use 'ByteString' but still
+-- need to interoperate with @text@.
+module Data.Text.Internal.Validate
+  (
+  -- * ByteString
+    isValidUtf8ByteString
+  -- * ByteArray
+  --
+  -- | Is the slice of a byte array a valid UTF-8 byte sequence? These
+  -- functions all accept an offset and a length.
+  , isValidUtf8ByteArray
+  , isValidUtf8ByteArrayUnpinned
+  , isValidUtf8ByteArrayPinned
+  ) where
+
+import Data.Array.Byte (ByteArray(ByteArray))
+import GHC.Exts (ByteArray#)
+import Data.ByteString (ByteString)
+import GHC.Exts (isTrue#,isByteArrayPinned#)
+
+#ifdef SIMDUTF
+import Data.Text.Unsafe (unsafeDupablePerformIO)
+import Data.Text.Internal.ByteStringCompat (withBS)
+import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
+import Data.Text.Internal.Validate.Simd (c_is_valid_utf8_bytearray_safe,c_is_valid_utf8_bytearray_unsafe,c_is_valid_utf8_ptr_unsafe)
+#else
+import Data.Text.Internal.Encoding.Utf8 (CodePoint(..),DecoderResult(..),utf8DecodeStart,utf8DecodeContinue)
+import GHC.Exts (Int(I#),indexWord8Array#)
+import GHC.Word (Word8(W8#))
+import qualified Data.ByteString as B
+#if !MIN_VERSION_bytestring(0,11,2)
+import qualified Data.ByteString.Unsafe as B
+#endif
+#endif
+
+-- | Is the ByteString a valid UTF-8 byte sequence?
+isValidUtf8ByteString :: ByteString -> Bool
+#ifdef SIMDUTF
+isValidUtf8ByteString bs = withBS bs $ \fp len -> unsafeDupablePerformIO $
+  unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8_ptr_unsafe ptr (fromIntegral len)
+#else
+#if MIN_VERSION_bytestring(0,11,2)
+isValidUtf8ByteString = B.isValidUtf8
+#else
+isValidUtf8ByteString bs = start 0
+  where
+    start ix
+      | ix >= B.length bs = True
+      | otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of
+        Accept{} -> start (ix + 1)
+        Reject{} -> False
+        Incomplete st _ -> step (ix + 1) st
+    step ix st
+      | ix >= B.length bs = False
+      -- We do not use decoded code point, so passing a dummy value to save an argument.
+      | otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of
+        Accept{} -> start (ix + 1)
+        Reject{} -> False
+        Incomplete st' _ -> step (ix + 1) st'
+#endif
+#endif
+
+-- | For pinned byte arrays larger than 128KiB, this switches to the safe FFI
+-- so that it does not prevent GC. This threshold (128KiB) was chosen
+-- somewhat arbitrarily and may change in the future.
+isValidUtf8ByteArray ::
+     ByteArray -- ^ Bytes
+  -> Int -- ^ Offset
+  -> Int -- ^ Length
+  -> Bool
+isValidUtf8ByteArray b@(ByteArray b#) !off !len
+  | len >= 131072 -- 128KiB
+  , isTrue# (isByteArrayPinned# b#)
+  = isValidUtf8ByteArrayPinned b off len
+  | otherwise = isValidUtf8ByteArrayUnpinned b off len
+
+-- | This uses the @unsafe@ FFI. GC waits for all @unsafe@ FFI calls
+-- to complete before starting. Consequently, an @unsafe@ FFI call does not
+-- run concurrently with GC and is not interrupted by GC. Since relocation
+-- cannot happen concurrently with an @unsafe@ FFI call, it is safe
+-- to call this function with an unpinned byte array argument.
+-- It is also safe to call this with a pinned @ByteArray@ argument.
+isValidUtf8ByteArrayUnpinned ::
+     ByteArray -- ^ Bytes
+  -> Int -- ^ Offset
+  -> Int -- ^ Length
+  -> Bool
+#ifdef SIMDUTF
+isValidUtf8ByteArrayUnpinned (ByteArray bs) !off !len =
+  unsafeDupablePerformIO $ (/= 0) <$> c_is_valid_utf8_bytearray_unsafe bs (fromIntegral off) (fromIntegral len)
+#else
+isValidUtf8ByteArrayUnpinned (ByteArray bs) = isValidUtf8ByteArrayHaskell# bs
+#endif
+
+-- | This uses the @safe@ FFI. GC may run concurrently with @safe@
+-- FFI calls. Consequently, unpinned objects may be relocated while a
+-- @safe@ FFI call is executing. The byte array argument /must/ be pinned,
+-- and the calling context is responsible for enforcing this. If the
+-- byte array is not pinned, this function's behavior is undefined.
+isValidUtf8ByteArrayPinned ::
+     ByteArray -- ^ Bytes
+  -> Int -- ^ Offset
+  -> Int -- ^ Length
+  -> Bool
+#ifdef SIMDUTF
+isValidUtf8ByteArrayPinned (ByteArray bs) !off !len =
+  unsafeDupablePerformIO $ (/= 0) <$> c_is_valid_utf8_bytearray_safe bs (fromIntegral off) (fromIntegral len)
+#else
+isValidUtf8ByteArrayPinned (ByteArray bs) = isValidUtf8ByteArrayHaskell# bs
+#endif
+
+#ifndef SIMDUTF
+isValidUtf8ByteArrayHaskell# ::
+     ByteArray# -- ^ Bytes
+  -> Int -- ^ Offset
+  -> Int -- ^ Length
+  -> Bool
+isValidUtf8ByteArrayHaskell# b !off !len = start off
+  where
+    indexWord8 :: ByteArray# -> Int -> Word8
+    indexWord8 !x (I# i) = W8# (indexWord8Array# x i)
+    start ix
+      | ix >= len = True
+      | otherwise = case utf8DecodeStart (indexWord8 b ix) of
+        Accept{} -> start (ix + 1)
+        Reject{} -> False
+        Incomplete st _ -> step (ix + 1) st
+    step ix st
+      | ix >= len = False
+      -- We do not use decoded code point, so passing a dummy value to save an argument.
+      | otherwise = case utf8DecodeContinue (indexWord8 b ix) st (CodePoint 0) of
+        Accept{} -> start (ix + 1)
+        Reject{} -> False
+        Incomplete st' _ -> step (ix + 1) st'
+#endif
diff --git a/src/Data/Text/Internal/Validate/Simd.hs b/src/Data/Text/Internal/Validate/Simd.hs
@@ -0,0 +1,43 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnliftedFFITypes #-}
+
+-- | Validate that a byte sequence is UTF-8-encoded text. All of these
+-- functions return zero when the byte sequence is not UTF-8-encoded text,
+-- and they return an unspecified non-zero value when the byte sequence
+-- is UTF-8-encoded text.
+--
+-- Variants are provided for both @ByteArray#@ and @Ptr@. Additionally,
+-- variants are provided that use both the @safe@ and @unsafe@ FFI.
+--
+-- If compiling with SIMDUTF turned off, this module exports nothing.
+module Data.Text.Internal.Validate.Simd
+  ( c_is_valid_utf8_ptr_unsafe
+  , c_is_valid_utf8_ptr_safe
+  , c_is_valid_utf8_bytearray_unsafe
+  , c_is_valid_utf8_bytearray_safe
+  ) where
+
+import Data.Word (Word8)
+import Foreign.C.Types (CSize(..),CInt(..))
+import GHC.Exts (Ptr,ByteArray#)
+
+foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8_ptr_unsafe
+    :: Ptr Word8 -- ^ Bytes
+    -> CSize -- ^ Length
+    -> IO CInt
+foreign import ccall safe "_hs_text_is_valid_utf8" c_is_valid_utf8_ptr_safe
+    :: Ptr Word8 -- ^ Bytes
+    -> CSize -- ^ Length
+    -> IO CInt
+foreign import ccall unsafe "_hs_text_is_valid_utf8_offset" c_is_valid_utf8_bytearray_unsafe
+    :: ByteArray# -- ^ Bytes
+    -> CSize -- ^ Offset into bytes
+    -> CSize -- ^ Length
+    -> IO CInt
+foreign import ccall safe "_hs_text_is_valid_utf8_offset" c_is_valid_utf8_bytearray_safe
+    :: ByteArray# -- ^ Bytes
+    -> CSize -- ^ Offset into bytes
+    -> CSize -- ^ Length
+    -> IO CInt
diff --git a/text.cabal b/text.cabal
@@ -85,6 +85,7 @@ library
   hs-source-dirs: src
 
   if flag(simdutf)
+    exposed-modules: Data.Text.Internal.Validate.Simd
     include-dirs: simdutf
     cxx-sources: simdutf/simdutf.cpp
                  cbits/validate_utf8.cpp
@@ -136,6 +137,10 @@ library
   if flag(simdutf) && os(netbsd) && impl(ghc < 9.4)
     build-depends: base < 0
 
+  -- Before GHC 9.4, Data.Array.Byte is not in base
+  if impl(ghc < 9.4)
+    build-depends: data-array-byte >= 0.1.0.1
+
   exposed-modules:
     Data.Text
     Data.Text.Array
@@ -173,6 +178,7 @@ library
     Data.Text.Internal.StrictBuilder
     Data.Text.Internal.Unsafe
     Data.Text.Internal.Unsafe.Char
+    Data.Text.Internal.Validate
     Data.Text.Lazy
     Data.Text.Lazy.Builder
     Data.Text.Lazy.Builder.Int