From 5620537c4a6950c3fd7485a68680296060bbff8b Mon Sep 17 00:00:00 2001
From: Andrew Martin <andrew.thaddeus@gmail.com>
Date: Tue, 27 Dec 2022 10:10:54 -0500
Subject: [PATCH] Expose UTF-8 validation functions from internal module

This makes it possible for users to validate that a ByteArray is
a well formed UTF-8 sequence. This works both with and without the
SIMDUTF flag, falling back to a pure-Haskell implementation when
SIMDUTF is off.

Pick up a dependency on data-array-byte to be able to refer to
the lifted ByteArray type on older versions of GHC. Use data-array-byte
dep for emulated workflow.
---
 .github/workflows/emulated.yml          |   5 +-
 cbits/validate_utf8.cpp                 |   5 +
 src/Data/Text/Encoding.hs               |  42 +------
 src/Data/Text/Internal/PrimCompat.hs    |   8 +-
 src/Data/Text/Internal/Validate.hs      | 149 ++++++++++++++++++++++++
 src/Data/Text/Internal/Validate/Simd.hs |  43 +++++++
 text.cabal                              |   6 +
 7 files changed, 212 insertions(+), 46 deletions(-)
 create mode 100644 src/Data/Text/Internal/Validate.hs
 create mode 100644 src/Data/Text/Internal/Validate/Simd.hs

diff --git a/.github/workflows/emulated.yml b/.github/workflows/emulated.yml
index 29ba6253..f9d0386a 100644
--- a/.github/workflows/emulated.yml
+++ b/.github/workflows/emulated.yml
@@ -30,8 +30,9 @@ jobs:
         githubToken: ${{ github.token }}
         install: |
           apt-get update -y
-          apt-get install -y ghc libghc-tasty-quickcheck-dev libghc-tasty-hunit-dev
+          apt-get install -y curl ghc libghc-tasty-quickcheck-dev libghc-tasty-hunit-dev
         run: |
+          curl -s https://hackage.haskell.org/package/data-array-byte-0.1/data-array-byte-0.1.tar.gz | tar xz
           ghc --version
-          ghc --make -isrc:tests -o Main cbits/*.c tests/Tests.hs +RTS -s
+          ghc --make -isrc:tests:data-array-byte-0.1 -o Main cbits/*.c tests/Tests.hs +RTS -s
           ./Main +RTS -s
diff --git a/cbits/validate_utf8.cpp b/cbits/validate_utf8.cpp
index 73ab5f25..9fada135 100644
--- a/cbits/validate_utf8.cpp
+++ b/cbits/validate_utf8.cpp
@@ -4,3 +4,8 @@ extern "C"
 int _hs_text_is_valid_utf8(const char* str, size_t len){
   return simdutf::validate_utf8(str, len);
 }
+
+extern "C"
+int _hs_text_is_valid_utf8_offset(const char* str, size_t off, size_t len){
+  return simdutf::validate_utf8(str + off, len);
+}
diff --git a/src/Data/Text/Encoding.hs b/src/Data/Text/Encoding.hs
index 54f16722..cf53b43a 100644
--- a/src/Data/Text/Encoding.hs
+++ b/src/Data/Text/Encoding.hs
@@ -104,12 +104,7 @@ import Data.Text.Internal.ByteStringCompat
 import GHC.Stack (HasCallStack)
 #endif
 
-#ifdef SIMDUTF
-import Foreign.C.Types (CInt(..))
-#elif !MIN_VERSION_bytestring(0,11,2)
-import qualified Data.ByteString.Unsafe as B
-import Data.Text.Internal.Encoding.Utf8 (CodePoint(..))
-#endif
+import Data.Text.Internal.Validate (isValidUtf8ByteString)
 
 -- $strict
 --
@@ -187,32 +182,6 @@ decodeLatin1 bs = withBS bs $ \fp len -> runST $ do
 foreign import ccall unsafe "_hs_text_is_ascii" c_is_ascii
     :: Ptr Word8 -> Ptr Word8 -> IO CSize
 
-isValidBS :: ByteString -> Bool
-#ifdef SIMDUTF
-isValidBS bs = withBS bs $ \fp len -> unsafeDupablePerformIO $
-  unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8 ptr (fromIntegral len)
-#else
-#if MIN_VERSION_bytestring(0,11,2)
-isValidBS = B.isValidUtf8
-#else
-isValidBS bs = start 0
-  where
-    start ix
-      | ix >= B.length bs = True
-      | otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of
-        Accept{} -> start (ix + 1)
-        Reject{} -> False
-        Incomplete st _ -> step (ix + 1) st
-    step ix st
-      | ix >= B.length bs = False
-      -- We do not use decoded code point, so passing a dummy value to save an argument.
-      | otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of
-        Accept{} -> start (ix + 1)
-        Reject{} -> False
-        Incomplete st' _ -> step (ix + 1) st'
-#endif
-#endif
-
 -- | Decode a 'ByteString' containing UTF-8 encoded text.
 --
 -- Surrogate code points in replacement character returned by 'OnDecodeError'
@@ -223,7 +192,7 @@ decodeUtf8With ::
 #endif
   OnDecodeError -> ByteString -> Text
 decodeUtf8With onErr bs
-  | isValidBS bs =
+  | isValidUtf8ByteString bs =
     let !(SBS.SBS arr) = SBS.toShort bs in
       (Text (A.ByteArray arr) 0 (B.length bs))
   | B.null undecoded = txt
@@ -287,7 +256,7 @@ decodeUtf8With2 onErr bs1@(B.length -> len1) bs2@(B.length -> len2) = runST $ do
             , srcOff < len1 + guessUtf8Boundary
             , dstOff + (len1 + guessUtf8Boundary - srcOff) <= dstLen
             , bs <- B.drop (srcOff - len1) (B.take guessUtf8Boundary bs2)
-            , isValidBS bs = do
+            , isValidUtf8ByteString bs = do
               withBS bs $ \fp _ -> unsafeIOToST $ unsafeWithForeignPtr fp $ \src ->
                 unsafeSTToIO $ A.copyFromPointer dst dstOff src (len1 + guessUtf8Boundary - srcOff)
               inner (len1 + guessUtf8Boundary) (dstOff + (len1 + guessUtf8Boundary - srcOff))
@@ -615,8 +584,3 @@ encodeUtf32BE txt = E.unstream (E.restreamUtf32BE (F.stream txt))
 
 cSizeToInt :: CSize -> Int
 cSizeToInt = fromIntegral
-
-#ifdef SIMDUTF
-foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8
-    :: Ptr Word8 -> CSize -> IO CInt
-#endif
diff --git a/src/Data/Text/Internal/PrimCompat.hs b/src/Data/Text/Internal/PrimCompat.hs
index 2af87c0a..0153c288 100644
--- a/src/Data/Text/Internal/PrimCompat.hs
+++ b/src/Data/Text/Internal/PrimCompat.hs
@@ -13,13 +13,12 @@ module Data.Text.Internal.PrimCompat
   ) where
 
 #if MIN_VERSION_base(4,16,0)
-
-import GHC.Base
-
+import GHC.Exts (wordToWord8#,word8ToWord#,wordToWord16#,word16ToWord#,wordToWord32#,word32ToWord#)
 #else
-
 import GHC.Prim (Word#)
+#endif
 
+#if !(MIN_VERSION_base(4,16,0))
 wordToWord8#,  word8ToWord#  :: Word# -> Word#
 wordToWord16#, word16ToWord# :: Word# -> Word#
 wordToWord32#, word32ToWord# :: Word# -> Word#
@@ -33,5 +32,4 @@ wordToWord32# w = w
 {-# INLINE word16ToWord# #-}
 {-# INLINE wordToWord32# #-}
 {-# INLINE word32ToWord# #-}
-
 #endif
diff --git a/src/Data/Text/Internal/Validate.hs b/src/Data/Text/Internal/Validate.hs
new file mode 100644
index 00000000..6a31147e
--- /dev/null
+++ b/src/Data/Text/Internal/Validate.hs
@@ -0,0 +1,149 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnliftedFFITypes #-}
+
+-- | Test whether or not a sequence of bytes is a valid UTF-8 byte sequence.
+-- In the GHC Haskell ecosystem, there are several representations of byte
+-- sequences. The only one that the stable @text@ API concerns itself with is
+-- 'ByteString'. Part of bytestring-to-text decoding is 'isValidUtf8ByteString',
+-- a high-performance UTF-8 validation routine written in C++ with fallbacks
+-- for various platforms. The C++ code backing this routine is nontrivial,
+-- so in the interest of reuse, this module additionally exports functions
+-- for working with the GC-managed @ByteArray@ type. These @ByteArray@
+-- functions are not used anywhere else in @text@. They are for the benefit
+-- of library and application authors who do not use 'ByteString' but still
+-- need to interoperate with @text@.
+module Data.Text.Internal.Validate
+  (
+  -- * ByteString
+    isValidUtf8ByteString
+  -- * ByteArray
+  --
+  -- | Is the slice of a byte array a valid UTF-8 byte sequence? These
+  -- functions all accept an offset and a length.
+  , isValidUtf8ByteArray
+  , isValidUtf8ByteArrayUnpinned
+  , isValidUtf8ByteArrayPinned
+  ) where
+
+import Data.Array.Byte (ByteArray(ByteArray))
+import GHC.Exts (ByteArray#)
+import Data.ByteString (ByteString)
+import GHC.Exts (isTrue#,isByteArrayPinned#)
+
+#ifdef SIMDUTF
+import Data.Text.Unsafe (unsafeDupablePerformIO)
+import Data.Text.Internal.ByteStringCompat (withBS)
+import Data.Text.Internal.Unsafe (unsafeWithForeignPtr)
+import Data.Text.Internal.Validate.Simd (c_is_valid_utf8_bytearray_safe,c_is_valid_utf8_bytearray_unsafe,c_is_valid_utf8_ptr_unsafe)
+#else
+import Data.Text.Internal.Encoding.Utf8 (CodePoint(..),DecoderResult(..),utf8DecodeStart,utf8DecodeContinue)
+import GHC.Exts (Int(I#),indexWord8Array#)
+import GHC.Word (Word8(W8#))
+import qualified Data.ByteString as B
+#if !MIN_VERSION_bytestring(0,11,2)
+import qualified Data.ByteString.Unsafe as B
+#endif
+#endif
+
+-- | Is the ByteString a valid UTF-8 byte sequence?
+isValidUtf8ByteString :: ByteString -> Bool
+#ifdef SIMDUTF
+isValidUtf8ByteString bs = withBS bs $ \fp len -> unsafeDupablePerformIO $
+  unsafeWithForeignPtr fp $ \ptr -> (/= 0) <$> c_is_valid_utf8_ptr_unsafe ptr (fromIntegral len)
+#else
+#if MIN_VERSION_bytestring(0,11,2)
+isValidUtf8ByteString = B.isValidUtf8
+#else
+isValidUtf8ByteString bs = start 0
+  where
+    start ix
+      | ix >= B.length bs = True
+      | otherwise = case utf8DecodeStart (B.unsafeIndex bs ix) of
+        Accept{} -> start (ix + 1)
+        Reject{} -> False
+        Incomplete st _ -> step (ix + 1) st
+    step ix st
+      | ix >= B.length bs = False
+      -- We do not use decoded code point, so passing a dummy value to save an argument.
+      | otherwise = case utf8DecodeContinue (B.unsafeIndex bs ix) st (CodePoint 0) of
+        Accept{} -> start (ix + 1)
+        Reject{} -> False
+        Incomplete st' _ -> step (ix + 1) st'
+#endif
+#endif
+
+-- | For pinned byte arrays larger than 128KiB, this switches to the safe FFI
+-- so that it does not prevent GC. This threshold (128KiB) was chosen
+-- somewhat arbitrarily and may change in the future.
+isValidUtf8ByteArray ::
+     ByteArray -- ^ Bytes
+  -> Int -- ^ Offset
+  -> Int -- ^ Length
+  -> Bool
+isValidUtf8ByteArray b@(ByteArray b#) !off !len
+  | len >= 131072 -- 128KiB
+  , isTrue# (isByteArrayPinned# b#)
+  = isValidUtf8ByteArrayPinned b off len
+  | otherwise = isValidUtf8ByteArrayUnpinned b off len
+
+-- | This uses the @unsafe@ FFI. GC waits for all @unsafe@ FFI calls
+-- to complete before starting. Consequently, an @unsafe@ FFI call does not
+-- run concurrently with GC and is not interrupted by GC. Since relocation
+-- cannot happen concurrently with an @unsafe@ FFI call, it is safe
+-- to call this function with an unpinned byte array argument.
+-- It is also safe to call this with a pinned @ByteArray@ argument.
+isValidUtf8ByteArrayUnpinned ::
+     ByteArray -- ^ Bytes
+  -> Int -- ^ Offset
+  -> Int -- ^ Length
+  -> Bool
+#ifdef SIMDUTF
+isValidUtf8ByteArrayUnpinned (ByteArray bs) !off !len =
+  unsafeDupablePerformIO $ (/= 0) <$> c_is_valid_utf8_bytearray_unsafe bs (fromIntegral off) (fromIntegral len)
+#else
+isValidUtf8ByteArrayUnpinned (ByteArray bs) = isValidUtf8ByteArrayHaskell# bs
+#endif
+
+-- | This uses the @safe@ FFI. GC may run concurrently with @safe@
+-- FFI calls. Consequently, unpinned objects may be relocated while a
+-- @safe@ FFI call is executing. The byte array argument /must/ be pinned,
+-- and the calling context is responsible for enforcing this. If the
+-- byte array is not pinned, this function's behavior is undefined.
+isValidUtf8ByteArrayPinned ::
+     ByteArray -- ^ Bytes
+  -> Int -- ^ Offset
+  -> Int -- ^ Length
+  -> Bool
+#ifdef SIMDUTF
+isValidUtf8ByteArrayPinned (ByteArray bs) !off !len =
+  unsafeDupablePerformIO $ (/= 0) <$> c_is_valid_utf8_bytearray_safe bs (fromIntegral off) (fromIntegral len)
+#else
+isValidUtf8ByteArrayPinned (ByteArray bs) = isValidUtf8ByteArrayHaskell# bs
+#endif
+
+#ifndef SIMDUTF
+isValidUtf8ByteArrayHaskell# ::
+     ByteArray# -- ^ Bytes
+  -> Int -- ^ Offset
+  -> Int -- ^ Length
+  -> Bool
+isValidUtf8ByteArrayHaskell# b !off !len = start off
+  where
+    indexWord8 :: ByteArray# -> Int -> Word8
+    indexWord8 !x (I# i) = W8# (indexWord8Array# x i)
+    start ix
+      | ix >= len = True
+      | otherwise = case utf8DecodeStart (indexWord8 b ix) of
+        Accept{} -> start (ix + 1)
+        Reject{} -> False
+        Incomplete st _ -> step (ix + 1) st
+    step ix st
+      | ix >= len = False
+      -- We do not use decoded code point, so passing a dummy value to save an argument.
+      | otherwise = case utf8DecodeContinue (indexWord8 b ix) st (CodePoint 0) of
+        Accept{} -> start (ix + 1)
+        Reject{} -> False
+        Incomplete st' _ -> step (ix + 1) st'
+#endif
diff --git a/src/Data/Text/Internal/Validate/Simd.hs b/src/Data/Text/Internal/Validate/Simd.hs
new file mode 100644
index 00000000..e095049b
--- /dev/null
+++ b/src/Data/Text/Internal/Validate/Simd.hs
@@ -0,0 +1,43 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE CPP #-}
+{-# LANGUAGE MagicHash #-}
+{-# LANGUAGE UnliftedFFITypes #-}
+
+-- | Validate that a byte sequence is UTF-8-encoded text. All of these
+-- functions return zero when the byte sequence is not UTF-8-encoded text,
+-- and they return an unspecified non-zero value when the byte sequence
+-- is UTF-8-encoded text.
+--
+-- Variants are provided for both @ByteArray#@ and @Ptr@. Additionally,
+-- variants are provided that use both the @safe@ and @unsafe@ FFI.
+--
+-- If compiling with SIMDUTF turned off, this module exports nothing.
+module Data.Text.Internal.Validate.Simd
+  ( c_is_valid_utf8_ptr_unsafe
+  , c_is_valid_utf8_ptr_safe
+  , c_is_valid_utf8_bytearray_unsafe
+  , c_is_valid_utf8_bytearray_safe
+  ) where
+
+import Data.Word (Word8)
+import Foreign.C.Types (CSize(..),CInt(..))
+import GHC.Exts (Ptr,ByteArray#)
+
+foreign import ccall unsafe "_hs_text_is_valid_utf8" c_is_valid_utf8_ptr_unsafe
+    :: Ptr Word8 -- ^ Bytes
+    -> CSize -- ^ Length
+    -> IO CInt
+foreign import ccall safe "_hs_text_is_valid_utf8" c_is_valid_utf8_ptr_safe
+    :: Ptr Word8 -- ^ Bytes
+    -> CSize -- ^ Length
+    -> IO CInt
+foreign import ccall unsafe "_hs_text_is_valid_utf8_offset" c_is_valid_utf8_bytearray_unsafe
+    :: ByteArray# -- ^ Bytes
+    -> CSize -- ^ Offset into bytes
+    -> CSize -- ^ Length
+    -> IO CInt
+foreign import ccall safe "_hs_text_is_valid_utf8_offset" c_is_valid_utf8_bytearray_safe
+    :: ByteArray# -- ^ Bytes
+    -> CSize -- ^ Offset into bytes
+    -> CSize -- ^ Length
+    -> IO CInt
diff --git a/text.cabal b/text.cabal
index d3176579..904e237c 100644
--- a/text.cabal
+++ b/text.cabal
@@ -85,6 +85,7 @@ library
   hs-source-dirs: src
 
   if flag(simdutf)
+    exposed-modules: Data.Text.Internal.Validate.Simd
     include-dirs: simdutf
     cxx-sources: simdutf/simdutf.cpp
                  cbits/validate_utf8.cpp
@@ -136,6 +137,10 @@ library
   if flag(simdutf) && os(netbsd) && impl(ghc < 9.4)
     build-depends: base < 0
 
+  -- Before GHC 9.4, Data.Array.Byte is not in base
+  if impl(ghc < 9.4)
+    build-depends: data-array-byte >= 0.1.0.1
+
   exposed-modules:
     Data.Text
     Data.Text.Array
@@ -170,6 +175,7 @@ library
     Data.Text.Internal.Search
     Data.Text.Internal.Unsafe
     Data.Text.Internal.Unsafe.Char
+    Data.Text.Internal.Validate
     Data.Text.Lazy
     Data.Text.Lazy.Builder
     Data.Text.Lazy.Builder.Int