Avoid per-byte loop in cstring{,Utf8} builders

Copy chunks of the input to the output buffer with 'memcpy', up to the shorter of the available buffer space and the "null-free" portion of the remaining string. For the UTF8 version, encoded NUL bytes are located via strstr(3).
haskell · Feb 9, 2023 · 0645428 · 0645428
1 parent 0bd68ca
commit 0645428
Show file tree

Hide file tree

Showing 8 changed files with 164 additions and 74 deletions.
diff --git a/Data/ByteString/Builder.hs b/Data/ByteString/Builder.hs
@@ -263,12 +263,14 @@ import qualified Data.ByteString.Builder.Prim  as P
 import qualified Data.ByteString.Lazy.Internal as L
 import           Data.ByteString.Builder.ASCII
 import           Data.ByteString.Builder.RealFloat
+import           Data.ByteString.Internal (byteCountLiteral)
 
 import           Data.String (IsString(..))
 import           System.IO (Handle, IOMode(..), withBinaryFile)
 import           Foreign
 import           GHC.Base (unpackCString#, unpackCStringUtf8#,
                            unpackFoldrCString#, build)
+import           GHC.Ptr (Ptr(..))
 
 -- | Execute a 'Builder' and return the generated chunks as a lazy 'L.ByteString'.
 -- The work is performed lazy, i.e., only when a chunk of the lazy 'L.ByteString'
@@ -440,18 +442,20 @@ char8 :: Char -> Builder
 char8 = P.primFixed P.char8
 
 -- | Char8 encode a 'String'.
-{-# INLINE [1] string8 #-} -- phased to allow P.cstring rewrite
+{-# NOINLINE string8 #-}
 string8 :: String -> Builder
 string8 = P.primMapListFixed P.char8
 
 -- GHC desugars string literals with unpackCString# which the simplifier tends
 -- to promptly turn into build (unpackFoldrCString# s), so we match on both.
 {-# RULES
 "string8/unpackCString#" forall s.
-  string8 (unpackCString# s) = P.cstring s
+  string8 (unpackCString# s) =
+    ascLiteralCopy (Ptr s) (byteCountLiteral s)
 
 "string8/unpackFoldrCString#" forall s.
-  string8 (build (unpackFoldrCString# s)) = P.cstring s
+  string8 (build (unpackFoldrCString# s)) =
+    ascLiteralCopy (Ptr s) (byteCountLiteral s)
  #-}
 
 ------------------------------------------------------------------------------
@@ -467,19 +471,22 @@ charUtf8 = P.primBounded P.charUtf8
 --
 -- Note that 'stringUtf8' performs no codepoint validation and consequently may
 -- emit invalid UTF-8 if asked (e.g. single surrogates).
-{-# INLINE [1] stringUtf8 #-} -- phased to allow P.cstring rewrite
+{-# NOINLINE stringUtf8 #-}
 stringUtf8 :: String -> Builder
 stringUtf8 = P.primMapListBounded P.charUtf8
 
 {-# RULES
 "stringUtf8/unpackCStringUtf8#" forall s.
-  stringUtf8 (unpackCStringUtf8# s) = P.cstringUtf8 s
+  stringUtf8 (unpackCStringUtf8# s) =
+    modUtf8LitCopy (Ptr s) (byteCountLiteral s)
 
 "stringUtf8/unpackCString#" forall s.
-  stringUtf8 (unpackCString# s) = P.cstring s
+  stringUtf8 (unpackCString# s) =
+    ascLiteralCopy (Ptr s) (byteCountLiteral s)
 
 "stringUtf8/unpackFoldrCString#" forall s.
-  stringUtf8 (build (unpackFoldrCString# s)) = P.cstring s
+  stringUtf8 (build (unpackFoldrCString# s)) =
+    ascLiteralCopy (Ptr s) (byteCountLiteral s)
  #-}
 
 instance IsString Builder where

diff --git a/Data/ByteString/Builder/Internal.hs b/Data/ByteString/Builder/Internal.hs
@@ -1,5 +1,5 @@
 {-# LANGUAGE ScopedTypeVariables, CPP, BangPatterns, RankNTypes, TupleSections #-}
-{-# LANGUAGE Unsafe #-}
+{-# LANGUAGE MagicHash, ViewPatterns, Unsafe #-}
 {-# OPTIONS_HADDOCK not-home #-}
 -- | Copyright : (c) 2010 - 2011 Simon Meier
 -- License     : BSD3-style (see LICENSE)
@@ -84,6 +84,8 @@ module Data.ByteString.Builder.Internal (
   -- , sizedChunksInsert
 
   , byteStringCopy
+  , ascLiteralCopy
+  , modUtf8LitCopy
   , byteStringInsert
   , byteStringThreshold
 
@@ -127,6 +129,7 @@ module Data.ByteString.Builder.Internal (
 ) where
 
 import           Control.Arrow (second)
+import           Control.Monad (when)
 
 #if !(MIN_VERSION_base(4,11,0))
 import           Data.Semigroup (Semigroup((<>)))
@@ -140,10 +143,12 @@ import qualified Data.ByteString.Short.Internal as Sh
 import qualified GHC.IO.Buffer as IO (Buffer(..), newByteBuffer)
 import           GHC.IO.Handle.Internals (wantWritableHandle, flushWriteBuffer)
 import           GHC.IO.Handle.Types (Handle__, haByteBuffer, haBufferMode)
+import           GHC.Ptr (Ptr(..))
 import           System.IO (hFlush, BufferMode(..), Handle)
 import           Data.IORef
 
 import           Foreign
+import           Foreign.C.String (CString)
 import           Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
 import           System.IO.Unsafe (unsafeDupablePerformIO)
 
@@ -857,6 +862,75 @@ byteStringInsert :: S.ByteString -> Builder
 byteStringInsert =
     \bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k
 
+
+------------------------------------------------------------------------------
+-- Raw CString encoding
+------------------------------------------------------------------------------
+
+-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
+-- strings that are free of embedded (overlong-encoded as the two-byte sequence
+-- @0xC0 0x80@) null characters.
+--
+-- @since 0.11.5.0
+{-# INLINABLE ascLiteralCopy #-}
+ascLiteralCopy :: Ptr Word8 -> Int -> Builder
+ascLiteralCopy = \ !ip !len -> builder $ \k br -> do
+    let !ipe = ip `plusPtr` len
+    wrappedBytesCopyStep (BufferRange ip ipe) k br
+
+-- | GHC represents @NUL@ in string literals via an overlong 2-byte encoding,
+-- which is part of "modified UTF-8" (GHC does not also implement CESU-8).
+modifiedUtf8NUL :: CString
+modifiedUtf8NUL = Ptr "\xc0\x80"#
+
+-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
+-- encoded strings that may contain embedded overlong-encodings (as the
+-- two-byte sequence @0xC0 0x80@) of null characters.
+--
+-- @since 0.11.5.0
+{-# INLINABLE modUtf8LitCopy #-}
+modUtf8LitCopy :: Ptr Word8 -> Int -> Builder
+modUtf8LitCopy = \ !ip !len -> builder $ \k br -> do
+    nullAt <- c_strstr (castPtr ip) modifiedUtf8NUL
+    modUtf8_step ip len nullAt k br
+
+modUtf8_step :: Ptr Word8 -> Int -> Ptr Word8 -> BuildStep r -> BuildStep r
+modUtf8_step !ip !len ((== nullPtr) -> True) k br =
+    -- Contains no encoded nulls, use simple copy codepath
+    wrappedBytesCopyStep (BufferRange ip ipe) k br
+  where
+    !ipe = ip `plusPtr` len
+modUtf8_step !ip !len !nullAt k (BufferRange op0 ope)
+    -- Copy as much of the null-free portion of the string as fits into the
+    -- available buffer space. If the string is long enough, we may have asked
+    -- for less than its full length, filling the buffer with the rest will go
+    -- into the next builder step.
+    | avail > nullFree = do
+        when (nullFree > 0) (copyBytes op0 ip nullFree)
+        pokeElemOff op0 nullFree 0
+        let used = nullFree + 2
+            len' = len - used
+            !ip' = ip `plusPtr` used
+            !op' = op0 `plusPtr` (nullFree + 1)
+        nullAt' <- c_strstr ip' modifiedUtf8NUL
+        modUtf8_step ip' len' nullAt' k (BufferRange op' ope)
+    | avail > 0 = do
+        -- avail <= nullFree
+        copyBytes op0 ip avail
+        let len' = len - avail
+            !ip' = ip `plusPtr` avail
+            !op' = op0 `plusPtr` avail
+        return $ bufferFull 1 op' (modUtf8_step ip' len' nullAt k)
+    | otherwise =
+        return $ bufferFull 1 op0 (modUtf8_step ip len nullAt k)
+  where
+    !avail = ope `minusPtr` op0
+    !nullFree = nullAt `minusPtr` ip
+
+foreign import ccall unsafe "string.h strstr" c_strstr
+    :: CString -> CString -> IO (Ptr Word8)
+
+
 -- Short bytestrings
 ------------------------------------------------------------------------------
 

diff --git a/Data/ByteString/Builder/Prim.hs b/Data/ByteString/Builder/Prim.hs
@@ -663,59 +663,25 @@ primMapLazyByteStringBounded w =
     L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty
 
 
-------------------------------------------------------------------------------
--- Raw CString encoding
-------------------------------------------------------------------------------
-
--- | A null-terminated ASCII encoded 'Foreign.C.String.CString'.
--- Null characters are not representable.
+-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
+-- strings that are free of embedded (overlong-encoded as the two-byte sequence
+-- @0xC0 0x80@) null characters.
 --
 -- @since 0.11.0.0
+{-# DEPRECATED cstring "Use ascLiteralCopy instead" #-}
 cstring :: Addr# -> Builder
-cstring =
-    \addr0 -> builder $ step addr0
-  where
-    step :: Addr# -> BuildStep r -> BuildStep r
-    step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
-      | W8# ch == 0 = k br
-      | op0 == ope =
-          return $ bufferFull 1 op0 (step addr k)
-      | otherwise = do
-          IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
-                       s' -> (# s', () #)
-          let br' = BufferRange (op0 `plusPtr` 1) ope
-          step (addr `plusAddr#` 1#) k br'
-      where
-        !ch = indexWord8OffAddr# addr 0#
+cstring s = ascLiteralCopy (Ptr s) (S.byteCountLiteral s)
+{-# INLINE cstring #-}
 
--- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
--- Null characters can be encoded as @0xc0 0x80@.
+-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
+-- encoded strings that may contain embedded overlong-encodings (as the
+-- two-byte sequence @0xC0 0x80@) of null characters.
 --
 -- @since 0.11.0.0
+{-# DEPRECATED cstringUtf8 "Use modUtf8LitCopy instead" #-}
 cstringUtf8 :: Addr# -> Builder
-cstringUtf8 =
-    \addr0 -> builder $ step addr0
-  where
-    step :: Addr# -> BuildStep r -> BuildStep r
-    step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
-      | W8# ch == 0 = k br
-      | op0 == ope =
-          return $ bufferFull 1 op0 (step addr k)
-        -- NULL is encoded as 0xc0 0x80
-      | W8# ch == 0xc0
-      , W8# (indexWord8OffAddr# addr 1#) == 0x80 = do
-          let !(W8# nullByte#) = 0
-          IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of
-                       s' -> (# s', () #)
-          let br' = BufferRange (op0 `plusPtr` 1) ope
-          step (addr `plusAddr#` 2#) k br'
-      | otherwise = do
-          IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
-                       s' -> (# s', () #)
-          let br' = BufferRange (op0 `plusPtr` 1) ope
-          step (addr `plusAddr#` 1#) k br'
-      where
-        !ch = indexWord8OffAddr# addr 0#
+cstringUtf8 s = modUtf8LitCopy (Ptr s) (S.byteCountLiteral s)
+{-# INLINE cstringUtf8 #-}
 
 ------------------------------------------------------------------------------
 -- Char8 encoding

diff --git a/Data/ByteString/Internal.hs b/Data/ByteString/Internal.hs
@@ -36,7 +36,7 @@ module Data.ByteString.Internal (
         unpackBytes, unpackAppendBytesLazy, unpackAppendBytesStrict,
         unpackChars, unpackAppendCharsLazy, unpackAppendCharsStrict,
         unsafePackAddress, unsafePackLenAddress,
-        unsafePackLiteral, unsafePackLenLiteral,
+        unsafePackLiteral, unsafePackLenLiteral, byteCountLiteral,
 
         -- * Low level imperative construction
         empty,

diff --git a/Data/ByteString/Internal/Type.hs b/Data/ByteString/Internal/Type.hs
@@ -39,7 +39,7 @@ module Data.ByteString.Internal.Type (
         unpackBytes, unpackAppendBytesLazy, unpackAppendBytesStrict,
         unpackChars, unpackAppendCharsLazy, unpackAppendCharsStrict,
         unsafePackAddress, unsafePackLenAddress,
-        unsafePackLiteral, unsafePackLenLiteral,
+        unsafePackLiteral, unsafePackLenLiteral, byteCountLiteral,
 
         -- * Low level imperative construction
         empty,
@@ -434,13 +434,22 @@ unsafePackLenAddress len addr# = do
 -- @since 0.11.1.0
 unsafePackLiteral :: Addr# -> ByteString
 unsafePackLiteral addr# =
+  unsafePackLenLiteral (byteCountLiteral addr#) addr#
+{-# INLINE unsafePackLiteral #-}
+
+-- | Byte count of null-terminated primitive literal string excluding the
+-- terminating null byte.
+byteCountLiteral :: Addr# -> Int
+byteCountLiteral addr# =
 #if __GLASGOW_HASKELL__ >= 811
-  unsafePackLenLiteral (I# (cstringLength# addr#)) addr#
+  I# (cstringLength# addr#)
 #else
-  let len = accursedUnutterablePerformIO (c_strlen (Ptr addr#))
-   in unsafePackLenLiteral (fromIntegral len) addr#
+  fromIntegral (pure_strlen (Ptr addr#))
+
+foreign import ccall unsafe "string.h strlen" pure_strlen
+    :: CString -> CSize
 #endif
-{-# INLINE unsafePackLiteral #-}
+{-# INLINE byteCountLiteral #-}
 
 
 -- | See 'unsafePackLiteral'. This function is similar,

diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs
@@ -25,6 +25,7 @@ import qualified Data.ByteString                       as S
 import qualified Data.ByteString.Char8                 as S8
 import qualified Data.ByteString.Lazy                  as L
 import qualified Data.ByteString.Lazy.Char8            as L8
+import           Data.ByteString.Internal              (byteCountLiteral)
 
 import           Data.ByteString.Builder
 import           Data.ByteString.Builder.Extra         (byteStringCopy,
@@ -33,10 +34,13 @@ import           Data.ByteString.Builder.Extra         (byteStringCopy,
 import           Data.ByteString.Builder.Internal      (ensureFree)
 import           Data.ByteString.Builder.Prim          (BoundedPrim, FixedPrim,
                                                         (>$<))
+import qualified Data.ByteString.Builder.Internal      as BI
 import qualified Data.ByteString.Builder.Prim          as P
 import qualified Data.ByteString.Builder.Prim.Internal as PI
 
 import           Foreign
+import           GHC.Exts (Addr#)
+import           GHC.Ptr (Ptr(..))
 
 import System.Random
 
@@ -247,6 +251,18 @@ largeTraversalInput = S.concat (replicate 10 byteStringData)
 smallTraversalInput :: S.ByteString
 smallTraversalInput = S8.pack "The quick brown fox"
 
+ascBuf, utfBuf :: Ptr Word8
+ascBuf = Ptr "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#
+utfBuf = Ptr "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xc0\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#
+
+asclit, utflit :: Ptr Word8 -> Builder
+asclit str@(Ptr addr) = BI.ascLiteralCopy str (byteCountLiteral addr)
+utflit str@(Ptr addr) = BI.modUtf8LitCopy str (byteCountLiteral addr)
+
+ascStr, utfStr :: String
+ascStr = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+utfStr = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\0xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+
 main :: IO ()
 main = do
   defaultMain
@@ -256,9 +272,15 @@ main = do
         , benchB' "ensureFree 8"  ()  (const (ensureFree 8))
         , benchB' "intHost 1"     1   intHost
         , benchB' "UTF-8 String (naive)" "hello world\0" fromString
-        , benchB' "UTF-8 String"  () $ \() -> P.cstringUtf8 "hello world\0"#
+        , benchB' "UTF-8 String"  () $ \() -> utflit (Ptr "hello world\xc0\x80"#)
         , benchB' "String (naive)" "hello world!" fromString
-        , benchB' "String"        () $ \() -> P.cstring "hello world!"#
+        , benchB' "String"        () $ \() -> asclit (Ptr "hello world!"#)
+        , benchB' "AsciiLit"      () $ \() -> asclit ascBuf
+        , benchB' "Utf8Lit"       () $ \() -> utflit utfBuf
+        , benchB' "strLit"        () $ \() -> string8 ascStr
+        , benchB' "utfLit"        () $ \() -> stringUtf8 utfStr
+        , benchB' "strLitInline"  () $ \() -> string8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
+        , benchB' "utfLitInline"  () $ \() -> stringUtf8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
         ]
 
       , bgroup "Encoding wrappers"

diff --git a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs
@@ -24,17 +24,7 @@ import           Test.Tasty.QuickCheck
 
 tests :: [TestTree]
 tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8
-               , testsCombinatorsB, [testCString, testCStringUtf8] ]
-
-testCString :: TestTree
-testCString = testProperty "cstring" $
-    toLazyByteString (BP.cstring "hello world!"#) ==
-      LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!"
-
-testCStringUtf8 :: TestTree
-testCStringUtf8 = testProperty "cstringUtf8" $
-    toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) ==
-      LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!"
+               , testsCombinatorsB ]
 
 ------------------------------------------------------------------------------
 -- Binary