Skip to content

Commit

Permalink
Avoid per-byte loop in cstring{,Utf8} builders
Browse files Browse the repository at this point in the history
Copy chunks of the input to the output buffer with 'memcpy', up to the shorter
of the available buffer space and the "null-free" portion of the remaining
string.  For the UTF8 version, encoded NUL bytes are located via strstr(3).
  • Loading branch information
hs-viktor committed Feb 9, 2023
1 parent 0bd68ca commit 0645428
Show file tree
Hide file tree
Showing 8 changed files with 164 additions and 74 deletions.
21 changes: 14 additions & 7 deletions Data/ByteString/Builder.hs
Original file line number Diff line number Diff line change
Expand Up @@ -263,12 +263,14 @@ import qualified Data.ByteString.Builder.Prim as P
import qualified Data.ByteString.Lazy.Internal as L
import Data.ByteString.Builder.ASCII
import Data.ByteString.Builder.RealFloat
import Data.ByteString.Internal (byteCountLiteral)

import Data.String (IsString(..))
import System.IO (Handle, IOMode(..), withBinaryFile)
import Foreign
import GHC.Base (unpackCString#, unpackCStringUtf8#,
unpackFoldrCString#, build)
import GHC.Ptr (Ptr(..))

-- | Execute a 'Builder' and return the generated chunks as a lazy 'L.ByteString'.
-- The work is performed lazy, i.e., only when a chunk of the lazy 'L.ByteString'
Expand Down Expand Up @@ -440,18 +442,20 @@ char8 :: Char -> Builder
char8 = P.primFixed P.char8

-- | Char8 encode a 'String'.
{-# INLINE [1] string8 #-} -- phased to allow P.cstring rewrite
{-# NOINLINE string8 #-}
string8 :: String -> Builder
string8 = P.primMapListFixed P.char8

-- GHC desugars string literals with unpackCString# which the simplifier tends
-- to promptly turn into build (unpackFoldrCString# s), so we match on both.
{-# RULES
"string8/unpackCString#" forall s.
string8 (unpackCString# s) = P.cstring s
string8 (unpackCString# s) =
ascLiteralCopy (Ptr s) (byteCountLiteral s)

"string8/unpackFoldrCString#" forall s.
string8 (build (unpackFoldrCString# s)) = P.cstring s
string8 (build (unpackFoldrCString# s)) =
ascLiteralCopy (Ptr s) (byteCountLiteral s)
#-}

------------------------------------------------------------------------------
Expand All @@ -467,19 +471,22 @@ charUtf8 = P.primBounded P.charUtf8
--
-- Note that 'stringUtf8' performs no codepoint validation and consequently may
-- emit invalid UTF-8 if asked (e.g. single surrogates).
{-# INLINE [1] stringUtf8 #-} -- phased to allow P.cstring rewrite
{-# NOINLINE stringUtf8 #-}
stringUtf8 :: String -> Builder
stringUtf8 = P.primMapListBounded P.charUtf8

{-# RULES
"stringUtf8/unpackCStringUtf8#" forall s.
stringUtf8 (unpackCStringUtf8# s) = P.cstringUtf8 s
stringUtf8 (unpackCStringUtf8# s) =
modUtf8LitCopy (Ptr s) (byteCountLiteral s)

"stringUtf8/unpackCString#" forall s.
stringUtf8 (unpackCString# s) = P.cstring s
stringUtf8 (unpackCString# s) =
ascLiteralCopy (Ptr s) (byteCountLiteral s)

"stringUtf8/unpackFoldrCString#" forall s.
stringUtf8 (build (unpackFoldrCString# s)) = P.cstring s
stringUtf8 (build (unpackFoldrCString# s)) =
ascLiteralCopy (Ptr s) (byteCountLiteral s)
#-}

instance IsString Builder where
Expand Down
76 changes: 75 additions & 1 deletion Data/ByteString/Builder/Internal.hs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{-# LANGUAGE ScopedTypeVariables, CPP, BangPatterns, RankNTypes, TupleSections #-}
{-# LANGUAGE Unsafe #-}
{-# LANGUAGE MagicHash, ViewPatterns, Unsafe #-}
{-# OPTIONS_HADDOCK not-home #-}
-- | Copyright : (c) 2010 - 2011 Simon Meier
-- License : BSD3-style (see LICENSE)
Expand Down Expand Up @@ -84,6 +84,8 @@ module Data.ByteString.Builder.Internal (
-- , sizedChunksInsert

, byteStringCopy
, ascLiteralCopy
, modUtf8LitCopy
, byteStringInsert
, byteStringThreshold

Expand Down Expand Up @@ -127,6 +129,7 @@ module Data.ByteString.Builder.Internal (
) where

import Control.Arrow (second)
import Control.Monad (when)

#if !(MIN_VERSION_base(4,11,0))
import Data.Semigroup (Semigroup((<>)))
Expand All @@ -140,10 +143,12 @@ import qualified Data.ByteString.Short.Internal as Sh
import qualified GHC.IO.Buffer as IO (Buffer(..), newByteBuffer)
import GHC.IO.Handle.Internals (wantWritableHandle, flushWriteBuffer)
import GHC.IO.Handle.Types (Handle__, haByteBuffer, haBufferMode)
import GHC.Ptr (Ptr(..))
import System.IO (hFlush, BufferMode(..), Handle)
import Data.IORef

import Foreign
import Foreign.C.String (CString)
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
import System.IO.Unsafe (unsafeDupablePerformIO)

Expand Down Expand Up @@ -857,6 +862,75 @@ byteStringInsert :: S.ByteString -> Builder
byteStringInsert =
\bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k


------------------------------------------------------------------------------
-- Raw CString encoding
------------------------------------------------------------------------------

-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
-- strings that are free of embedded (overlong-encoded as the two-byte sequence
-- @0xC0 0x80@) null characters.
--
-- @since 0.11.5.0
{-# INLINABLE ascLiteralCopy #-}
ascLiteralCopy :: Ptr Word8 -> Int -> Builder
ascLiteralCopy = \ !ip !len -> builder $ \k br -> do
let !ipe = ip `plusPtr` len
wrappedBytesCopyStep (BufferRange ip ipe) k br

-- | GHC represents @NUL@ in string literals via an overlong 2-byte encoding,
-- which is part of "modified UTF-8" (GHC does not also implement CESU-8).
modifiedUtf8NUL :: CString
modifiedUtf8NUL = Ptr "\xc0\x80"#

-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
-- encoded strings that may contain embedded overlong-encodings (as the
-- two-byte sequence @0xC0 0x80@) of null characters.
--
-- @since 0.11.5.0
{-# INLINABLE modUtf8LitCopy #-}
modUtf8LitCopy :: Ptr Word8 -> Int -> Builder
modUtf8LitCopy = \ !ip !len -> builder $ \k br -> do
nullAt <- c_strstr (castPtr ip) modifiedUtf8NUL
modUtf8_step ip len nullAt k br

modUtf8_step :: Ptr Word8 -> Int -> Ptr Word8 -> BuildStep r -> BuildStep r
modUtf8_step !ip !len ((== nullPtr) -> True) k br =
-- Contains no encoded nulls, use simple copy codepath
wrappedBytesCopyStep (BufferRange ip ipe) k br
where
!ipe = ip `plusPtr` len
modUtf8_step !ip !len !nullAt k (BufferRange op0 ope)
-- Copy as much of the null-free portion of the string as fits into the
-- available buffer space. If the string is long enough, we may have asked
-- for less than its full length, filling the buffer with the rest will go
-- into the next builder step.
| avail > nullFree = do
when (nullFree > 0) (copyBytes op0 ip nullFree)
pokeElemOff op0 nullFree 0
let used = nullFree + 2
len' = len - used
!ip' = ip `plusPtr` used
!op' = op0 `plusPtr` (nullFree + 1)
nullAt' <- c_strstr ip' modifiedUtf8NUL
modUtf8_step ip' len' nullAt' k (BufferRange op' ope)
| avail > 0 = do
-- avail <= nullFree
copyBytes op0 ip avail
let len' = len - avail
!ip' = ip `plusPtr` avail
!op' = op0 `plusPtr` avail
return $ bufferFull 1 op' (modUtf8_step ip' len' nullAt k)
| otherwise =
return $ bufferFull 1 op0 (modUtf8_step ip len nullAt k)
where
!avail = ope `minusPtr` op0
!nullFree = nullAt `minusPtr` ip

foreign import ccall unsafe "string.h strstr" c_strstr
:: CString -> CString -> IO (Ptr Word8)


-- Short bytestrings
------------------------------------------------------------------------------

Expand Down
58 changes: 12 additions & 46 deletions Data/ByteString/Builder/Prim.hs
Original file line number Diff line number Diff line change
Expand Up @@ -663,59 +663,25 @@ primMapLazyByteStringBounded w =
L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty


------------------------------------------------------------------------------
-- Raw CString encoding
------------------------------------------------------------------------------

-- | A null-terminated ASCII encoded 'Foreign.C.String.CString'.
-- Null characters are not representable.
-- | Builder for raw 'Addr#' pointers to null-terminated primitive ASCII
-- strings that are free of embedded (overlong-encoded as the two-byte sequence
-- @0xC0 0x80@) null characters.
--
-- @since 0.11.0.0
{-# DEPRECATED cstring "Use ascLiteralCopy instead" #-}
cstring :: Addr# -> Builder
cstring =
\addr0 -> builder $ step addr0
where
step :: Addr# -> BuildStep r -> BuildStep r
step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
| W8# ch == 0 = k br
| op0 == ope =
return $ bufferFull 1 op0 (step addr k)
| otherwise = do
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 1#) k br'
where
!ch = indexWord8OffAddr# addr 0#
cstring s = ascLiteralCopy (Ptr s) (S.byteCountLiteral s)
{-# INLINE cstring #-}

-- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'.
-- Null characters can be encoded as @0xc0 0x80@.
-- | Builder for raw 'Addr#' pointers to null-terminated primitive UTF-8
-- encoded strings that may contain embedded overlong-encodings (as the
-- two-byte sequence @0xC0 0x80@) of null characters.
--
-- @since 0.11.0.0
{-# DEPRECATED cstringUtf8 "Use modUtf8LitCopy instead" #-}
cstringUtf8 :: Addr# -> Builder
cstringUtf8 =
\addr0 -> builder $ step addr0
where
step :: Addr# -> BuildStep r -> BuildStep r
step !addr !k br@(BufferRange op0@(Ptr op0#) ope)
| W8# ch == 0 = k br
| op0 == ope =
return $ bufferFull 1 op0 (step addr k)
-- NULL is encoded as 0xc0 0x80
| W8# ch == 0xc0
, W8# (indexWord8OffAddr# addr 1#) == 0x80 = do
let !(W8# nullByte#) = 0
IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 2#) k br'
| otherwise = do
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 1#) k br'
where
!ch = indexWord8OffAddr# addr 0#
cstringUtf8 s = modUtf8LitCopy (Ptr s) (S.byteCountLiteral s)
{-# INLINE cstringUtf8 #-}

------------------------------------------------------------------------------
-- Char8 encoding
Expand Down
2 changes: 1 addition & 1 deletion Data/ByteString/Internal.hs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ module Data.ByteString.Internal (
unpackBytes, unpackAppendBytesLazy, unpackAppendBytesStrict,
unpackChars, unpackAppendCharsLazy, unpackAppendCharsStrict,
unsafePackAddress, unsafePackLenAddress,
unsafePackLiteral, unsafePackLenLiteral,
unsafePackLiteral, unsafePackLenLiteral, byteCountLiteral,

-- * Low level imperative construction
empty,
Expand Down
19 changes: 14 additions & 5 deletions Data/ByteString/Internal/Type.hs
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ module Data.ByteString.Internal.Type (
unpackBytes, unpackAppendBytesLazy, unpackAppendBytesStrict,
unpackChars, unpackAppendCharsLazy, unpackAppendCharsStrict,
unsafePackAddress, unsafePackLenAddress,
unsafePackLiteral, unsafePackLenLiteral,
unsafePackLiteral, unsafePackLenLiteral, byteCountLiteral,

-- * Low level imperative construction
empty,
Expand Down Expand Up @@ -434,13 +434,22 @@ unsafePackLenAddress len addr# = do
-- @since 0.11.1.0
unsafePackLiteral :: Addr# -> ByteString
unsafePackLiteral addr# =
unsafePackLenLiteral (byteCountLiteral addr#) addr#
{-# INLINE unsafePackLiteral #-}

-- | Byte count of null-terminated primitive literal string excluding the
-- terminating null byte.
byteCountLiteral :: Addr# -> Int
byteCountLiteral addr# =
#if __GLASGOW_HASKELL__ >= 811
unsafePackLenLiteral (I# (cstringLength# addr#)) addr#
I# (cstringLength# addr#)
#else
let len = accursedUnutterablePerformIO (c_strlen (Ptr addr#))
in unsafePackLenLiteral (fromIntegral len) addr#
fromIntegral (pure_strlen (Ptr addr#))

foreign import ccall unsafe "string.h strlen" pure_strlen
:: CString -> CSize
#endif
{-# INLINE unsafePackLiteral #-}
{-# INLINE byteCountLiteral #-}


-- | See 'unsafePackLiteral'. This function is similar,
Expand Down
26 changes: 24 additions & 2 deletions bench/BenchAll.hs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ import qualified Data.ByteString as S
import qualified Data.ByteString.Char8 as S8
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as L8
import Data.ByteString.Internal (byteCountLiteral)

import Data.ByteString.Builder
import Data.ByteString.Builder.Extra (byteStringCopy,
Expand All @@ -33,10 +34,13 @@ import Data.ByteString.Builder.Extra (byteStringCopy,
import Data.ByteString.Builder.Internal (ensureFree)
import Data.ByteString.Builder.Prim (BoundedPrim, FixedPrim,
(>$<))
import qualified Data.ByteString.Builder.Internal as BI
import qualified Data.ByteString.Builder.Prim as P
import qualified Data.ByteString.Builder.Prim.Internal as PI

import Foreign
import GHC.Exts (Addr#)
import GHC.Ptr (Ptr(..))

import System.Random

Expand Down Expand Up @@ -247,6 +251,18 @@ largeTraversalInput = S.concat (replicate 10 byteStringData)
smallTraversalInput :: S.ByteString
smallTraversalInput = S8.pack "The quick brown fox"

ascBuf, utfBuf :: Ptr Word8
ascBuf = Ptr "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#
utfBuf = Ptr "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xc0\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"#

asclit, utflit :: Ptr Word8 -> Builder
asclit str@(Ptr addr) = BI.ascLiteralCopy str (byteCountLiteral addr)
utflit str@(Ptr addr) = BI.modUtf8LitCopy str (byteCountLiteral addr)

ascStr, utfStr :: String
ascStr = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
utfStr = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\0xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

main :: IO ()
main = do
defaultMain
Expand All @@ -256,9 +272,15 @@ main = do
, benchB' "ensureFree 8" () (const (ensureFree 8))
, benchB' "intHost 1" 1 intHost
, benchB' "UTF-8 String (naive)" "hello world\0" fromString
, benchB' "UTF-8 String" () $ \() -> P.cstringUtf8 "hello world\0"#
, benchB' "UTF-8 String" () $ \() -> utflit (Ptr "hello world\xc0\x80"#)
, benchB' "String (naive)" "hello world!" fromString
, benchB' "String" () $ \() -> P.cstring "hello world!"#
, benchB' "String" () $ \() -> asclit (Ptr "hello world!"#)
, benchB' "AsciiLit" () $ \() -> asclit ascBuf
, benchB' "Utf8Lit" () $ \() -> utflit utfBuf
, benchB' "strLit" () $ \() -> string8 ascStr
, benchB' "utfLit" () $ \() -> stringUtf8 utfStr
, benchB' "strLitInline" () $ \() -> string8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
, benchB' "utfLitInline" () $ \() -> stringUtf8 "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX\0XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
]

, bgroup "Encoding wrappers"
Expand Down
12 changes: 1 addition & 11 deletions tests/builder/Data/ByteString/Builder/Prim/Tests.hs
Original file line number Diff line number Diff line change
Expand Up @@ -24,17 +24,7 @@ import Test.Tasty.QuickCheck

tests :: [TestTree]
tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8
, testsCombinatorsB, [testCString, testCStringUtf8] ]

testCString :: TestTree
testCString = testProperty "cstring" $
toLazyByteString (BP.cstring "hello world!"#) ==
LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!"

testCStringUtf8 :: TestTree
testCStringUtf8 = testProperty "cstringUtf8" $
toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) ==
LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!"
, testsCombinatorsB ]

------------------------------------------------------------------------------
-- Binary
Expand Down
Loading

0 comments on commit 0645428

Please sign in to comment.