From e6cc4a29fded7ffe09beb3bac8ec9bb84a610f9c Mon Sep 17 00:00:00 2001 From: Viktor Dukhovni Date: Sat, 14 Jan 2023 01:44:20 -0500 Subject: [PATCH] Further tuneup of cstring{,Utf8} Moved to Data.ByteString.Builder.Internal, as these no longer have anything to do with 'BoundedPrim', and can benefit from supporting internal code in their new home. --- Data/ByteString/Builder/Internal.hs | 78 ++++++++++++++- Data/ByteString/Builder/Prim.hs | 98 +------------------ Data/ByteString/Internal.hs | 1 - Data/ByteString/Internal/Type.hs | 4 - bench/BenchAll.hs | 2 + .../Data/ByteString/Builder/Prim/Tests.hs | 15 +-- .../builder/Data/ByteString/Builder/Tests.hs | 19 +++- 7 files changed, 102 insertions(+), 115 deletions(-) diff --git a/Data/ByteString/Builder/Internal.hs b/Data/ByteString/Builder/Internal.hs index a05beead9..181808508 100644 --- a/Data/ByteString/Builder/Internal.hs +++ b/Data/ByteString/Builder/Internal.hs @@ -1,5 +1,5 @@ {-# LANGUAGE ScopedTypeVariables, CPP, BangPatterns, RankNTypes, TupleSections #-} -{-# LANGUAGE Unsafe #-} +{-# LANGUAGE MagicHash, ViewPatterns, Unsafe #-} {-# OPTIONS_HADDOCK not-home #-} -- | Copyright : (c) 2010 - 2011 Simon Meier -- License : BSD3-style (see LICENSE) @@ -86,6 +86,8 @@ module Data.ByteString.Builder.Internal ( , byteStringCopy , byteStringInsert , byteStringThreshold + , cstring + , cstringUtf8 , lazyByteStringCopy , lazyByteStringInsert @@ -127,6 +129,7 @@ module Data.ByteString.Builder.Internal ( ) where import Control.Arrow (second) +import Control.Monad (when) #if !(MIN_VERSION_base(4,11,0)) import Data.Semigroup (Semigroup((<>))) @@ -140,10 +143,12 @@ import qualified Data.ByteString.Short.Internal as Sh import qualified GHC.IO.Buffer as IO (Buffer(..), newByteBuffer) import GHC.IO.Handle.Internals (wantWritableHandle, flushWriteBuffer) import GHC.IO.Handle.Types (Handle__, haByteBuffer, haBufferMode) +import GHC.Exts import System.IO (hFlush, BufferMode(..), Handle) import Data.IORef import Foreign +import Foreign.C.String (CString) import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr) import System.IO.Unsafe (unsafeDupablePerformIO) @@ -857,6 +862,77 @@ byteStringInsert :: S.ByteString -> Builder byteStringInsert = \bs -> builder $ \k (BufferRange op _) -> return $ insertChunk op bs k + +------------------------------------------------------------------------------ +-- Raw CString encoding +------------------------------------------------------------------------------ + +-- | A null-terminated ASCII encoded 'Foreign.C.String.CString'. +-- Null characters are not representable. +-- +-- @since 0.11.5.0 +{-# INLINABLE cstring #-} +cstring :: Addr# -> Builder +cstring = \addr -> builder $ \k br -> do + let ip = Ptr addr +#if __GLASGOW_HASKELL__ >= 811 + ipe = Ptr (addr `plusAddr#` (cstringLength# addr)) +#else + !ipe <- plusPtr ip . fromIntegral <$> S.c_strlen ip +#endif + wrappedBytesCopyStep (BufferRange ip ipe) k br + +-- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'. +-- Null characters can be encoded as @0xc0 0x80@. +-- +-- @since 0.11.5.0 +cstringUtf8 :: Addr# -> Builder +cstringUtf8 = \addr0 -> builder $ \k br -> do +#if __GLASGOW_HASKELL__ >= 811 + let len = cstringLength# addr0 +#else + (I# len) <- fromIntegral <$> S.c_strlen (Ptr addr0) +#endif + nullAt <- c_strstr (Ptr addr0) (Ptr "\xc0\x80"#) + cstringUtf8_step addr0 len nullAt k br +{-# INLINABLE cstringUtf8 #-} + +cstringUtf8_step :: Addr# -> Int# -> Ptr Word8 -> BuildStep r -> BuildStep r +cstringUtf8_step addr len ((== nullPtr) -> True) k br = + -- Contains no encoded nulls, use simple copy codepath + wrappedBytesCopyStep (BufferRange ip ipe) k br + where + ip = Ptr addr + ipe = Ptr (addr `plusAddr#` len) +cstringUtf8_step addr len !nullAt k (BufferRange op0 ope) + -- Copy as much of the null-free portion of the string as fits into the + -- available buffer space. If the string is long enough, we may have asked + -- for less than its full length, filling the buffer with the rest will go + -- into the next builder step. + | avail > nullFree = do + when (nullFree > 0) (S.memcpy op0 (Ptr addr) nullFree) + pokeElemOff op0 nullFree 0 + let !op' = op0 `plusPtr` (nullFree + 1) + nread# = nullFree# +# 2# + addr' = addr `plusAddr#` nread# + len' = len -# nread# + nullAt' <- c_strstr (Ptr addr') (Ptr "\xc0\x80"#) + cstringUtf8_step addr' len' nullAt' k (BufferRange op' ope) + | otherwise = do + let !copy@(I# copy#) = min avail nullFree + when (copy > 0) (S.memcpy op0 (Ptr addr) copy) + let !op' = op0 `plusPtr` copy + addr' = addr `plusAddr#` copy# + len' = len -# copy# + return $ bufferFull 1 op' (cstringUtf8_step addr' len' nullAt k) + where + !avail = ope `minusPtr` op0 + !nullFree@(I# nullFree#) = nullAt `minusPtr` (Ptr addr) + +foreign import ccall unsafe "string.h strstr" c_strstr + :: CString -> CString -> IO (Ptr Word8) + + -- Short bytestrings ------------------------------------------------------------------------------ diff --git a/Data/ByteString/Builder/Prim.hs b/Data/ByteString/Builder/Prim.hs index 220bd310c..0fe7f3841 100644 --- a/Data/ByteString/Builder/Prim.hs +++ b/Data/ByteString/Builder/Prim.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE BangPatterns, CPP, ScopedTypeVariables #-} +{-# LANGUAGE BangPatterns, ScopedTypeVariables #-} {-# LANGUAGE MagicHash, UnboxedTuples, PatternGuards #-} {-# OPTIONS_GHC -fno-warn-unused-imports #-} {-# LANGUAGE Trustworthy #-} @@ -433,8 +433,8 @@ module Data.ByteString.Builder.Prim ( -- a decimal number with UTF-8 encoded characters. , charUtf8 - , cstring - , cstringUtf8 + , cstring -- Backwards-compatibility re-exports from Internal.hs + , cstringUtf8 -- these no longer make use of the BoundPrim API. {- -- * Testing support @@ -468,6 +468,7 @@ import Data.ByteString.Builder.Prim.ASCII import Foreign import Foreign.C.Types +import Foreign.C.String (CString) import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr) import GHC.Int (Int (..)) import GHC.Word (Word8 (..)) @@ -664,97 +665,6 @@ primMapLazyByteStringBounded w = L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty ------------------------------------------------------------------------------- --- Raw CString encoding ------------------------------------------------------------------------------- - --- | A null-terminated ASCII encoded 'Foreign.C.String.CString'. --- Null characters are not representable. --- --- @since 0.11.0.0 -cstring :: Addr# -> Builder -cstring = \addr0 -> builder $ \k br -> do -#if __GLASGOW_HASKELL__ >= 811 - let len = cstringLength# addr0 -#else - (I# len) <- fromIntegral <$> S.c_strlen (Ptr addr0) -#endif - cstring_step addr0 len k br -{-# INLINE cstring #-} - -cstring_step :: Addr# -> Int# -> BuildStep r -> BuildStep r -cstring_step !addr !len !k br@(BufferRange op0 ope) - -- String is empty, process the continuation - | (I# len) == 0 = k br - -- Buffer is full, allocate some more... We ask for just one more - -- byte, but the builder allocation strategy will in practice give - -- us more space, which we'll consume in a single step. - | op0 == ope = - return $ bufferFull 1 op0 (cstring_step addr len k) - -- Copy as much of the string as fits into the available buffer space. - -- If the string is long enough, we may have asked for less than its - -- full length, filling the buffer with the rest will go into the next - -- builder step. - | otherwise = do - let !avail@(I# avail#) = min (I# len) (ope `minusPtr` op0) - br' = BufferRange (op0 `plusPtr` avail) ope - addr' = addr `plusAddr#` avail# - len' = len -# avail# - S.memcpy op0 (Ptr addr) avail - cstring_step addr' len' k br' - --- | A null-terminated UTF-8 encoded 'Foreign.C.String.CString'. --- Null characters can be encoded as @0xc0 0x80@. --- --- @since 0.11.0.0 -cstringUtf8 :: Addr# -> Builder -cstringUtf8 = \addr0 -> builder $ \k br -> do -#if __GLASGOW_HASKELL__ >= 811 - let len = cstringLength# addr0 -#else - (I# len) <- fromIntegral <$> S.c_strlen (Ptr addr0) -#endif - nullAt <- S.c_strstr (Ptr addr0) (Ptr "\xc0\x80"#) - cstringUtf8_step addr0 len nullAt k br -{-# INLINE cstringUtf8 #-} - -cstringUtf8_step :: Addr# -> Int# -> Ptr Word8 -> BuildStep r -> BuildStep r -cstringUtf8_step !addr !len !nullAt !k br@(BufferRange op0@(Ptr op0#) ope) - -- String is empty, process the continuation - | (I# len) == 0 = k br - -- Contains no encoded nulls, use simpler 'cstring' code - | nullPtr == nullAt = - cstring_step addr len k br - -- Buffer is full, allocate some more... We ask for just one more - -- byte, but the builder allocation strategy will in practice give - -- us more space, which we'll consume in a single step. - | op0 == ope = - return $ bufferFull 1 op0 (cstringUtf8_step addr len nullAt k) - -- We're at the encoded null-byte, append a '\0' to the buffer and - -- continue with the rest of the input string, after locating the - -- next encoded null-byte, if any. - | (Ptr addr) == nullAt = do - let !(W8# nullByte#) = 0 - IO $ \s -> case writeWord8OffAddr# op0# 0# nullByte# s of - s' -> (# s', () #) - let br' = BufferRange (op0 `plusPtr` 1) ope - addr' = addr `plusAddr#` 2# - len' = len -# 2# - nullAt' <- S.c_strstr (Ptr addr') (Ptr "\xc0\x80"#) - cstringUtf8_step addr' len' nullAt' k br' - -- Copy as much of the null-free portion of the string as fits into the - -- available buffer space. If the string is long enough, we may have asked - -- for less than its full length, filling the buffer with the rest will go - -- into the next builder step. - | otherwise = do - let !nullFree = nullAt `minusPtr` (Ptr addr) - !avail@(I# avail#) = min nullFree (ope `minusPtr` op0) - br' = BufferRange (op0 `plusPtr` avail) ope - addr' = addr `plusAddr#` avail# - len' = len -# avail# - S.memcpy op0 (Ptr addr) avail - cstringUtf8_step addr' len' nullAt k br' - ------------------------------------------------------------------------------ -- Char8 encoding ------------------------------------------------------------------------------ diff --git a/Data/ByteString/Internal.hs b/Data/ByteString/Internal.hs index a0fd23d55..b4481a833 100644 --- a/Data/ByteString/Internal.hs +++ b/Data/ByteString/Internal.hs @@ -65,7 +65,6 @@ module Data.ByteString.Internal ( -- * Standard C Functions c_strlen, - c_strstr, c_free_finalizer, memchr, diff --git a/Data/ByteString/Internal/Type.hs b/Data/ByteString/Internal/Type.hs index eb1d2b316..24596c4f6 100644 --- a/Data/ByteString/Internal/Type.hs +++ b/Data/ByteString/Internal/Type.hs @@ -82,7 +82,6 @@ module Data.ByteString.Internal.Type ( -- * Standard C Functions c_strlen, - c_strstr, c_free_finalizer, memchr, @@ -1002,9 +1001,6 @@ accursedUnutterablePerformIO (IO m) = case m realWorld# of (# _, r #) -> r foreign import ccall unsafe "string.h strlen" c_strlen :: CString -> IO CSize -foreign import ccall unsafe "string.h strstr" c_strstr - :: CString -> CString -> IO (Ptr Word8) - foreign import ccall unsafe "static stdlib.h &free" c_free_finalizer :: FunPtr (Ptr Word8 -> IO ()) diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs index 3daa09463..6d40718fb 100644 --- a/bench/BenchAll.hs +++ b/bench/BenchAll.hs @@ -259,6 +259,8 @@ main = do , benchB' "UTF-8 String" () $ \() -> P.cstringUtf8 "hello world\0"# , benchB' "String (naive)" "hello world!" fromString , benchB' "String" () $ \() -> P.cstring "hello world!"# + , benchB' "AsciiLit64" () $ \() -> P.cstring "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"# + , benchB' "Utf8Lit64" () $ \() -> P.cstringUtf8 "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xc0\x80xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"# ] , bgroup "Encoding wrappers" diff --git a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs index 9d499b80d..fa1ae5894 100644 --- a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs +++ b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs @@ -24,20 +24,7 @@ import Test.Tasty.QuickCheck tests :: [TestTree] tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8 - , testsCombinatorsB, [testCString, testCStringUtf8] ] - -testCString :: TestTree -testCString = testProperty "cstring" $ - toLazyByteString (BP.cstring "hello world!"#) == - LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!" - -testCStringUtf8 :: TestTree -testCStringUtf8 = testProperty "cstringUtf8" $ - toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world\xc0\x80!"#) == - LC.pack "hello" `L.append` L.singleton 0x00 - `L.append` LC.pack "world" - `L.append` L.singleton 0x00 - `L.append` LC.singleton '!' + , testsCombinatorsB ] ------------------------------------------------------------------------------ -- Binary diff --git a/tests/builder/Data/ByteString/Builder/Tests.hs b/tests/builder/Data/ByteString/Builder/Tests.hs index fa58645e4..a8a6ab266 100644 --- a/tests/builder/Data/ByteString/Builder/Tests.hs +++ b/tests/builder/Data/ByteString/Builder/Tests.hs @@ -36,6 +36,7 @@ import Data.Word import qualified Data.ByteString as S import qualified Data.ByteString.Internal as S import qualified Data.ByteString.Lazy as L +import qualified Data.ByteString.Lazy.Char8 as LC import qualified Data.ByteString.Short as Sh import Data.ByteString.Builder @@ -73,7 +74,8 @@ tests = testsASCII ++ testsFloating ++ testsChar8 ++ - testsUtf8 + testsUtf8 ++ + testCString ------------------------------------------------------------------------------ @@ -988,3 +990,18 @@ testsUtf8 = [ testBuilderConstr "charUtf8" charUtf8_list charUtf8 , testBuilderConstr "stringUtf8" (foldMap charUtf8_list) stringUtf8 ] + +testCString :: [TestTree] +testCString = + [ testProperty "cstring" $ + toLazyByteString (BI.cstring "hello world!"#) == + LC.pack "hello" `L.append` L.singleton 0x20 + `L.append` LC.pack "world!" + , testProperty "cstringUtf8" $ + toLazyByteString (BI.cstringUtf8 "hello\xc0\x80\xc0\x80world\xc0\x80!"#) == + LC.pack "hello" `L.append` L.singleton 0x00 + `L.append` L.singleton 0x00 + `L.append` LC.pack "world" + `L.append` L.singleton 0x00 + `L.append` LC.singleton '!' + ]