From a75a67c7209cf03d60e8f8529e30ffd6bcfb6fb8 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Wed, 12 Jul 2017 23:47:45 -0400 Subject: [PATCH 1/8] Test naive String Builder --- bench/BenchAll.hs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs index 908af4e08..f0586e555 100644 --- a/bench/BenchAll.hs +++ b/bench/BenchAll.hs @@ -15,6 +15,7 @@ module Main (main) where import Criterion.Main import Data.Foldable (foldMap) import Data.Monoid +import Data.String import Prelude hiding (words) import qualified Data.ByteString as S @@ -36,6 +37,8 @@ import qualified Blaze.Text as Blaze import qualified "bytestring" Data.ByteString as OldS import qualified "bytestring" Data.ByteString.Lazy as OldL +import Paths_bench_bytestring + import Foreign import System.Random @@ -133,7 +136,7 @@ benchFE name = benchBE name . P.liftFixedToBounded {-# INLINE benchBE #-} benchBE :: String -> BoundedPrim Int -> Benchmark benchBE name e = - bench (name ++" (" ++ show nRepl ++ ")") $ benchIntEncodingB nRepl e + bench (name ++" (" ++ show nRepl ++ ")") $ whnfIO $ benchIntEncodingB nRepl e -- We use this construction of just looping through @n,n-1,..,1@ to ensure that -- we measure the speed of the encoding and not the speed of generating the @@ -166,7 +169,7 @@ w :: Int -> Word8 w = fromIntegral hashWord8 :: Word8 -> Word8 -hashWord8 = fromIntegral . hashInt . w +hashWord8 = fromIntegral . hashInt . fromIntegral partitionStrict p = nf (S.partition p) . randomStrict $ mkStdGen 98423098 where randomStrict = fst . S.unfoldrN 10000 (Just . random) @@ -235,6 +238,7 @@ main = do [ benchB' "mempty" () (const mempty) , benchB' "ensureFree 8" () (const (ensureFree 8)) , benchB' "intHost 1" 1 intHost + , benchB' "String (naive)" "hello world!" fromString ] , bgroup "Encoding wrappers" From aff9ea27313c5d231f033f4629f270d062fb8d09 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Wed, 12 Jul 2017 21:22:04 -0400 Subject: [PATCH 2/8] Test and benchmark cstring --- bench/BenchAll.hs | 2 ++ bench/bench-bytestring.cabal | 1 + tests/builder/Data/ByteString/Builder/Prim/Tests.hs | 9 +++++++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs index f0586e555..8d9baf74d 100644 --- a/bench/BenchAll.hs +++ b/bench/BenchAll.hs @@ -1,6 +1,7 @@ {-# LANGUAGE BangPatterns #-} {-# LANGUAGE PackageImports #-} {-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE MagicHash #-} -- | -- Copyright : (c) 2011 Simon Meier -- License : BSD3-style (see LICENSE) @@ -239,6 +240,7 @@ main = do , benchB' "ensureFree 8" () (const (ensureFree 8)) , benchB' "intHost 1" 1 intHost , benchB' "String (naive)" "hello world!" fromString + , benchB' "String" () $ \() -> P.cstring "hello world!"# ] , bgroup "Encoding wrappers" diff --git a/bench/bench-bytestring.cabal b/bench/bench-bytestring.cabal index 71cc99fd6..dec1c9214 100644 --- a/bench/bench-bytestring.cabal +++ b/bench/bench-bytestring.cabal @@ -34,6 +34,7 @@ flag integer-simple executable bench-bytestring-builder hs-source-dirs: . .. main-is: BenchAll.hs + other-modules: Paths_bench_bytestring build-depends: base >= 4 && < 5 , ghc-prim diff --git a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs index ae152e248..8c3cc828c 100644 --- a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs +++ b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE CPP, ScopedTypeVariables #-} +{-# LANGUAGE CPP, ScopedTypeVariables, MagicHash #-} -- | -- Copyright : (c) 2011 Simon Meier @@ -14,12 +14,14 @@ module Data.ByteString.Builder.Prim.Tests (tests) where import Data.Char (ord) import qualified Data.ByteString.Lazy as L +import qualified Data.ByteString.Lazy.Char8 as LC import Data.ByteString.Builder import qualified Data.ByteString.Builder.Prim as BP import Data.ByteString.Builder.Prim.TestUtils #if defined(HAVE_TEST_FRAMEWORK) import Test.Framework +import Test.Framework.Providers.QuickCheck2 #else import TestFramework #endif @@ -27,8 +29,11 @@ import TestFramework tests :: [Test] tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8 - , testsCombinatorsB ] + , testsCombinatorsB, [testCString] ] +testCString :: Test +testCString = testProperty "cstring" $ + toLazyByteString (BP.cstring "hello world!"#) == LC.pack "hello world!" ------------------------------------------------------------------------------ -- Binary From fadbdce7bddcf477cd6a26daca40d37f867d19e0 Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Tue, 11 Jul 2017 19:43:20 -0400 Subject: [PATCH 3/8] Efficiently copy CStrings --- Data/ByteString/Builder.hs | 32 +++++++++++++++-- Data/ByteString/Builder/Prim.hs | 63 +++++++++++++++++++++++++++++++-- 2 files changed, 90 insertions(+), 5 deletions(-) diff --git a/Data/ByteString/Builder.hs b/Data/ByteString/Builder.hs index dccc88ca7..f2c0f4d2c 100644 --- a/Data/ByteString/Builder.hs +++ b/Data/ByteString/Builder.hs @@ -1,4 +1,4 @@ -{-# LANGUAGE CPP, BangPatterns #-} +{-# LANGUAGE CPP, BangPatterns, MagicHash #-} {-# OPTIONS_GHC -fno-warn-unused-imports -fno-warn-orphans #-} #if __GLASGOW_HASKELL__ >= 701 {-# LANGUAGE Trustworthy #-} @@ -263,6 +263,8 @@ import Data.ByteString.Builder.ASCII import Data.String (IsString(..)) import System.IO (Handle) import Foreign +import GHC.Base (unpackCString#, unpackCStringUtf8#, + unpackFoldrCString#, build) -- HADDOCK only imports import qualified Data.ByteString as S (concat) @@ -431,10 +433,20 @@ char8 :: Char -> Builder char8 = P.primFixed P.char8 -- | Char8 encode a 'String'. -{-# INLINE string8 #-} +{-# INLINE [1] string8 #-} -- phased to allow P.cstring rewrite string8 :: String -> Builder string8 = P.primMapListFixed P.char8 +-- GHC desugars string literals with unpackCString# which the simplifier tends +-- to promptly turn into build (unpackFoldrCString# s), so we match on both. +{-# RULES +"string8/unpackCString#" forall s. + string8 (unpackCString# s) = P.cstring s + +"string8/unpackFoldrCString#" forall s. + string8 (build (unpackFoldrCString# s)) = P.cstring s + #-} + ------------------------------------------------------------------------------ -- UTF-8 encoding ------------------------------------------------------------------------------ @@ -445,9 +457,23 @@ charUtf8 :: Char -> Builder charUtf8 = P.primBounded P.charUtf8 -- | UTF-8 encode a 'String'. -{-# INLINE stringUtf8 #-} +-- +-- Note that 'stringUtf8' performs no codepoint validation and consequently may +-- emit invalid UTF-8 if asked (e.g. single surrogates). +{-# INLINE [1] stringUtf8 #-} -- phased to allow P.cstring rewrite stringUtf8 :: String -> Builder stringUtf8 = P.primMapListBounded P.charUtf8 +{-# RULES +"stringUtf8/unpackCStringUtf8#" forall s. + stringUtf8 (unpackCStringUtf8# s) = P.cstringUtf8 s + +"stringUtf8/unpackCString#" forall s. + stringUtf8 (unpackCString# s) = P.cstring s + +"stringUtf8/unpackFoldrCString#" forall s. + stringUtf8 (build (unpackFoldrCString# s)) = P.cstring s + #-} + instance IsString Builder where fromString = stringUtf8 diff --git a/Data/ByteString/Builder/Prim.hs b/Data/ByteString/Builder/Prim.hs index 777b3091c..f03ecbb14 100644 --- a/Data/ByteString/Builder/Prim.hs +++ b/Data/ByteString/Builder/Prim.hs @@ -1,4 +1,5 @@ {-# LANGUAGE CPP, BangPatterns, ScopedTypeVariables #-} +{-# LANGUAGE MagicHash, UnboxedTuples, PatternGuards #-} {-# OPTIONS_GHC -fno-warn-unused-imports #-} #if __GLASGOW_HASKELL__ == 700 -- This is needed as a workaround for an old bug in GHC 7.0.1 (Trac #4498) @@ -437,6 +438,9 @@ module Data.ByteString.Builder.Prim ( -- a decimal number with UTF-8 encoded characters. , charUtf8 + , cstring + , cstringUtf8 + {- -- * Testing support -- | The following four functions are intended for testing use @@ -472,6 +476,7 @@ import Data.ByteString.Builder.Prim.ASCII #if MIN_VERSION_base(4,4,0) #if MIN_VERSION_base(4,7,0) import Foreign +import Foreign.C.Types #else import Foreign hiding (unsafeForeignPtrToPtr) #endif @@ -479,6 +484,8 @@ import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr) #else import Foreign #endif +import GHC.Exts +import GHC.IO ------------------------------------------------------------------------------ -- Creating Builders from bounded primitives @@ -677,6 +684,60 @@ primMapLazyByteStringBounded w = L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty +------------------------------------------------------------------------------ +-- Raw CString encoding +------------------------------------------------------------------------------ + +#if !MIN_VERSION_base(4,7,0) +-- eqWord# et al. return Bools prior to GHC 7.6 +isTrue# :: Bool -> Bool +isTrue# x = x +#endif + +-- | A null-terminated ASCII encoded 'CString'. Null characters are not representable. +cstring :: Addr# -> Builder +cstring = + \addr0 -> builder $ step addr0 + where + step :: Addr# -> BuildStep r -> BuildStep r + step !addr !k !br@(BufferRange op0@(Ptr op0#) ope) + | isTrue# (ch `eqWord#` 0##) = k br + | op0 == ope = + return $ bufferFull defaultChunkSize op0 (step addr k) + | otherwise = do + IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of + s' -> (# s', () #) + let br' = BufferRange (op0 `plusPtr` 1) ope + step (addr `plusAddr#` 1#) k br' + where + !ch = indexWord8OffAddr# addr 0# + +-- | A null-terminated UTF-8 encoded 'CString'. Null characters can be encoded as +-- @0xc0 0x80@. +cstringUtf8 :: Addr# -> Builder +cstringUtf8 = + \addr0 -> builder $ step addr0 + where + step :: Addr# -> BuildStep r -> BuildStep r + step !addr !k !br@(BufferRange op0@(Ptr op0#) ope) + | isTrue# (ch `eqWord#` 0##) = k br + | op0 == ope = + return $ bufferFull defaultChunkSize op0 (step addr k) + -- NULL is encoded as 0xc0 0x80 + | isTrue# (ch `eqWord#` 0xc0##) + , isTrue# (indexWord8OffAddr# addr 1# `eqWord#` 0x80##) = do + IO $ \s -> case writeWord8OffAddr# op0# 0# 0## s of + s' -> (# s', () #) + let br' = BufferRange (op0 `plusPtr` 1) ope + step (addr `plusAddr#` 1#) k br' + | otherwise = do + IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of + s' -> (# s', () #) + let br' = BufferRange (op0 `plusPtr` 1) ope + step (addr `plusAddr#` 1#) k br' + where + !ch = indexWord8OffAddr# addr 0# + ------------------------------------------------------------------------------ -- Char8 encoding ------------------------------------------------------------------------------ @@ -741,5 +802,3 @@ encodeCharUtf8 f1 f2 f3 f4 c = case ord c of x3 = fromIntegral $ ((x `shiftR` 6) .&. 0x3F) + 0x80 x4 = fromIntegral $ (x .&. 0x3F) + 0x80 in f4 x1 x2 x3 x4 - - From 7c891d6842e1f0c319d4d9b02e906ff34129d51f Mon Sep 17 00:00:00 2001 From: Ben Gamari Date: Thu, 13 Jul 2017 00:18:14 -0400 Subject: [PATCH 4/8] Benchmark UTF-8 strings --- bench/BenchAll.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs index 8d9baf74d..263be7896 100644 --- a/bench/BenchAll.hs +++ b/bench/BenchAll.hs @@ -239,6 +239,8 @@ main = do [ benchB' "mempty" () (const mempty) , benchB' "ensureFree 8" () (const (ensureFree 8)) , benchB' "intHost 1" 1 intHost + , benchB' "UTF-8 String (naive)" "hello world\0" fromString + , benchB' "UTF-8 String" () $ \() -> P.cstringUtf8 "hello world\0"# , benchB' "String (naive)" "hello world!" fromString , benchB' "String" () $ \() -> P.cstring "hello world!"# ] From 083b2dbd57be48c05e706bfef677fe3e2176de86 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Fri, 21 Aug 2020 21:51:37 +0100 Subject: [PATCH 5/8] Test cstringUtf8 and encoding of NULL --- tests/builder/Data/ByteString/Builder/Prim/Tests.hs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs index 8c3cc828c..b3428ff26 100644 --- a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs +++ b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs @@ -29,12 +29,17 @@ import TestFramework tests :: [Test] tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8 - , testsCombinatorsB, [testCString] ] + , testsCombinatorsB, [testCString, testCStringUtf8] ] testCString :: Test testCString = testProperty "cstring" $ toLazyByteString (BP.cstring "hello world!"#) == LC.pack "hello world!" +testCStringUtf8 :: Test +testCStringUtf8 = testProperty "cstringUtf8" $ + toLazyByteString (BP.cstringUtf8 "\xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82\x2c\x20\xc0\x80\xd0\xbc\xd0\xb8\xd1\x80\x21"#) == + toLazyByteString (stringUtf8 "Привет, \0мир!") + ------------------------------------------------------------------------------ -- Binary ------------------------------------------------------------------------------ From 270bb48f10eb751600b95c9e22e017d2d31ae189 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Wed, 26 Aug 2020 00:27:54 +0100 Subject: [PATCH 6/8] Really test encoding of NULL --- tests/builder/Data/ByteString/Builder/Prim/Tests.hs | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs index b3428ff26..ce77453d1 100644 --- a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs +++ b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs @@ -33,12 +33,13 @@ tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8 testCString :: Test testCString = testProperty "cstring" $ - toLazyByteString (BP.cstring "hello world!"#) == LC.pack "hello world!" + toLazyByteString (BP.cstring "hello world!"#) == + LC.pack "hello" <> L.singleton 0x20 <> LC.pack "world!" testCStringUtf8 :: Test testCStringUtf8 = testProperty "cstringUtf8" $ - toLazyByteString (BP.cstringUtf8 "\xd0\x9f\xd1\x80\xd0\xb8\xd0\xb2\xd0\xb5\xd1\x82\x2c\x20\xc0\x80\xd0\xbc\xd0\xb8\xd1\x80\x21"#) == - toLazyByteString (stringUtf8 "Привет, \0мир!") + toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) == + LC.pack "hello" <> L.singleton 0x00 <> LC.pack "world!" ------------------------------------------------------------------------------ -- Binary From 5c83a80c2d3d62151c2ee520687962145977c7bd Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Wed, 26 Aug 2020 01:04:26 +0100 Subject: [PATCH 7/8] Fix compatibility with older GHCs --- tests/builder/Data/ByteString/Builder/Prim/Tests.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs index ce77453d1..c68d63b3f 100644 --- a/tests/builder/Data/ByteString/Builder/Prim/Tests.hs +++ b/tests/builder/Data/ByteString/Builder/Prim/Tests.hs @@ -34,12 +34,12 @@ tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8 testCString :: Test testCString = testProperty "cstring" $ toLazyByteString (BP.cstring "hello world!"#) == - LC.pack "hello" <> L.singleton 0x20 <> LC.pack "world!" + LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!" testCStringUtf8 :: Test testCStringUtf8 = testProperty "cstringUtf8" $ toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) == - LC.pack "hello" <> L.singleton 0x00 <> LC.pack "world!" + LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!" ------------------------------------------------------------------------------ -- Binary From 6e29895ff54d5a6004bf6a713355e67e4ecd6c40 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Wed, 26 Aug 2020 01:11:58 +0100 Subject: [PATCH 8/8] Fix encoding of NULL --- Data/ByteString/Builder/Prim.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Data/ByteString/Builder/Prim.hs b/Data/ByteString/Builder/Prim.hs index 18f85c787..6ab93cd50 100644 --- a/Data/ByteString/Builder/Prim.hs +++ b/Data/ByteString/Builder/Prim.hs @@ -724,7 +724,7 @@ cstringUtf8 = IO $ \s -> case writeWord8OffAddr# op0# 0# 0## s of s' -> (# s', () #) let br' = BufferRange (op0 `plusPtr` 1) ope - step (addr `plusAddr#` 1#) k br' + step (addr `plusAddr#` 2#) k br' | otherwise = do IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of s' -> (# s', () #)