Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RFC] Builder: Efficiently handle literal strings #132

Merged
merged 9 commits into from
Aug 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 29 additions & 3 deletions Data/ByteString/Builder.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{-# LANGUAGE CPP, BangPatterns #-}
{-# LANGUAGE CPP, BangPatterns, MagicHash #-}
{-# OPTIONS_GHC -fno-warn-unused-imports -fno-warn-orphans #-}
#if __GLASGOW_HASKELL__ >= 701
{-# LANGUAGE Trustworthy #-}
Expand Down Expand Up @@ -263,6 +263,8 @@ import Data.ByteString.Builder.ASCII
import Data.String (IsString(..))
import System.IO (Handle)
import Foreign
import GHC.Base (unpackCString#, unpackCStringUtf8#,
unpackFoldrCString#, build)

-- HADDOCK only imports
import qualified Data.ByteString as S (concat)
Expand Down Expand Up @@ -432,10 +434,20 @@ char8 :: Char -> Builder
char8 = P.primFixed P.char8

-- | Char8 encode a 'String'.
{-# INLINE string8 #-}
{-# INLINE [1] string8 #-} -- phased to allow P.cstring rewrite
string8 :: String -> Builder
string8 = P.primMapListFixed P.char8

-- GHC desugars string literals with unpackCString# which the simplifier tends
-- to promptly turn into build (unpackFoldrCString# s), so we match on both.
{-# RULES
"string8/unpackCString#" forall s.
string8 (unpackCString# s) = P.cstring s

"string8/unpackFoldrCString#" forall s.
string8 (build (unpackFoldrCString# s)) = P.cstring s
#-}

------------------------------------------------------------------------------
-- UTF-8 encoding
------------------------------------------------------------------------------
Expand All @@ -446,9 +458,23 @@ charUtf8 :: Char -> Builder
charUtf8 = P.primBounded P.charUtf8

-- | UTF-8 encode a 'String'.
{-# INLINE stringUtf8 #-}
--
-- Note that 'stringUtf8' performs no codepoint validation and consequently may
-- emit invalid UTF-8 if asked (e.g. single surrogates).
{-# INLINE [1] stringUtf8 #-} -- phased to allow P.cstring rewrite
stringUtf8 :: String -> Builder
stringUtf8 = P.primMapListBounded P.charUtf8

{-# RULES
"stringUtf8/unpackCStringUtf8#" forall s.
stringUtf8 (unpackCStringUtf8# s) = P.cstringUtf8 s

"stringUtf8/unpackCString#" forall s.
stringUtf8 (unpackCString# s) = P.cstring s
sjakobi marked this conversation as resolved.
Show resolved Hide resolved

"stringUtf8/unpackFoldrCString#" forall s.
stringUtf8 (build (unpackFoldrCString# s)) = P.cstring s
#-}

instance IsString Builder where
fromString = stringUtf8
62 changes: 61 additions & 1 deletion Data/ByteString/Builder/Prim.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
{-# LANGUAGE CPP, BangPatterns, ScopedTypeVariables #-}
{-# LANGUAGE MagicHash, UnboxedTuples, PatternGuards #-}
{-# OPTIONS_GHC -fno-warn-unused-imports #-}
#if __GLASGOW_HASKELL__ == 700
-- This is needed as a workaround for an old bug in GHC 7.0.1 (Trac #4498)
Expand Down Expand Up @@ -438,6 +439,9 @@ module Data.ByteString.Builder.Prim (
-- a decimal number with UTF-8 encoded characters.
, charUtf8

, cstring
, cstringUtf8

{-
-- * Testing support
-- | The following four functions are intended for testing use
Expand Down Expand Up @@ -473,13 +477,16 @@ import Data.ByteString.Builder.Prim.ASCII
#if MIN_VERSION_base(4,4,0)
#if MIN_VERSION_base(4,7,0)
import Foreign
import Foreign.C.Types
#else
import Foreign hiding (unsafeForeignPtrToPtr)
#endif
import Foreign.ForeignPtr.Unsafe (unsafeForeignPtrToPtr)
#else
import Foreign
#endif
import GHC.Exts
import GHC.IO

------------------------------------------------------------------------------
-- Creating Builders from bounded primitives
Expand Down Expand Up @@ -672,6 +679,60 @@ primMapLazyByteStringBounded w =
L.foldrChunks (\x b -> primMapByteStringBounded w x `mappend` b) mempty


------------------------------------------------------------------------------
-- Raw CString encoding
------------------------------------------------------------------------------

#if !MIN_VERSION_base(4,7,0)
-- eqWord# et al. return Bools prior to GHC 7.6
isTrue# :: Bool -> Bool
isTrue# x = x
#endif

-- | A null-terminated ASCII encoded 'CString'. Null characters are not representable.
cstring :: Addr# -> Builder
cstring =
\addr0 -> builder $ step addr0
where
step :: Addr# -> BuildStep r -> BuildStep r
step !addr !k !br@(BufferRange op0@(Ptr op0#) ope)
| isTrue# (ch `eqWord#` 0##) = k br
| op0 == ope =
return $ bufferFull defaultChunkSize op0 (step addr k)
| otherwise = do
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 1#) k br'
where
!ch = indexWord8OffAddr# addr 0#

-- | A null-terminated UTF-8 encoded 'CString'. Null characters can be encoded as
-- @0xc0 0x80@.
cstringUtf8 :: Addr# -> Builder
cstringUtf8 =
\addr0 -> builder $ step addr0
where
step :: Addr# -> BuildStep r -> BuildStep r
step !addr !k !br@(BufferRange op0@(Ptr op0#) ope)
| isTrue# (ch `eqWord#` 0##) = k br
| op0 == ope =
return $ bufferFull defaultChunkSize op0 (step addr k)
-- NULL is encoded as 0xc0 0x80
| isTrue# (ch `eqWord#` 0xc0##)
, isTrue# (indexWord8OffAddr# addr 1# `eqWord#` 0x80##) = do
IO $ \s -> case writeWord8OffAddr# op0# 0# 0## s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 2#) k br'
| otherwise = do
IO $ \s -> case writeWord8OffAddr# op0# 0# ch s of
s' -> (# s', () #)
let br' = BufferRange (op0 `plusPtr` 1) ope
step (addr `plusAddr#` 1#) k br'
where
!ch = indexWord8OffAddr# addr 0#

------------------------------------------------------------------------------
-- Char8 encoding
------------------------------------------------------------------------------
Expand Down Expand Up @@ -736,4 +797,3 @@ encodeCharUtf8 f1 f2 f3 f4 c = case ord c of
x3 = fromIntegral $ ((x `shiftR` 6) .&. 0x3F) + 0x80
x4 = fromIntegral $ (x .&. 0x3F) + 0x80
in f4 x1 x2 x3 x4

8 changes: 8 additions & 0 deletions bench/BenchAll.hs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE PackageImports #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE MagicHash #-}
-- |
-- Copyright : (c) 2011 Simon Meier
-- License : BSD3-style (see LICENSE)
Expand All @@ -14,6 +15,7 @@ module Main (main) where

import Data.Foldable (foldMap)
import Data.Monoid
import Data.String
import Gauge
import Prelude hiding (words)

Expand All @@ -36,6 +38,8 @@ import qualified Blaze.Text as Blaze
import qualified "bytestring" Data.ByteString as OldS
import qualified "bytestring" Data.ByteString.Lazy as OldL

import Paths_bench_bytestring

import Foreign

import System.Random
Expand Down Expand Up @@ -236,6 +240,10 @@ main = do
[ benchB' "mempty" () (const mempty)
, benchB' "ensureFree 8" () (const (ensureFree 8))
, benchB' "intHost 1" 1 intHost
, benchB' "UTF-8 String (naive)" "hello world\0" fromString
, benchB' "UTF-8 String" () $ \() -> P.cstringUtf8 "hello world\0"#
, benchB' "String (naive)" "hello world!" fromString
, benchB' "String" () $ \() -> P.cstring "hello world!"#
]

, bgroup "Encoding wrappers"
Expand Down
15 changes: 13 additions & 2 deletions tests/builder/Data/ByteString/Builder/Prim/Tests.hs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{-# LANGUAGE CPP, ScopedTypeVariables #-}
{-# LANGUAGE CPP, ScopedTypeVariables, MagicHash #-}

-- |
-- Copyright : (c) 2011 Simon Meier
Expand All @@ -14,21 +14,32 @@ module Data.ByteString.Builder.Prim.Tests (tests) where

import Data.Char (ord)
import qualified Data.ByteString.Lazy as L
import qualified Data.ByteString.Lazy.Char8 as LC
import Data.ByteString.Builder
import qualified Data.ByteString.Builder.Prim as BP
import Data.ByteString.Builder.Prim.TestUtils

#if defined(HAVE_TEST_FRAMEWORK)
import Test.Framework
import Test.Framework.Providers.QuickCheck2
#else
import TestFramework
#endif


tests :: [Test]
tests = concat [ testsBinary, testsASCII, testsChar8, testsUtf8
, testsCombinatorsB ]
, testsCombinatorsB, [testCString, testCStringUtf8] ]

testCString :: Test
testCString = testProperty "cstring" $
toLazyByteString (BP.cstring "hello world!"#) ==
LC.pack "hello" `L.append` L.singleton 0x20 `L.append` LC.pack "world!"

testCStringUtf8 :: Test
testCStringUtf8 = testProperty "cstringUtf8" $
toLazyByteString (BP.cstringUtf8 "hello\xc0\x80world!"#) ==
LC.pack "hello" `L.append` L.singleton 0x00 `L.append` LC.pack "world!"

------------------------------------------------------------------------------
-- Binary
Expand Down