From 64782d1cf452afb53640ca8ae04091e8015e3e84 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 7 Aug 2021 23:49:38 +0100 Subject: [PATCH 1/4] Switch Data.ByteString.Short to reuse Data.Array.Byte --- .github/workflows/ci.yml | 5 +- Data/ByteString/Short/Internal.hs | 95 +++++++------------------------ bytestring.cabal | 3 + 3 files changed, 25 insertions(+), 78 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3c93fa520..5724e8849 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -133,10 +133,11 @@ jobs: githubToken: ${{ github.token }} install: | apt-get update -y - apt-get install -y ghc libghc-tasty-quickcheck-dev + apt-get install -y curl ghc libghc-tasty-quickcheck-dev run: | + curl -s https://hackage.haskell.org/package/data-array-byte-0.1/data-array-byte-0.1.tar.gz | tar xz ghc --version - ghc --make -Iinclude -itests:tests/builder -o Main cbits/*.c tests/Main.hs +RTS -s + ghc --make -Iinclude -itests:tests/builder:data-array-byte-0.1 -o Main cbits/*.c tests/Main.hs +RTS -s ./Main +RTS -s bounds-checking: diff --git a/Data/ByteString/Short/Internal.hs b/Data/ByteString/Short/Internal.hs index 8639a3e36..5283a6731 100644 --- a/Data/ByteString/Short/Internal.hs +++ b/Data/ByteString/Short/Internal.hs @@ -1,10 +1,13 @@ {-# LANGUAGE BangPatterns #-} {-# LANGUAGE CPP #-} {-# LANGUAGE DeriveDataTypeable #-} +{-# LANGUAGE DeriveLift #-} {-# LANGUAGE ForeignFunctionInterface #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE MagicHash #-} {-# LANGUAGE MultiWayIf #-} +{-# LANGUAGE PatternSynonyms #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TemplateHaskellQuotes #-} @@ -13,6 +16,7 @@ {-# LANGUAGE UnboxedTuples #-} {-# LANGUAGE UnliftedFFITypes #-} {-# LANGUAGE Unsafe #-} + {-# OPTIONS_HADDOCK not-home #-} {-# OPTIONS_GHC -fno-warn-name-shadowing -fexpose-all-unfoldings #-} @@ -40,7 +44,7 @@ module Data.ByteString.Short.Internal ( -- * The @ShortByteString@ type and representation - ShortByteString(..), + ShortByteString(.., SBS), -- * Introducing and eliminating 'ShortByteString's empty, @@ -162,6 +166,8 @@ import Data.ByteString.Internal , checkedAdd ) +import Data.Array.Byte + ( ByteArray(..) ) import Data.Bits ( FiniteBits (finiteBitSize) , shiftL @@ -172,21 +178,17 @@ import Data.Bits , (.|.) ) import Data.Data - ( Data(..) - , mkNoRepType - ) + ( Data(..) ) import Data.Monoid ( Monoid(..) ) import Data.Semigroup - ( Semigroup((<>)) ) + ( Semigroup ) import Data.String ( IsString(..) ) -import Data.Typeable - ( Typeable ) import Control.Applicative ( pure ) import Control.DeepSeq - ( NFData(..) ) + ( NFData ) import Control.Exception ( assert ) import Control.Monad @@ -269,7 +271,6 @@ import qualified Data.ByteString.Internal as BS import qualified Data.List as List import qualified GHC.Exts -import qualified Language.Haskell.TH.Lib as TH import qualified Language.Haskell.TH.Syntax as TH -- | A compact representation of a 'Word8' vector. @@ -279,54 +280,17 @@ import qualified Language.Haskell.TH.Syntax as TH -- 'ByteString' (at the cost of copying the string data). It supports very few -- other operations. -- -data ShortByteString = SBS ByteArray# - deriving Typeable - --- | @since 0.11.2.0 -instance TH.Lift ShortByteString where -#if MIN_VERSION_template_haskell(2,16,0) - lift sbs = [| unsafePackLenLiteral |] - `TH.appE` TH.litE (TH.integerL (fromIntegral len)) - `TH.appE` TH.litE (TH.BytesPrimL $ TH.Bytes ptr 0 (fromIntegral len)) - where - BS ptr len = fromShort sbs -#else - lift sbs = [| unsafePackLenLiteral |] - `TH.appE` TH.litE (TH.integerL (fromIntegral len)) - `TH.appE` TH.litE (TH.StringPrimL $ BS.unpackBytes bs) - where - bs@(BS _ len) = fromShort sbs -#endif - -#if MIN_VERSION_template_haskell(2,17,0) - liftTyped = TH.unsafeCodeCoerce . TH.lift -#elif MIN_VERSION_template_haskell(2,16,0) - liftTyped = TH.unsafeTExpCoerce . TH.lift -#endif - --- The ByteArray# representation is always word sized and aligned but with a --- known byte length. Our representation choice for ShortByteString is to leave --- the 0--3 trailing bytes undefined. This means we can use word-sized writes, --- but we have to be careful with reads, see equateBytes and compareBytes below. - +newtype ShortByteString = ShortByteString { unShortByteString :: ByteArray } + deriving (Eq, Semigroup, Monoid, TH.Lift, Data, NFData) -instance Eq ShortByteString where - (==) = equateBytes +pattern SBS :: ByteArray# -> ShortByteString +pattern SBS x = ShortByteString (ByteArray x) +{-# COMPLETE SBS #-} +-- | Lexicographic order. instance Ord ShortByteString where compare = compareBytes -instance Semigroup ShortByteString where - (<>) = append - -instance Monoid ShortByteString where - mempty = empty - mappend = (<>) - mconcat = concat - -instance NFData ShortByteString where - rnf SBS{} = () - instance Show ShortByteString where showsPrec p ps r = showsPrec p (unpackChars ps) r @@ -336,20 +300,15 @@ instance Read ShortByteString where -- | @since 0.10.12.0 instance GHC.Exts.IsList ShortByteString where type Item ShortByteString = Word8 - fromList = packBytes - toList = unpack + fromList = ShortByteString . GHC.Exts.fromList + fromListN = (ShortByteString .) . GHC.Exts.fromListN + toList = GHC.Exts.toList . unShortByteString -- | Beware: 'fromString' truncates multi-byte characters to octets. -- e.g. "枯朶に烏のとまりけり秋の暮" becomes �6k�nh~�Q��n� instance IsString ShortByteString where fromString = packChars -instance Data ShortByteString where - gfoldl f z txt = z packBytes `f` unpack txt - toConstr _ = error "Data.ByteString.Short.ShortByteString.toConstr" - gunfold _ _ = error "Data.ByteString.Short.ShortByteString.gunfold" - dataTypeOf _ = mkNoRepType "Data.ByteString.Short.ShortByteString" - ------------------------------------------------------------------------ -- Simple operations @@ -402,12 +361,6 @@ indexError sbs i = moduleError "index" $ "error in array index: " ++ show i ++ " not in range [0.." ++ show (length sbs) ++ "]" --- | @since 0.11.2.0 -unsafePackLenLiteral :: Int -> Addr# -> ShortByteString -unsafePackLenLiteral len addr# = - -- createFromPtr allocates, so accursedUnutterablePerformIO is wrong - unsafeDupablePerformIO $ createFromPtr (Ptr addr#) len - ------------------------------------------------------------------------ -- Internal utils @@ -642,13 +595,6 @@ unpackAppendBytesStrict !sbs off len = go (off-1) (off-1 + len) ------------------------------------------------------------------------ -- Eq and Ord implementations -equateBytes :: ShortByteString -> ShortByteString -> Bool -equateBytes sbs1 sbs2 = - let !len1 = length sbs1 - !len2 = length sbs2 - in len1 == len2 - && 0 == compareByteArrays (asBA sbs1) (asBA sbs2) len1 - compareBytes :: ShortByteString -> ShortByteString -> Ordering compareBytes sbs1 sbs2 = let !len1 = length sbs1 @@ -661,7 +607,6 @@ compareBytes sbs1 sbs2 = | len2 < len1 -> GT | otherwise -> EQ - ------------------------------------------------------------------------ -- Appending and concatenation @@ -1597,8 +1542,6 @@ findIndices k = \sbs -> | otherwise = go (n + 1) in go 0 - - ------------------------------------------------------------------------ -- Exported low level operations diff --git a/bytestring.cabal b/bytestring.cabal index fd5656fc4..bd47cfc9a 100644 --- a/bytestring.cabal +++ b/bytestring.cabal @@ -74,6 +74,9 @@ source-repository head library build-depends: base >= 4.9 && < 5, ghc-prim, deepseq, template-haskell + if impl(ghc < 9.4) + build-depends: data-array-byte >= 0.1 && < 0.2 + exposed-modules: Data.ByteString Data.ByteString.Char8 Data.ByteString.Unsafe From 3cecd12bf877d0b15f59dc682e88e54e85192c57 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Fri, 7 Oct 2022 22:42:48 +0100 Subject: [PATCH 2/4] Improve documentation --- Data/ByteString/Short/Internal.hs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Data/ByteString/Short/Internal.hs b/Data/ByteString/Short/Internal.hs index 5283a6731..5cd5b85f9 100644 --- a/Data/ByteString/Short/Internal.hs +++ b/Data/ByteString/Short/Internal.hs @@ -280,9 +280,16 @@ import qualified Language.Haskell.TH.Syntax as TH -- 'ByteString' (at the cost of copying the string data). It supports very few -- other operations. -- -newtype ShortByteString = ShortByteString { unShortByteString :: ByteArray } +newtype ShortByteString = + -- | @since 0.12.0.0 + ShortByteString + { unShortByteString :: ByteArray + -- ^ @since 0.12.0.0 + } deriving (Eq, Semigroup, Monoid, TH.Lift, Data, NFData) +-- | Prior to @bytestring-0.12@ 'SBS' was a genuine constructor of 'ShortByteString', +-- but now it is a bundled pattern synonym, provided as a compatibility shim. pattern SBS :: ByteArray# -> ShortByteString pattern SBS x = ShortByteString (ByteArray x) {-# COMPLETE SBS #-} From bef91621bff2de70495496cbb8ea5723012954c2 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 8 Oct 2022 00:11:04 +0100 Subject: [PATCH 3/4] Update changelog --- Changelog.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Changelog.md b/Changelog.md index fd24dbe85..78e3b7a66 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,8 @@ * [`readInt` returns `Nothing`, if the sequence of digits cannot be represented by an `Int`, instead of overflowing silently](https://github.com/haskell/bytestring/pull/309) * [Remove `zipWith` rewrite rule](https://github.com/haskell/bytestring/pull/387) * [Export `unsafeIndex` for ShortByteString which had been accidentally removed in v0.11.3.0](https://github.com/haskell/bytestring/pull/532) +* [`ShortByteString` is now a wrapper over boxed `Data.Array.Byte.ByteArray` instead of unboxed `ByteArray#` directly](https://github.com/haskell/bytestring/pull/410) +* [`fromListN` from `instance IsList ShortByteString` throws an exception if the first argument does not match the length of the second instead of silent ignore](https://github.com/haskell/bytestring/pull/410) [0.12.0.0]: https://github.com/haskell/bytestring/compare/0.11.3.0...0.12.0.0 From 5a8d7189d4f1493724eadb6c33c64d02b8e35be8 Mon Sep 17 00:00:00 2001 From: Bodigrim Date: Sat, 8 Oct 2022 14:05:32 +0100 Subject: [PATCH 4/4] Restore our own instances for Semigroup and Monoid, they are safer w.r.t. overflows --- Data/ByteString/Short/Internal.hs | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/Data/ByteString/Short/Internal.hs b/Data/ByteString/Short/Internal.hs index 5cd5b85f9..c29582e11 100644 --- a/Data/ByteString/Short/Internal.hs +++ b/Data/ByteString/Short/Internal.hs @@ -182,7 +182,7 @@ import Data.Data import Data.Monoid ( Monoid(..) ) import Data.Semigroup - ( Semigroup ) + ( Semigroup((<>)) ) import Data.String ( IsString(..) ) import Control.Applicative @@ -286,7 +286,7 @@ newtype ShortByteString = { unShortByteString :: ByteArray -- ^ @since 0.12.0.0 } - deriving (Eq, Semigroup, Monoid, TH.Lift, Data, NFData) + deriving (Eq, TH.Lift, Data, NFData) -- | Prior to @bytestring-0.12@ 'SBS' was a genuine constructor of 'ShortByteString', -- but now it is a bundled pattern synonym, provided as a compatibility shim. @@ -298,6 +298,21 @@ pattern SBS x = ShortByteString (ByteArray x) instance Ord ShortByteString where compare = compareBytes +-- Instead of deriving Semigroup / Monoid , we stick to our own implementations +-- of mappend / mconcat, because they are safer with regards to overflows +-- (see prop_32bitOverflow_Short_mconcat test). +-- ByteArray is likely to catch up starting from GHC 9.6: +-- * https://gitlab.haskell.org/ghc/ghc/-/merge_requests/8272 +-- * https://gitlab.haskell.org/ghc/ghc/-/merge_requests/9128 + +instance Semigroup ShortByteString where + (<>) = append + +instance Monoid ShortByteString where + mempty = empty + mappend = (<>) + mconcat = concat + instance Show ShortByteString where showsPrec p ps r = showsPrec p (unpackChars ps) r