From 687ac2e4ca123e50968194e60b05bec5580f1d67 Mon Sep 17 00:00:00 2001 From: Viktor Dukhovni Date: Thu, 11 Nov 2021 20:51:39 -0500 Subject: [PATCH] New sized and unsigned variants of reaInt/Integer Some applications want to read either unsigned or explicitly 64-bit integers (e.g. warp). Provide all the missing overflow-checked interfaces. * readInt8, readInt16, readInt32, readInt64 * readWord, readWord8, readWord16, readWord32, readWord64 * readNatural Cleaned up the code and improved tests. Uses Word as the accumular for all types other than Int64 and Word64, which use Word64. When words are 64 bit uses base 10^19 rather than 10^9 when assembling Natural and Integer values. --- Changelog.md | 1 + Data/ByteString/Char8.hs | 146 ++---------------- Data/ByteString/Internal.hs | 24 +-- Data/ByteString/Lazy/Char8.hs | 183 +++------------------- Data/ByteString/Lazy/ReadInt.hs | 264 ++++++++++++++++++++++++++++++++ Data/ByteString/Lazy/ReadNat.hs | 258 +++++++++++++++++++++++++++++++ Data/ByteString/ReadInt.hs | 3 + Data/ByteString/ReadNat.hs | 3 + bench/BenchAll.hs | 2 + bench/BenchReadInt.hs | 144 +++++++++++++++++ bytestring.cabal | 5 + tests/Properties.hs | 197 ++++++++++++++++-------- tests/Properties/ByteString.hs | 131 ++++++++++++++-- 13 files changed, 978 insertions(+), 383 deletions(-) create mode 100644 Data/ByteString/Lazy/ReadInt.hs create mode 100644 Data/ByteString/Lazy/ReadNat.hs create mode 100644 Data/ByteString/ReadInt.hs create mode 100644 Data/ByteString/ReadNat.hs create mode 100644 bench/BenchReadInt.hs diff --git a/Changelog.md b/Changelog.md index b2d82edb8..5d8a3d7f0 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,5 +1,6 @@ [0.12.0.0] — Unreleased +* [New sized and/or unsigned variants of `readInt` and `readInteger`](https://github.com/haskell/bytestring/pull/438) * [`readInt` returns `Nothing`, if the sequence of digits cannot be represented by an `Int`, instead of overflowing silently](https://github.com/haskell/bytestring/pull/309) * [Remove `zipWith` rewrite rule](https://github.com/haskell/bytestring/pull/387) diff --git a/Data/ByteString/Char8.hs b/Data/ByteString/Char8.hs index 9e1bd71bc..f4cfc3285 100644 --- a/Data/ByteString/Char8.hs +++ b/Data/ByteString/Char8.hs @@ -185,7 +185,19 @@ module Data.ByteString.Char8 ( -- * Reading from ByteStrings readInt, + readInt64, + readInt32, + readInt16, + readInt8, + + readWord, + readWord64, + readWord32, + readWord16, + readWord8, + readInteger, + readNatural, -- * Low level CString conversions @@ -264,6 +276,8 @@ import Data.ByteString (empty,null,length,tail,init,append ) import Data.ByteString.Internal +import Data.ByteString.ReadInt +import Data.ByteString.ReadNat import Data.Char ( isSpace ) -- See bytestring #70 @@ -843,14 +857,6 @@ unzip :: [(Char,Char)] -> (ByteString,ByteString) unzip ls = (pack (P.map fst ls), pack (P.map snd ls)) {-# INLINE unzip #-} --- | A variety of 'head' for non-empty ByteStrings. 'unsafeHead' omits --- the check for the empty case, which is good for performance, but --- there is an obligation on the programmer to provide a proof that the --- ByteString is non-empty. -unsafeHead :: ByteString -> Char -unsafeHead = w2c . B.unsafeHead -{-# INLINE unsafeHead #-} - -- --------------------------------------------------------------------- -- Things that depend on the encoding @@ -977,130 +983,6 @@ unwords :: [ByteString] -> ByteString unwords = intercalate (singleton ' ') {-# INLINE unwords #-} --- --------------------------------------------------------------------- --- Reading from ByteStrings - --- | Try to read an 'Int' value from the 'ByteString', returning @Just (val, --- str)@ on success, where @val@ is the value read and @str@ is the rest of the --- input string. If the sequence of digits decodes to a value larger than can --- be represented by an 'Int', the returned value will be 'Nothing'. --- --- 'readInt' does not ignore leading whitespace, the value must start --- immediately at the beginning of the input stream. --- --- ==== __Examples__ --- >>> readInt "-1729 = (-10)^3 + (-9)^3 = (-12)^3 + (-1)^3" --- Just (-1729," = (-10)^3 + (-9)^3 = (-12)^3 + (-1)^3") --- >>> readInt "not a decimal number") --- Nothing --- >>> readInt "12345678901234567890 overflows maxBound") --- Nothing --- >>> readInt "-12345678901234567890 underflows minBound") --- Nothing --- -readInt :: ByteString -> Maybe (Int, ByteString) -{-# INLINABLE readInt #-} -readInt bs = case B.uncons bs of - Just (w, rest) | w - 0x30 <= 9 -> readDec True bs -- starts with digit - | w == 0x2d -> readDec False rest -- starts with minus - | w == 0x2b -> readDec True rest -- starts with plus - _ -> Nothing -- not signed decimal - where - -- | Read a decimal 'Int' without overflow. The caller has already - -- read any explicit sign (setting @positive@ to 'False' as needed). - -- Here we just deal with the digits. - {-# INLINE readDec #-} - readDec !positive (B.BS fp len) = B.accursedUnutterablePerformIO $ - unsafeWithForeignPtr fp $ \ptr -> do - let end = ptr `plusPtr` len - (!n, !a, !inRange) <- if positive - then digits intmaxQuot10 intmaxRem10 end ptr 0 0 - else digits intminQuot10 intminRem10 end ptr 0 0 - if inRange - then if n < len - then let rest = B.BS (fp `B.plusForeignPtr` n) (len - n) - in return $! result n a rest - else return $! result n a B.empty - else return Nothing - where - -- | Process as many digits as we can, returning the additional - -- number of digits found, the final accumulator, and whether - -- the input decimal did not overflow prior to processing all - -- the provided digits (end of input or non-digit encountered). - digits !maxq !maxr !e !ptr = go ptr - where - go :: Ptr Word8 -> Int -> Word -> IO (Int, Word, Bool) - go !p !b !a | p == e = return (b, a, True) - go !p !b !a = do - !w <- fromIntegral <$> peek p - let !d = w - 0x30 - if d > 9 -- No more digits - then return (b, a, True) - else if a < maxq -- Look for more - then go (p `plusPtr` 1) (b + 1) (a * 10 + d) - else if a > maxq -- overflow - then return (b, a, False) - else if d <= maxr -- Ideally this will be the last digit - then go (p `plusPtr` 1) (b + 1) (a * 10 + d) - else return (b, a, False) -- overflow - - -- | Plausible success, provided we got at least one digit! - result !nbytes !acc str - | nbytes > 0 = let !i = w2int acc in Just (i, str) - | otherwise = Nothing - - -- This assumes that @negate . fromIntegral@ correctly produces - -- @minBound :: Int@ when given its positive 'Word' value as an - -- input. This is true in both 2s-complement and 1s-complement - -- arithmetic, so seems like a safe bet. Tests cover this case, - -- though the CI may not run on sufficiently exotic CPUs. - w2int !n | positive = fromIntegral n - | otherwise = negate $! fromIntegral n - --- | readInteger reads an Integer from the beginning of the ByteString. If --- there is no integer at the beginning of the string, it returns Nothing, --- otherwise it just returns the int read, and the rest of the string. -readInteger :: ByteString -> Maybe (Integer, ByteString) -readInteger as - | null as = Nothing - | otherwise = - case unsafeHead as of - '-' -> first (B.unsafeTail as) >>= \(n, bs) -> return (-n, bs) - '+' -> first (B.unsafeTail as) - _ -> first as - - where first ps | null ps = Nothing - | otherwise = - case B.unsafeHead ps of - w | w >= 0x30 && w <= 0x39 -> Just $ - loop 1 (fromIntegral w - 0x30) [] (B.unsafeTail ps) - | otherwise -> Nothing - - loop :: Int -> Int -> [Integer] - -> ByteString -> (Integer, ByteString) - loop !d !acc ns !ps - | null ps = combine d acc ns empty - | otherwise = - case B.unsafeHead ps of - w | w >= 0x30 && w <= 0x39 -> - if d == 9 then loop 1 (fromIntegral w - 0x30) - (toInteger acc : ns) - (B.unsafeTail ps) - else loop (d+1) - (10*acc + (fromIntegral w - 0x30)) - ns (B.unsafeTail ps) - | otherwise -> combine d acc ns ps - - combine _ acc [] ps = (toInteger acc, ps) - combine d acc ns ps = - (10^d * combine1 1000000000 ns + toInteger acc, ps) - - combine1 _ [n] = n - combine1 b ns = combine1 (b*b) $ combine2 b ns - - combine2 b (n:m:ns) = let !t = m*b + n in t : combine2 b ns - combine2 _ ns = ns - ------------------------------------------------------------------------ -- For non-binary text processing: diff --git a/Data/ByteString/Internal.hs b/Data/ByteString/Internal.hs index 0efb1b7af..eb2d3a7a0 100644 --- a/Data/ByteString/Internal.hs +++ b/Data/ByteString/Internal.hs @@ -92,13 +92,7 @@ module Data.ByteString.Internal ( -- * Exported compatibility shim plusForeignPtr, - unsafeWithForeignPtr, - - -- * Internal constants - intmaxQuot10, - intmaxRem10, - intminQuot10, - intminRem10 + unsafeWithForeignPtr ) where import Prelude hiding (concat, null) @@ -786,22 +780,6 @@ isSpaceChar8 = isSpaceWord8 . c2w overflowError :: String -> a overflowError fun = error $ "Data.ByteString." ++ fun ++ ": size overflow" --- | Bounds for Word# multiplication by 10 without overflow, and --- absolute values of Int bounds. -intmaxWord, intminWord, intmaxQuot10, intmaxRem10, intminQuot10, intminRem10 :: Word -intmaxWord = fromIntegral (maxBound :: Int) -{-# INLINE intmaxWord #-} -intminWord = fromIntegral (negate (minBound :: Int)) -{-# INLINE intminWord #-} -intmaxQuot10 = intmaxWord `quot` 10 -{-# INLINE intmaxQuot10 #-} -intmaxRem10 = intmaxWord `rem` 10 -{-# INLINE intmaxRem10 #-} -intminQuot10 = intminWord `quot` 10 -{-# INLINE intminQuot10 #-} -intminRem10 = intminWord `rem` 10 -{-# INLINE intminRem10 #-} - ------------------------------------------------------------------------ -- | This \"function\" has a superficial similarity to 'System.IO.Unsafe.unsafePerformIO' but diff --git a/Data/ByteString/Lazy/Char8.hs b/Data/ByteString/Lazy/Char8.hs index 97530e51b..c6f0d6a18 100644 --- a/Data/ByteString/Lazy/Char8.hs +++ b/Data/ByteString/Lazy/Char8.hs @@ -1,6 +1,7 @@ {-# LANGUAGE BangPatterns #-} -{-# OPTIONS_HADDOCK prune #-} {-# LANGUAGE Trustworthy #-} +{-# LANGUAGE TypeApplications #-} +{-# OPTIONS_HADDOCK prune #-} -- | -- Module : Data.ByteString.Lazy.Char8 @@ -177,8 +178,28 @@ module Data.ByteString.Lazy.Char8 ( copy, -- * Reading from ByteStrings + -- | Note that a lazy 'ByteString' may hold an unbounded stream of + -- @\'0\'@ digits, in which case the functions below may never return. + -- If that's a concern, you can use 'take' to first truncate the input + -- to an acceptable length. Non-termination is also possible when + -- reading arbitrary precision numbers via 'readInteger' or + -- 'readNatural', if the input is an unbounded stream of arbitrary + -- decimal digits. + -- readInt, + readInt64, + readInt32, + readInt16, + readInt8, + + readWord, + readWord64, + readWord32, + readWord16, + readWord8, + readInteger, + readNatural, -- * I\/O with 'ByteString's -- | ByteString I/O uses binary mode, without any character decoding @@ -223,19 +244,15 @@ import Data.ByteString.Lazy import qualified Data.ByteString.Lazy as L import qualified Data.ByteString as S (ByteString) -- typename only import qualified Data.ByteString as B -import qualified Data.ByteString.Internal as BI import qualified Data.ByteString.Unsafe as B import Data.ByteString.Lazy.Internal +import Data.ByteString.Lazy.ReadInt +import Data.ByteString.Lazy.ReadNat -import Data.ByteString.Internal (c2w,w2c,isSpaceWord8 - ,intmaxQuot10,intmaxRem10 - ,intminQuot10,intminRem10) +import Data.ByteString.Internal (c2w,w2c,isSpaceWord8) import Data.Int (Int64) -import Data.Word import qualified Data.List as List -import Foreign.Ptr (Ptr, plusPtr) -import Foreign.Storable (peek) import Prelude hiding (reverse,head,tail,last,init,null,length,map,lines,foldl,foldr,unlines @@ -912,156 +929,6 @@ unwords :: [ByteString] -> ByteString unwords = intercalate (singleton ' ') {-# INLINE unwords #-} --- | Try to read an 'Int' value from the 'ByteString', returning @Just (val, --- str)@ on success, where @val@ is the value read and @str@ is the rest of the --- input string. If the sequence of digits decodes to a value larger than can --- be represented by an 'Int', the returned value will be 'Nothing'. --- --- Note that a lazy 'ByteString' may, after an optional plus or minus sign, --- consist of an unbounded stream of @0@ digits, in which case 'readInt' --- would diverge (never return). If that's a concern, you can use 'take' to --- obtain a bounded initial segment to pass to 'readInt' instead. --- --- 'readInt' does not ignore leading whitespace, the value must start --- immediately at the beginning of the input stream. --- --- ==== __Examples__ --- >>> readInt "-1729 = (-10)^3 + (-9)^3 = (-12)^3 + (-1)^3" --- Just (-1729," = (-10)^3 + (-9)^3 = (-12)^3 + (-1)^3") --- >>> readInt "not a decimal number") --- Nothing --- >>> readInt "12345678901234567890 overflows maxBound") --- Nothing --- >>> readInt "-12345678901234567890 underflows minBound") --- Nothing --- -readInt :: ByteString -> Maybe (Int, ByteString) -{-# INLINABLE readInt #-} -readInt bs = case L.uncons bs of - Just (w, rest) | w - 0x30 <= 9 -> readDec True bs -- starts with digit - | w == 0x2d -> readDec False rest -- starts with minus - | w == 0x2b -> readDec True rest -- starts with plus - _ -> Nothing -- not signed decimal - where - - -- | Read a decimal 'Int' without overflow. The caller has already - -- read any explicit sign (setting @positive@ to 'False' as needed). - -- Here we just deal with the digits. - {-# INLINE readDec #-} - readDec !positive = loop 0 0 - where - loop !nbytes !acc = \ str -> case str of - Empty -> result nbytes acc str - Chunk c cs -> case B.length c of - 0 -> loop nbytes acc cs -- skip empty segment - l -> case accumWord acc c of - (0, !_, !inrange) -- no more digits or overflow - | inrange -> result nbytes acc str - | otherwise -> Nothing - (!n, !a, !inrange) - | not inrange -> Nothing - | n < l -- input not entirely digits - -> result (nbytes + n) a $ Chunk (B.drop n c) cs - | otherwise - -- read more digits from the remaining chunks - -> loop (nbytes + n) a cs - - -- | Process as many digits as we can, returning the additional - -- number of digits found, the updated accumulator, and whether - -- the input decimal did not overflow prior to processing all - -- the provided digits (end of input or non-digit encountered). - accumWord acc (BI.BS fp len) = - BI.accursedUnutterablePerformIO $ - BI.unsafeWithForeignPtr fp $ \ptr -> do - let end = ptr `plusPtr` len - x@(!_, !_, !_) <- if positive - then digits intmaxQuot10 intmaxRem10 end ptr 0 acc - else digits intminQuot10 intminRem10 end ptr 0 acc - return x - where - digits !maxq !maxr !e !ptr = go ptr - where - go :: Ptr Word8 -> Int -> Word -> IO (Int, Word, Bool) - go !p !b !a | p == e = return (b, a, True) - go !p !b !a = do - !w <- fromIntegral <$> peek p - let !d = w - 0x30 - if d > 9 -- No more digits - then return (b, a, True) - else if a < maxq -- Look for more - then go (p `plusPtr` 1) (b + 1) (a * 10 + d) - else if a > maxq -- overflow - then return (b, a, False) - else if d <= maxr -- Ideally this will be the last digit - then go (p `plusPtr` 1) (b + 1) (a * 10 + d) - else return (b, a, False) -- overflow - - -- | Plausible success, provided we got at least one digit! - result !nbytes !acc str - | nbytes > 0 = let !i = w2int acc in Just (i, str) - | otherwise = Nothing - - -- This assumes that @negate . fromIntegral@ correctly produces - -- @minBound :: Int@ when given its positive 'Word' value as an - -- input. This is true in both 2s-complement and 1s-complement - -- arithmetic, so seems like a safe bet. Tests cover this case, - -- though the CI may not run on sufficiently exotic CPUs. - w2int !n | positive = fromIntegral n - | otherwise = negate $! fromIntegral n - --- | readInteger reads an Integer from the beginning of the ByteString. If --- there is no integer at the beginning of the string, it returns Nothing, --- otherwise it just returns the int read, and the rest of the string. -readInteger :: ByteString -> Maybe (Integer, ByteString) -readInteger Empty = Nothing -readInteger (Chunk c0 cs0) = - case w2c (B.unsafeHead c0) of - '-' -> first (B.unsafeTail c0) cs0 >>= \(n, cs') -> return (-n, cs') - '+' -> first (B.unsafeTail c0) cs0 - _ -> first c0 cs0 - - where first c cs - | B.null c = case cs of - Empty -> Nothing - (Chunk c' cs') -> first' c' cs' - | otherwise = first' c cs - - first' c cs = case B.unsafeHead c of - w | w >= 0x30 && w <= 0x39 -> Just $ - loop 1 (fromIntegral w - 0x30) [] (B.unsafeTail c) cs - | otherwise -> Nothing - - loop :: Int -> Int -> [Integer] - -> S.ByteString -> ByteString -> (Integer, ByteString) - loop !d !acc ns !c cs - | B.null c = case cs of - Empty -> combine d acc ns c cs - (Chunk c' cs') -> loop d acc ns c' cs' - | otherwise = - case B.unsafeHead c of - w | w >= 0x30 && w <= 0x39 -> - if d < 9 then loop (d+1) - (10*acc + (fromIntegral w - 0x30)) - ns (B.unsafeTail c) cs - else loop 1 (fromIntegral w - 0x30) - (fromIntegral acc : ns) - (B.unsafeTail c) cs - | otherwise -> combine d acc ns c cs - - combine _ acc [] c cs = end (fromIntegral acc) c cs - combine d acc ns c cs = - end (10^d * combine1 1000000000 ns + fromIntegral acc) c cs - - combine1 _ [n] = n - combine1 b ns = combine1 (b*b) $ combine2 b ns - - combine2 b (n:m:ns) = let !t = n+m*b in t : combine2 b ns - combine2 _ ns = ns - - end n c cs = let !c' = chunk c cs - in (n, c') - - -- | Write a ByteString to a handle, appending a newline byte -- hPutStrLn :: Handle -> ByteString -> IO () diff --git a/Data/ByteString/Lazy/ReadInt.hs b/Data/ByteString/Lazy/ReadInt.hs new file mode 100644 index 000000000..65ef87382 --- /dev/null +++ b/Data/ByteString/Lazy/ReadInt.hs @@ -0,0 +1,264 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE MultiWayIf #-} +{-# LANGUAGE PatternSynonyms #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} + +-- This file is also included by "Data.ByteString.ReadInt", after defining +-- "BYTESTRING_STRICT". The two modules share much of their code, but +-- the lazy version adds an outer loop over the chunks. + +#ifdef BYTESTRING_STRICT +module Data.ByteString.ReadInt +#else +module Data.ByteString.Lazy.ReadInt +#endif + ( readInt + , readInt8 + , readInt16 + , readInt32 + , readWord + , readWord8 + , readWord16 + , readWord32 + , readInt64 + , readWord64 + ) where + +import qualified Data.ByteString.Internal as BI +#ifdef BYTESTRING_STRICT +import Data.ByteString +#else +import Data.ByteString.Lazy +import Data.ByteString.Lazy.Internal +#endif +import Data.Bits (FiniteBits, isSigned) +import Data.ByteString.Internal (pattern BS, plusForeignPtr) +import Data.Int +import Data.Word +import Foreign.ForeignPtr (ForeignPtr) +import Foreign.Ptr (minusPtr, plusPtr) +import Foreign.Storable (Storable(..)) + +----- Public API + +-- | Try to read a signed 'Int' value from the 'ByteString', returning +-- @Just (val, str)@ on success, where @val@ is the value read and @str@ is the +-- rest of the input string. If the sequence of digits decodes to a value +-- larger than can be represented by an 'Int', the returned value will be +-- 'Nothing'. +-- +-- 'readInt' does not ignore leading whitespace, the value must start +-- immediately at the beginning of the input string. +-- +-- ==== __Examples__ +-- >>> readInt "-1729 sum of cubes" +-- Just (-1729," sum of cubes") +-- >>> readInt "+1: readInt also accepts a leading '+'" +-- Just (1, ": readInt also accepts a leading '+'") +-- >>> readInt "not a decimal number") +-- Nothing +-- >>> readInt "12345678901234567890 overflows maxBound" +-- Nothing +-- >>> readInt "-12345678901234567890 underflows minBound" +-- Nothing +-- +readInt :: ByteString -> Maybe (Int, ByteString) +readInt = _read + +-- | A variant of 'readInt' specialised to 'Int32'. +readInt32 :: ByteString -> Maybe (Int32, ByteString) +readInt32 = _read + +-- | A variant of 'readInt' specialised to 'Int16'. +readInt16 :: ByteString -> Maybe (Int16, ByteString) +readInt16 = _read + +-- | A variant of 'readInt' specialised to 'Int8'. +readInt8 :: ByteString -> Maybe (Int8, ByteString) +readInt8 = _read + +-- | Try to read a 'Word' value from the 'ByteString', returning +-- @Just (val, str)@ on success, where @val@ is the value read and @str@ is the +-- rest of the input string. If the sequence of digits decodes to a value +-- larger than can be represented by a 'Word', the returned value will be +-- 'Nothing'. +-- +-- 'readWord' does not ignore leading whitespace, the value must start with a +-- decimal digit immediately at the beginning of the input string. Leading @+@ +-- signs are not accepted. +-- +-- ==== __Examples__ +-- >>> readWord "1729 sum of cubes" +-- Just (1729," sum of cubes") +-- >>> readWord "+1729 has an explicit sign" +-- Nothing +-- >>> readWord "not a decimal number" +-- Nothing +-- >>> readWord "98765432109876543210 overflows maxBound" +-- Nothing +-- +readWord :: ByteString -> Maybe (Word, ByteString) +readWord = _read + +-- | A variant of 'readWord' specialised to 'Word32'. +readWord32 :: ByteString -> Maybe (Word32, ByteString) +readWord32 = _read + +-- | A variant of 'readWord' specialised to 'Word16'. +readWord16 :: ByteString -> Maybe (Word16, ByteString) +readWord16 = _read + +-- | A variant of 'readWord' specialised to 'Word8'. +readWord8 :: ByteString -> Maybe (Word8, ByteString) +readWord8 = _read + +-- | A variant of 'readInt' specialised to 'Int64'. +readInt64 :: ByteString -> Maybe (Int64, ByteString) +readInt64 = _read + +-- | A variant of 'readWord' specialised to 'Word64'. +readWord64 :: ByteString -> Maybe (Word64, ByteString) +readWord64 = _read + +-- | Polymorphic Int*/Word* reader +_read :: forall a. (Integral a, FiniteBits a, Bounded a) + => ByteString -> Maybe (a, ByteString) +{-# INLINE _read #-} +_read + | isSigned @a 0 + = \ bs -> signed bs >>= \ (r, s, d1) -> _readDecimal r s d1 + | otherwise + -- When the input is @16^n-1@, as is the case with 'maxBound' for + -- all the Word* types, the last decimal digit of 'maxBound' is 5. + = \ bs -> unsigned 5 bs >>= \ (r, s, d1) -> _readDecimal r s d1 + where + -- Returns: + -- * Mod 10 min/max bound remainder + -- * 2nd and later digits + -- * 1st digit + -- + -- When the input is @8*16^n-1@, as is the case with 'maxBound' for + -- all the Int* types, the last decimal digit of 'maxBound' is 7. + -- + signed :: ByteString -> Maybe (Word64, ByteString, Word64) + signed bs = do + (w, s) <- uncons bs + let d1 = fromDigit w + if | d1 <= 9 -> Just (7, s, d1) -- leading digit + | w == 0x2d -> unsigned 8 s -- minus sign + | w == 0x2b -> unsigned 7 s -- plus sign + | otherwise -> Nothing -- not a number + + unsigned :: Word64 -> ByteString -> Maybe (Word64, ByteString, Word64) + unsigned r bs = do + (w, s) <- uncons bs + let d1 = fromDigit w + if | d1 <= 9 -> Just (r, s, d1) -- leading digit + | otherwise -> Nothing -- not a number + +----- Fixed-width unsigned reader + +-- | Intermediate result from scanning a chunk, final output is +-- converted to the requested type once all chunks are processed. +-- +data Result = Overflow + | Result !Int -- number of bytes (digits) read + !Word64 -- accumulator value + +_readDecimal :: forall a. (Integral a, Bounded a) + => Word64 -- ^ abs(maxBound/minBound) `mod` 10 + -> ByteString -- ^ Input string + -> Word64 -- ^ First digit value + -> Maybe (a, ByteString) +{-# INLINE _readDecimal #-} +_readDecimal !r = consume + where + consume :: ByteString -> Word64 -> Maybe (a, ByteString) +#ifdef BYTESTRING_STRICT + consume (BS fp len) a = case _digits q r fp len a of + Result used acc + | used == len + -> convert acc empty + | otherwise + -> convert acc $ BS (fp `plusForeignPtr` used) (len - used) + _ -> Nothing +#else + -- All done + consume Empty acc = convert acc Empty + -- Process next chunk + consume (Chunk (BS fp len) cs) acc + = case _digits q r fp len acc of + Result used acc' + | used == len + -- process remaining chunks + -> consume cs acc' + | otherwise + -- ran into a non-digit + -> convert acc' $ + Chunk (BS (fp `plusForeignPtr` used) (len - used)) cs + _ -> Nothing +#endif + convert :: Word64 -> ByteString -> Maybe (a, ByteString) + convert !acc rest = + let !i = case r of + -- minBound @Int* `mod` 10 == 8 + 8 -> negate $ fromIntegral @Word64 @a acc + _ -> fromIntegral @Word64 @a acc + in Just (i, rest) + + -- The quotient of 'maxBound' divided by 10 is needed for + -- overflow checks, once the accumulator exceeds this value + -- no further digits can be added. If equal, the last digit + -- must not exceed the `r` value (max/min bound `mod` 10). + -- + q = fromIntegral @a @Word64 maxBound `div` 10 + +----- Per chunk decoder + +-- | Process as many digits as we can, returning the additional +-- number of digits found and the updated accumulator. If the +-- accumulator would overflow return 'Overflow'. +-- +_digits :: Word64 -- ^ maximum non-overflow value `div` 10 + -> Word64 -- ^ maximum non-overflow vavlue `mod` 10 + -> ForeignPtr Word8 -- ^ Input buffer + -> Int -- ^ Input length + -> Word64 -- ^ Accumulated value of leading digits + -> Result -- ^ Bytes read and final accumulator, + -- or else overflow indication +{-# INLINE _digits #-} +_digits !q !r fp len a = BI.accursedUnutterablePerformIO $ + BI.unsafeWithForeignPtr fp $ \ ptr -> do + let end = ptr `plusPtr` len + go ptr end ptr a + where + go !start !end = loop + where + loop !ptr !acc = getDigit >>= \ !d -> + if | d > 9 + -> return $ Result (ptr `minusPtr` start) acc + | acc < q || acc == q && d <= r + -> loop (ptr `plusPtr` 1) (acc * 10 + d) + | otherwise + -> return Overflow + where + getDigit :: IO Word64 + getDigit + | ptr /= end = fromDigit <$> peek ptr + | otherwise = pure 10 -- End of input + {-# NOINLINE getDigit #-} + -- 'getDigit' makes it possible to implement a single success + -- exit point from the loop. If instead we return 'Result' + -- from multiple places, when '_digits' is inlined we get (at + -- least GHC 8.10 through 9.2) for each exit path a separate + -- join point implementing the continuation code. GHC ticket + -- . + -- + -- The NOINLINE pragma is required to avoid inlining branches + -- that would restore multiple exit points. + +fromDigit :: Word8 -> Word64 +{-# INLINE fromDigit #-} +fromDigit = \ !w -> fromIntegral w - 0x30 -- i.e. w - '0' diff --git a/Data/ByteString/Lazy/ReadNat.hs b/Data/ByteString/Lazy/ReadNat.hs new file mode 100644 index 000000000..240bb8ed9 --- /dev/null +++ b/Data/ByteString/Lazy/ReadNat.hs @@ -0,0 +1,258 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE MultiWayIf #-} +{-# LANGUAGE PatternSynonyms #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} + +-- This file is included by "Data.ByteString.ReadInt", after defining +-- "BYTESTRING_STRICT". The two modules are largely identical, except for the +-- choice of ByteString type and the loops in `readNatural`, where the lazy +-- version needs to nest the inner loop inside a loop over the constituent +-- chunks. + +#ifdef BYTESTRING_STRICT +module Data.ByteString.ReadNat +#else +module Data.ByteString.Lazy.ReadNat +#endif + ( readInteger + , readNatural + ) where + +import qualified Data.ByteString.Internal as BI +#ifdef BYTESTRING_STRICT +import Data.ByteString +#else +import Data.ByteString.Lazy +import Data.ByteString.Lazy.Internal +#endif +import Data.Bits (finiteBitSize) +import Data.ByteString.Internal (pattern BS, plusForeignPtr) +import Data.Word +import Foreign.ForeignPtr (ForeignPtr) +import Foreign.Ptr (Ptr, minusPtr, plusPtr) +import Foreign.Storable (Storable(..)) +import Numeric.Natural (Natural) + +----- Public API + +-- | 'readInteger' reads an 'Integer' from the beginning of the 'ByteString'. +-- If there is no 'Integer' at the beginning of the string, it returns +-- 'Nothing', otherwise it just returns the 'Integer' read, and the rest of +-- the string. +-- +-- 'readInteger' does not ignore leading whitespace, the value must start +-- immediately at the beginning of the input string. +-- +-- ==== __Examples__ +-- >>> readInteger "-000111222333444555666777888999 all done" +-- Just (-111222333444555666777888999," all done") +-- >>> readInteger "+1: readInteger also accepts a leading '+'" +-- Just (1, ": readInteger also accepts a leading '+'") +-- >>> readInteger "not a decimal number" +-- Nothing +-- +readInteger :: ByteString -> Maybe (Integer, ByteString) +readInteger = \ bs -> do + (w, s) <- uncons bs + let d = fromDigit w + if | d <= 9 -> unsigned d s -- leading digit + | w == 0x2d -> negative s -- minus sign + | w == 0x2b -> positive s -- plus sign + | otherwise -> Nothing -- not a number + where + unsigned :: Word -> ByteString -> Maybe (Integer, ByteString) + unsigned d s = + let (!n, rest) = _readDecimal d s + !i = toInteger n + in Just (i, rest) + + positive :: ByteString -> Maybe (Integer, ByteString) + positive bs = do + (w, s) <- uncons bs + let d = fromDigit w + if | d <= 9 -> unsigned d s + | otherwise -> Nothing + + negative :: ByteString -> Maybe (Integer, ByteString) + negative bs = do + (w, s) <- uncons bs + let d = fromDigit w + if | d > 9 -> Nothing + | otherwise -> let (n, rest) = _readDecimal d s + !i = negate $ toInteger n + in Just (i, rest) + +-- | 'readNatural' reads a 'Natural' number from the beginning of the +-- 'ByteString'. If there is no 'Natural' number at the beginning of the +-- string, it returns 'Nothing', otherwise it just returns the number read, and +-- the rest of the string. +-- +-- 'readNatural' does not ignore leading whitespace, the value must start with +-- a decimal digit immediately at the beginning of the input string. Leading +-- @+@ signs are not accepted. +-- +-- ==== __Examples__ +-- >>> readNatural "000111222333444555666777888999 all done" +-- Just (111222333444555666777888999," all done") +-- >>> readNatural "+000111222333444555666777888999 explicit sign" +-- Nothing +-- >>> readNatural "not a decimal number" +-- Nothing +-- +readNatural :: ByteString -> Maybe (Natural, ByteString) +readNatural bs = do + (w, s) <- uncons bs + let d = fromDigit w + if | d <= 9 -> Just $! _readDecimal d s + | otherwise -> Nothing + +----- Internal implementation + +-- | Intermediate result from scanning a chunk, final output is +-- obtained via `convert` after all the chunks are processed. +-- +data Result = Result !Int -- Bytes consumed + !Word -- Value of LSW + !Int -- Digits in LSW + [Natural] -- Little endian MSW list + +_readDecimal :: Word -> ByteString -> (Natural, ByteString) +_readDecimal = + -- Having read one digit, we're about to read the 2nd So the digit count + -- up to 'safeLog' starts at 2. + consume [] 2 + where + consume :: [Natural] -> Int -> Word -> ByteString + -> (Natural, ByteString) +#ifdef BYTESTRING_STRICT + consume ns cnt acc (BS fp len) = + -- Having read one digit, we're about to read the 2nd + -- So the digit count up to 'safeLog' starts at 2. + case natdigits fp len acc cnt ns of + Result used acc' cnt' ns' + | used == len + -> convert acc' cnt' ns' $ empty + | otherwise + -> convert acc' cnt' ns' $ + BS (fp `plusForeignPtr` used) (len - used) +#else + -- All done + consume ns cnt acc Empty = convert acc cnt ns Empty + -- Process next chunk + consume ns cnt acc (Chunk (BS fp len) cs) + = case natdigits fp len acc cnt ns of + Result used acc' cnt' ns' + | used == len -- process more chunks + -> consume ns' cnt' acc' cs + | otherwise -- ran into a non-digit + -> let c = Chunk (BS (fp `plusForeignPtr` used) (len - used)) cs + in convert acc' cnt' ns' c +#endif + convert !acc !cnt !ns rest = + let !n = combine acc cnt ns + in (n, rest) + + -- | Merge least-significant word with reduction of of little-endian tail. + -- + -- The input is: + -- + -- * Least significant digits as a 'Word' (LSW) + -- * The number of digits that went into the LSW + -- * All the remaining digit groups ('safeLog' digits each), + -- in little-endian order + -- + -- The result is obtained by pairwise recursive combining of all the + -- full size digit groups, followed by multiplication by @10^cnt@ and + -- addition of the LSW. + combine :: Word -- ^ value of LSW + -> Int -- ^ count of digits in LSW + -> [Natural] -- ^ tail elements (base @10^'safeLog'@) + -> Natural + {-# INLINE combine #-} + combine !acc !_ [] = wordToNatural acc + combine !acc !cnt ns = + wordToNatural (10^cnt) * combine1 safeBase ns + wordToNatural acc + + -- | Recursive reduction of little-endian sequence of 'Natural'-valued + -- /digits/ in base @base@ (a power of 10). The base is squared after + -- each round. This shows better asymptotic performance than one word + -- at a time multiply-add folds. See: + -- + -- + combine1 :: Natural -> [Natural] -> Natural + combine1 _ [n] = n + combine1 base ns = combine1 (base * base) (combine2 base ns) + + -- | One round pairwise merge of numbers in base @base@. + combine2 :: Natural -> [Natural] -> [Natural] + combine2 base (n:m:ns) = let !t = m * base + n in t : combine2 base ns + combine2 _ ns = ns + +-- The intermediate representation is a little-endian sequence in base +-- @10^'safeLog'@, prefixed by an initial element in base @10^cnt@ for some +-- @cnt@ between 1 and 'safeLog'. The final result is obtained by recursive +-- pairwise merging of the tail followed by a final multiplication by @10^cnt@ +-- and addition of the head. +-- +natdigits :: ForeignPtr Word8 -- ^ Input chunk + -> Int -- ^ Chunk length + -> Word -- ^ accumulated element + -> Int -- ^ partial digit count + -> [Natural] -- ^ accumulated MSB elements + -> Result +{-# INLINE natdigits #-} +natdigits fp len = \ acc cnt ns -> + BI.accursedUnutterablePerformIO $ + BI.unsafeWithForeignPtr fp $ \ ptr -> do + let end = ptr `plusPtr` len + go ptr end acc cnt ns ptr + where + go !start !end = loop + where + loop :: Word -> Int -> [Natural] -> Ptr Word8 -> IO Result + loop !acc !cnt ns !ptr = getDigit >>= \ !d -> + if | d > 9 + -> return $ Result (ptr `minusPtr` start) acc cnt ns + | cnt < safeLog + -> loop (10*acc + d) (cnt+1) ns $ ptr `plusPtr` 1 + | otherwise + -> let !acc' = wordToNatural acc + in loop d 1 (acc' : ns) $ ptr `plusPtr` 1 + where + getDigit | ptr /= end = fromDigit <$> peek ptr + | otherwise = pure 10 -- End of input + {-# NOINLINE getDigit #-} + -- 'getDigit' makes it possible to implement a single success + -- exit point from the loop. If instead we return 'Result' + -- from multiple places, when 'natdigits' is inlined we get (at + -- least GHC 8.10 through 9.2) for each exit path a separate + -- join point implementing the continuation code. GHC ticket + -- . + -- + -- The NOINLINE pragma is required to avoid inlining branches + -- that would restore multiple exit points. + +----- Misc functions + +-- | Largest decimal digit count that never overflows the accumulator +-- +safeLog :: Int +safeLog + | finiteBitSize @Word 0 >= 128 = 38 + | finiteBitSize @Word 0 >= 64 = 19 + | finiteBitSize @Word 0 >= 32 = 9 + | otherwise = 4 -- surely at least 14 bits + +-- | 10-power base for little-endian sequence of ~Word-sized "digits" +safeBase :: Natural +safeBase = 10 ^ safeLog + +fromDigit :: Word8 -> Word +{-# INLINE fromDigit #-} +fromDigit = \ !w -> fromIntegral w - 0x30 -- i.e. w - '0' + +wordToNatural :: Word -> Natural +{-# INLINE wordToNatural #-} +wordToNatural = fromIntegral diff --git a/Data/ByteString/ReadInt.hs b/Data/ByteString/ReadInt.hs new file mode 100644 index 000000000..19655ebb8 --- /dev/null +++ b/Data/ByteString/ReadInt.hs @@ -0,0 +1,3 @@ +{-# LANGUAGE CPP #-} +#define BYTESTRING_STRICT +#include "Lazy/ReadInt.hs" diff --git a/Data/ByteString/ReadNat.hs b/Data/ByteString/ReadNat.hs new file mode 100644 index 000000000..4997dddc8 --- /dev/null +++ b/Data/ByteString/ReadNat.hs @@ -0,0 +1,3 @@ +{-# LANGUAGE CPP #-} +#define BYTESTRING_STRICT +#include "Lazy/ReadNat.hs" diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs index c6572f6bb..b56c85c61 100644 --- a/bench/BenchAll.hs +++ b/bench/BenchAll.hs @@ -42,6 +42,7 @@ import BenchBoundsCheckFusion import BenchCount import BenchCSV import BenchIndices +import BenchReadInt ------------------------------------------------------------------------------ -- Benchmark support @@ -467,4 +468,5 @@ main = do , benchCount , benchCSV , benchIndices + , benchReadInt ] diff --git a/bench/BenchReadInt.hs b/bench/BenchReadInt.hs new file mode 100644 index 000000000..32d6bc1b2 --- /dev/null +++ b/bench/BenchReadInt.hs @@ -0,0 +1,144 @@ +-- | +-- Copyright : (c) 2021 Viktor Dukhovni +-- License : BSD3-style (see LICENSE) +-- +-- Maintainer : Viktor Dukhovni +-- +-- Benchmark readInt and variants, readWord and variants, +-- readInteger and readNatural + +{-# LANGUAGE + CPP + , BangPatterns + , OverloadedStrings + , TypeApplications + , ScopedTypeVariables + #-} + +module BenchReadInt (benchReadInt) where + +import qualified Data.ByteString.Builder as B +import qualified Data.ByteString.Char8 as S +import qualified Data.ByteString.Lazy.Char8 as L +import Test.Tasty.Bench +import Data.Int +import Data.Word +import Numeric.Natural +#if !(MIN_VERSION_base(4,11,0)) +import Data.Semigroup (Semigroup((<>))) +#endif +import Data.Monoid (mconcat) + +------------------------------------------------------------------------------ +-- Benchmark +------------------------------------------------------------------------------ + +-- Sum space-separated integers in a ByteString. +loopS :: Integral a + => (S.ByteString -> Maybe (a, S.ByteString)) -> S.ByteString -> a +loopS rd = go 0 + where + go !acc !bs = case rd bs of + Just (i, t) -> case S.uncons t of + Just (_, t') -> go (acc + i) t' + Nothing -> acc + i + Nothing -> acc + +-- Sum space-separated integers in a ByteString. +loopL :: Integral a + => (L.ByteString -> Maybe (a, L.ByteString)) -> L.ByteString -> a +loopL rd = go 0 + where + go !acc !bs = case rd bs of + Just (i, t) -> case L.uncons t of + Just (_, t') -> go (acc + i) t' + Nothing -> acc + i + Nothing -> acc + +benchReadInt :: Benchmark +benchReadInt = bgroup "Read Integral" + [ bgroup "Strict" + [ bench "ReadInt" $ nf (loopS S.readInt) intS + , bench "ReadInt8" $ nf (loopS S.readInt8) int8S + , bench "ReadInt16" $ nf (loopS S.readInt16) int16S + , bench "ReadInt32" $ nf (loopS S.readInt32) int32S + , bench "ReadInt64" $ nf (loopS S.readInt64) int64S + , bench "ReadWord" $ nf (loopS S.readWord) wordS + , bench "ReadWord8" $ nf (loopS S.readWord8) word8S + , bench "ReadWord16" $ nf (loopS S.readWord16) word16S + , bench "ReadWord32" $ nf (loopS S.readWord32) word32S + , bench "ReadWord64" $ nf (loopS S.readWord64) word64S + , bench "ReadInteger" $ nf (loopS S.readInteger) bignatS + , bench "ReadNatural" $ nf (loopS S.readNatural) bignatS + , bench "ReadInteger small" $ nf (loopS S.readInteger) intS + , bench "ReadNatural small" $ nf (loopS S.readNatural) wordS + ] + + , bgroup "Lazy" + [ bench "ReadInt" $ nf (loopL L.readInt) intL + , bench "ReadInt8" $ nf (loopL L.readInt8) int8L + , bench "ReadInt16" $ nf (loopL L.readInt16) int16L + , bench "ReadInt32" $ nf (loopL L.readInt32) int32L + , bench "ReadInt64" $ nf (loopL L.readInt64) int64L + , bench "ReadWord" $ nf (loopL L.readWord) wordL + , bench "ReadWord8" $ nf (loopL L.readWord8) word8L + , bench "ReadWord16" $ nf (loopL L.readWord16) word16L + , bench "ReadWord32" $ nf (loopL L.readWord32) word32L + , bench "ReadWord64" $ nf (loopL L.readWord64) word64L + , bench "ReadInteger" $ nf (loopL L.readInteger) bignatL + , bench "ReadNatural" $ nf (loopL L.readNatural) bignatL + , bench "ReadInteger small" $ nf (loopL L.readInteger) intL + , bench "ReadNatural small" $ nf (loopL L.readNatural) wordL + ] + ] + where + mkWordL :: forall a. (Integral a, Bounded a) + => (a -> B.Builder) -> L.ByteString + mkWordL f = B.toLazyByteString b + where b = mconcat [f i <> B.char8 ' ' | i <- [n-255..n]] + n = maxBound @a + mkWordS f = S.toStrict $ mkWordL f + + mkIntL :: forall a. (Integral a, Bounded a) + => (a -> B.Builder) -> L.ByteString + mkIntL f = B.toLazyByteString b + where b = mconcat [f (i + 128) <> B.char8 ' ' | i <- [n-255..n]] + n = maxBound @a + mkIntS f = S.toStrict $ mkIntL f + + wordS, word8S, word16S, word32S, word64S :: S.ByteString + !wordS = mkWordS B.wordDec + !word8S = mkWordS B.word8Dec + !word16S = mkWordS B.word16Dec + !word32S = mkWordS B.word32Dec + !word64S = mkWordS B.word64Dec + + intS, int8S, int16S, int32S, int64S :: S.ByteString + !intS = mkIntS B.intDec + !int8S = mkIntS B.int8Dec + !int16S = mkIntS B.int16Dec + !int32S = mkIntS B.int32Dec + !int64S = mkIntS B.int64Dec + + word8L, word16L, word32L, word64L :: L.ByteString + !wordL = mkWordL B.wordDec + !word8L = mkWordL B.word8Dec + !word16L = mkWordL B.word16Dec + !word32L = mkWordL B.word32Dec + !word64L = mkWordL B.word64Dec + + intL, int8L, int16L, int32L, int64L :: L.ByteString + !intL = mkIntL B.intDec + !int8L = mkIntL B.int8Dec + !int16L = mkIntL B.int16Dec + !int32L = mkIntL B.int32Dec + !int64L = mkIntL B.int64Dec + + bignatL :: L.ByteString + !bignatL = B.toLazyByteString b + where b = mconcat [B.integerDec (powpow i) <> B.char8 ' ' | i <- [0..13]] + powpow :: Word -> Integer + powpow n = 2^(2^n :: Word) + + bignatS :: S.ByteString + !bignatS = S.toStrict bignatL diff --git a/bytestring.cabal b/bytestring.cabal index ea9fd9d70..262287626 100644 --- a/bytestring.cabal +++ b/bytestring.cabal @@ -100,6 +100,10 @@ library Data.ByteString.Builder.RealFloat.Internal Data.ByteString.Builder.RealFloat.TableGenerator Data.ByteString.Lazy.Internal.Deque + Data.ByteString.Lazy.ReadInt + Data.ByteString.Lazy.ReadNat + Data.ByteString.ReadInt + Data.ByteString.ReadNat default-language: Haskell2010 other-extensions: CPP, @@ -176,6 +180,7 @@ benchmark bytestring-bench BenchCount BenchCSV BenchIndices + BenchReadInt type: exitcode-stdio-1.0 hs-source-dirs: bench default-language: Haskell2010 diff --git a/tests/Properties.hs b/tests/Properties.hs index 3492c3ca8..3dc423a3c 100644 --- a/tests/Properties.hs +++ b/tests/Properties.hs @@ -1,7 +1,16 @@ +{-# LANGUAGE AllowAmbiguousTypes #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE MagicHash #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} {-# LANGUAGE UnboxedTuples #-} +-- We need @AllowAmbiguousTypes@ in order to be able to use @TypeApplications@ +-- to disambiguate the desired instance of class methods whose instance cannot +-- be inferred from the caller's context. We would otherwise have to use +-- proxy arguments. Here the 'RdInt' class methods used to generate tests for +-- all the various 'readInt' types require explicit type applications. + module Properties (testSuite) where import Foreign.C.String (withCString) @@ -21,7 +30,7 @@ import qualified Data.List as List import Data.Char import Data.Word import Data.Maybe -import Data.Int (Int64) +import Data.Int (Int8, Int16, Int32, Int64) import Data.Semigroup import GHC.Exts (Int(..), newPinnedByteArray#, unsafeFreezeByteArray#) import GHC.ST (ST(..), runST) @@ -82,36 +91,54 @@ prop_lines_lazy2 = prop_strip x = C.strip x == (C.dropSpace . C.reverse . C.dropSpace . C.reverse) x --- Ensure that readInt and readInteger over lazy ByteStrings are not +class (Bounded a, Integral a, Show a) => RdInt a where + rdIntC :: C.ByteString -> Maybe (a, C.ByteString) + rdIntD :: D.ByteString -> Maybe (a, D.ByteString) + +instance RdInt Int where { rdIntC = C.readInt; rdIntD = D.readInt } +instance RdInt Int8 where { rdIntC = C.readInt8; rdIntD = D.readInt8 } +instance RdInt Int16 where { rdIntC = C.readInt16; rdIntD = D.readInt16 } +instance RdInt Int32 where { rdIntC = C.readInt32; rdIntD = D.readInt32 } +instance RdInt Int64 where { rdIntC = C.readInt64; rdIntD = D.readInt64 } +-- +instance RdInt Word where { rdIntC = C.readWord; rdIntD = D.readWord } +instance RdInt Word8 where { rdIntC = C.readWord8; rdIntD = D.readWord8 } +instance RdInt Word16 where { rdIntC = C.readWord16; rdIntD = D.readWord16 } +instance RdInt Word32 where { rdIntC = C.readWord32; rdIntD = D.readWord32 } +instance RdInt Word64 where { rdIntC = C.readWord64; rdIntD = D.readWord64 } + +smax :: forall a. (Bounded a, Show a) => String +smax = show $ maxBound @a +smax1 :: forall a. (Bounded a, Integral a) => String +smax1 = show $ fromIntegral @a @Integer maxBound + 1 +smax10 :: forall a. (Bounded a, Integral a) => String +smax10 = show $ fromIntegral @a @Integer maxBound + 10 + +smin :: forall a. (Bounded a, Show a) => String +smin = show (minBound @a) +smin1 :: forall a. (Bounded a, Integral a) => String +smin1 = show $ fromIntegral @a @Integer minBound - 1 +smin10 :: forall a. (Bounded a, Integral a) => String +smin10 = show $ fromIntegral @a @Integer minBound - 10 + +-- Ensure that readWord64 and readInteger over lazy ByteStrings are not -- excessively strict. -prop_readIntSafe = (fst . fromJust . D.readInt) (Chunk (C.pack "1z") Empty) == 1 -prop_readIntUnsafe = (fst . fromJust . D.readInt) (Chunk (C.pack "2z") undefined) == 2 +prop_readWordSafe = (fst . fromJust . D.readWord64) (Chunk (C.pack "1z") Empty) == 1 +prop_readWordUnsafe = (fst . fromJust . D.readWord64) (Chunk (C.pack "2z") undefined) == 2 prop_readIntegerSafe = (fst . fromJust . D.readInteger) (Chunk (C.pack "1z") Empty) == 1 prop_readIntegerUnsafe = (fst . fromJust . D.readInteger) (Chunk (C.pack "2z") undefined) == 2 -prop_readIntBoundsCC = let !smax = show (maxBound :: Int) - !smin = show (minBound :: Int) - !smax1 = show (fromIntegral (maxBound :: Int) + 1 :: Integer) - !smin1 = show (fromIntegral (minBound :: Int) - 1 :: Integer) - !smax10 = show (fromIntegral (maxBound :: Int) + 10 :: Integer) - !smin10 = show (fromIntegral (minBound :: Int) - 10 :: Integer) - -- - in C.readInt (spack smax) == good maxBound - && C.readInt (spack smin) == good minBound - -- - && C.readInt (spackPlus smax) == good maxBound - && C.readInt (spackMinus smax) == good (negate maxBound) - -- - && C.readInt (spackZeros smax) == good maxBound - && C.readInt (spackZeros smin) == good minBound - -- - && C.readInt (spack smax1 ) == Nothing - && C.readInt (spack smin1 ) == Nothing - -- - && C.readInt (spack smax10) == Nothing - && C.readInt (spack smin10) == Nothing - -- - && C.readInt (spackLong smax) == Nothing - && C.readInt (spackLong smin) == Nothing +prop_readNaturalSafe = (fst . fromJust . D.readNatural) (Chunk (C.pack "1z") Empty) == 1 +prop_readNaturalUnsafe = (fst . fromJust . D.readNatural) (Chunk (C.pack "2z") undefined) == 2 +prop_readIntBoundsCC = rdWordBounds @Word + && rdWordBounds @Word8 + && rdWordBounds @Word16 + && rdWordBounds @Word32 + && rdWordBounds @Word64 + && rdIntBounds @Int + && rdIntBounds @Int8 + && rdIntBounds @Int16 + && rdIntBounds @Int32 + && rdIntBounds @Int64 where tailStr = " tail" zeroStr = "000000000000000000000000000" @@ -124,36 +151,47 @@ prop_readIntBoundsCC = let !smax = show (maxBound :: Int) '-':num -> C.pack $ '-' : zeroStr ++ num ++ tailStr num -> C.pack $ zeroStr ++ num ++ tailStr good i = Just (i, C.pack tailStr) -prop_readIntBoundsLC = let !smax = show (maxBound :: Int) - !smin = show (minBound :: Int) - !smax1 = show (fromIntegral (maxBound :: Int) + 1 :: Integer) - !smin1 = show (fromIntegral (minBound :: Int) - 1 :: Integer) - !smax10 = show (fromIntegral (maxBound :: Int) + 10 :: Integer) - !smin10 = show (fromIntegral (minBound :: Int) - 10 :: Integer) - -- Plain min/maxBound - in LC.readInt (spack smax) == good maxBound - && LC.readInt (spack smin) == good minBound - -- With explicit [+-] sign for maxBound - && LC.readInt (spackPlus smax) == good maxBound - && LC.readInt (spackMinus smax) == good (negate maxBound) - -- With leading zeros - && LC.readInt (spackZeros smax) == good maxBound - && LC.readInt (spackZeros smin) == good minBound - -- Overflow in last digit - && LC.readInt (spack smax1 ) == Nothing - && LC.readInt (spack smin1 ) == Nothing - -- Overflow in 2nd-last digit - && LC.readInt (spack smax10) == Nothing - && LC.readInt (spack smin10) == Nothing - -- Overflow across chunk boundary - && LC.readInt (spackLong1 smax) == Nothing - && LC.readInt (spackLong1 smin) == Nothing - -- Overflow within chunk - && LC.readInt (spackLong2 smax) == Nothing - && LC.readInt (spackLong2 smin) == Nothing - -- Sign with no digits - && LC.readInt (LC.pack "+ foo") == Nothing - && LC.readInt (LC.pack "-bar") == Nothing + -- + rdWordBounds :: forall a. RdInt a => Bool + rdWordBounds = + -- Upper bound + rdIntC @a (spack (smax @a)) == good maxBound + -- With leading zeros + && rdIntC @a (spackZeros (smax @a)) == good maxBound + -- Overflow in last digit + && rdIntC @a (spack (smax1 @a)) == Nothing + -- Overflow in 2nd-last digit + && rdIntC @a (spack (smax10 @a)) == Nothing + -- Trailing zeros + && rdIntC @a (spackLong (smax @a)) == Nothing + -- + rdIntBounds :: forall a. RdInt a => Bool + rdIntBounds = + rdWordBounds @a + -- Lower bound + && rdIntC @a (spack (smin @a)) == good minBound + -- With leading signs + && rdIntC @a (spackPlus (smax @a)) == good maxBound + && rdIntC @a (spackMinus (smax @a)) == good (negate maxBound) + -- With leading zeros + && rdIntC @a (spackZeros (smax @a)) == good maxBound + -- Underflow in last digit + && rdIntC @a (spack (smin1 @a)) == Nothing + -- Underflow in 2nd-last digit + && rdIntC @a (spack (smin10 @a)) == Nothing + -- Trailing zeros + && rdIntC @a (spackLong (smin @a)) == Nothing + +prop_readIntBoundsLC = rdWordBounds @Word + && rdWordBounds @Word8 + && rdWordBounds @Word16 + && rdWordBounds @Word32 + && rdWordBounds @Word64 + && rdIntBounds @Int + && rdIntBounds @Int8 + && rdIntBounds @Int16 + && rdIntBounds @Int32 + && rdIntBounds @Int64 where tailStr = " tail" zeroStr = "000000000000000000000000000" @@ -167,6 +205,43 @@ prop_readIntBoundsLC = let !smax = show (maxBound :: Int) '-':num -> LC.pack ('-' : zeroStr) `D.append` LC.pack (num ++ tailStr) num -> LC.pack $ zeroStr ++ num ++ tailStr good i = Just (i, LC.pack tailStr) + -- + rdWordBounds :: forall a. RdInt a => Bool + rdWordBounds = + -- Upper bound + rdIntD @a (spack (smax @a)) == good maxBound + -- With leading zeros + && rdIntD @a (spackZeros (smax @a)) == good maxBound + -- Overflow in last digit + && rdIntD @a (spack (smax1 @a)) == Nothing + -- Overflow in 2nd-last digit + && rdIntD @a (spack (smax10 @a)) == Nothing + -- Overflow across chunk boundary + && rdIntD @a (spackLong1 (smax @a)) == Nothing + -- Overflow within chunk + && rdIntD @a (spackLong2 (smax @a)) == Nothing + -- Sign with no digits + && rdIntD @a (LC.pack "+ foo") == Nothing + && rdIntD @a (LC.pack "-bar") == Nothing + -- + rdIntBounds :: forall a. RdInt a => Bool + rdIntBounds = + rdWordBounds @a + -- Lower bound + && rdIntD @a (spack (smin @a)) == good minBound + -- With leading signs + && rdIntD @a (spackPlus (smax @a)) == good maxBound + && rdIntD @a (spackMinus (smax @a)) == good (negate maxBound) + -- With leading zeros + && rdIntD @a (spackZeros (smin @a)) == good minBound + -- Overflow in last digit + && rdIntD @a (spack (smin1 @a)) == Nothing + -- Overflow in 2nd-last digit + && rdIntD @a (spack (smin10 @a)) == Nothing + -- Overflow across chunk boundary + && rdIntD @a (spackLong1 (smin @a)) == Nothing + -- Overflow within chunk + && rdIntD @a (spackLong2 (smin @a)) == Nothing ------------------------------------------------------------------------ @@ -536,12 +611,14 @@ misc_tests = , testProperty "strip" prop_strip , testProperty "isSpace" prop_isSpaceWord8 - , testProperty "readIntSafe" prop_readIntSafe - , testProperty "readIntUnsafe" prop_readIntUnsafe + , testProperty "readWordSafe" prop_readWordSafe + , testProperty "readWordUnsafe" prop_readWordUnsafe , testProperty "readIntBoundsCC" prop_readIntBoundsCC , testProperty "readIntBoundsLC" prop_readIntBoundsLC , testProperty "readIntegerSafe" prop_readIntegerSafe , testProperty "readIntegerUnsafe" prop_readIntegerUnsafe + , testProperty "readNaturalSafe" prop_readNaturalSafe + , testProperty "readNaturalUnsafe" prop_readNaturalUnsafe ] strictness_checks = diff --git a/tests/Properties/ByteString.hs b/tests/Properties/ByteString.hs index d87c9333c..a07c62bbc 100644 --- a/tests/Properties/ByteString.hs +++ b/tests/Properties/ByteString.hs @@ -4,7 +4,16 @@ -- License : BSD-style {-# LANGUAGE CPP #-} +{-# LANGUAGE AllowAmbiguousTypes #-} {-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} + +-- We need @AllowAmbiguousTypes@ in order to be able to use @TypeApplications@ +-- to disambiguate the desired instance of class methods whose instance cannot +-- be inferred from the caller's context. We would otherwise have to use +-- proxy arguments. Here the 'RdInt' class methods used to generate tests for +-- all the various 'readInt' types require explicit type applications. -- We are happy to sacrifice optimizations in exchange for faster compilation, -- but need to test rewrite rules. As one can check using -ddump-rule-firings, @@ -30,8 +39,6 @@ import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Internal as B (invariant) #endif -import Data.Word - #else #ifndef BYTESTRING_LAZY @@ -43,6 +50,9 @@ import qualified Data.ByteString.Lazy.Char8 as B import qualified Data.ByteString.Lazy.Internal as B (invariant) #endif +import Data.Int +import Numeric.Natural (Natural) + import Text.Read #endif @@ -54,6 +64,7 @@ import qualified Data.List as List import Data.Semigroup import Data.String import Data.Tuple +import Data.Word import Test.Tasty import Test.Tasty.QuickCheck import QuickCheckUtils @@ -64,6 +75,42 @@ toElem = id #else toElem :: Char8 -> Char toElem (Char8 c) = c + +class (Integral a, Show a) => RdInt a where + bread :: B.ByteString -> Maybe (a, B.ByteString) + sread :: String -> Maybe (a, String) + +instance RdInt Int where { bread = B.readInt; sread = readInt } +instance RdInt Int8 where { bread = B.readInt8; sread = readInt8 } +instance RdInt Int16 where { bread = B.readInt16; sread = readInt16 } +instance RdInt Int32 where { bread = B.readInt32; sread = readInt32 } +instance RdInt Int64 where { bread = B.readInt64; sread = readInt64 } +-- +instance RdInt Word where { bread = B.readWord; sread = readWord } +instance RdInt Word8 where { bread = B.readWord8; sread = readWord8 } +instance RdInt Word16 where { bread = B.readWord16; sread = readWord16 } +instance RdInt Word32 where { bread = B.readWord32; sread = readWord32 } +instance RdInt Word64 where { bread = B.readWord64; sread = readWord64 } +-- +instance RdInt Integer where { bread = B.readInteger; sread = readInteger } +instance RdInt Natural where { bread = B.readNatural; sread = readNatural } + +instance Arbitrary Natural where + arbitrary = i2n <$> arbitrary + where i2n :: Integer -> Natural + i2n i | i >= 0 = fromIntegral i + | otherwise = fromIntegral $ negate i + +testRdInt :: forall a. (Arbitrary a, RdInt a) => String -> TestTree +testRdInt s = testGroup s $ + [ testProperty "from string" $ \ prefix value suffix -> + let si = show @a value + b = prefix <> B.pack si <> suffix + in fmap (second B.unpack) (bread @a b) + === sread @a (B.unpack prefix ++ si ++ B.unpack suffix) + , testProperty "from number" $ \n -> + bread @a (B.pack (show n)) === Just (n, B.empty) + ] #endif tests :: [TestTree] @@ -520,14 +567,18 @@ tests = #ifdef BYTESTRING_CHAR8 , testProperty "isString" $ \x -> x === fromString (B.unpack x) - , testProperty "readInt 1" $ - \x -> fmap (second B.unpack) (B.readInt x) === readInt (B.unpack x) - , testProperty "readInt 2" $ - \n -> B.readInt (B.pack (show n)) === Just (n, B.empty) - , testProperty "readInteger 1" $ - \x -> fmap (second B.unpack) (B.readInteger x) === readInteger (B.unpack x) - , testProperty "readInteger 2" $ - \n -> B.readInteger (B.pack (show n)) === Just (n, B.empty) + , testRdInt @Int "readInt" + , testRdInt @Int8 "readInt8" + , testRdInt @Int16 "readInt16" + , testRdInt @Int32 "readInt32" + , testRdInt @Int64 "readInt64" + , testRdInt @Word "readWord" + , testRdInt @Word8 "readWord8" + , testRdInt @Word16 "readWord16" + , testRdInt @Word32 "readWord32" + , testRdInt @Word64 "readWord64" + , testRdInt @Integer "readInteger" + , testRdInt @Natural "readNatural" , testProperty "lines" $ \x -> map B.unpack (B.lines x) === lines (B.unpack x) , testProperty "lines \\n" $ once $ @@ -608,11 +659,71 @@ readInt xs = case readInteger xs of | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) otherwise -> Nothing +readWord :: String -> Maybe (Word, String) +readWord xs = case readIntegerUnsigned xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readInt8 :: String -> Maybe (Int8, String) +readInt8 xs = case readInteger xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readWord8 :: String -> Maybe (Word8, String) +readWord8 xs = case readIntegerUnsigned xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readInt16 :: String -> Maybe (Int16, String) +readInt16 xs = case readInteger xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readWord16 :: String -> Maybe (Word16, String) +readWord16 xs = case readIntegerUnsigned xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readInt32 :: String -> Maybe (Int32, String) +readInt32 xs = case readInteger xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readWord32 :: String -> Maybe (Word32, String) +readWord32 xs = case readIntegerUnsigned xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readInt64 :: String -> Maybe (Int64, String) +readInt64 xs = case readInteger xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readWord64 :: String -> Maybe (Word64, String) +readWord64 xs = case readIntegerUnsigned xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + readInteger :: String -> Maybe (Integer, String) readInteger ('+' : xs) = readIntegerUnsigned xs readInteger ('-' : xs) = fmap (first negate) (readIntegerUnsigned xs) readInteger xs = readIntegerUnsigned xs +readNatural :: String -> Maybe (Natural, String) +readNatural xs = case readIntegerUnsigned xs of + Just (y, zs) + | y >= 0 -> Just (fromIntegral @Integer @Natural y, zs) + _ -> Nothing + readIntegerUnsigned :: String -> Maybe (Integer, String) readIntegerUnsigned xs = case readMaybe ys of Just y -> Just (y, zs)