diff --git a/Changelog.md b/Changelog.md index b2d82edb8..5d8a3d7f0 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,5 +1,6 @@ [0.12.0.0] — Unreleased +* [New sized and/or unsigned variants of `readInt` and `readInteger`](https://github.com/haskell/bytestring/pull/438) * [`readInt` returns `Nothing`, if the sequence of digits cannot be represented by an `Int`, instead of overflowing silently](https://github.com/haskell/bytestring/pull/309) * [Remove `zipWith` rewrite rule](https://github.com/haskell/bytestring/pull/387) diff --git a/Data/ByteString/Char8.hs b/Data/ByteString/Char8.hs index 9e1bd71bc..f4cfc3285 100644 --- a/Data/ByteString/Char8.hs +++ b/Data/ByteString/Char8.hs @@ -185,7 +185,19 @@ module Data.ByteString.Char8 ( -- * Reading from ByteStrings readInt, + readInt64, + readInt32, + readInt16, + readInt8, + + readWord, + readWord64, + readWord32, + readWord16, + readWord8, + readInteger, + readNatural, -- * Low level CString conversions @@ -264,6 +276,8 @@ import Data.ByteString (empty,null,length,tail,init,append ) import Data.ByteString.Internal +import Data.ByteString.ReadInt +import Data.ByteString.ReadNat import Data.Char ( isSpace ) -- See bytestring #70 @@ -843,14 +857,6 @@ unzip :: [(Char,Char)] -> (ByteString,ByteString) unzip ls = (pack (P.map fst ls), pack (P.map snd ls)) {-# INLINE unzip #-} --- | A variety of 'head' for non-empty ByteStrings. 'unsafeHead' omits --- the check for the empty case, which is good for performance, but --- there is an obligation on the programmer to provide a proof that the --- ByteString is non-empty. -unsafeHead :: ByteString -> Char -unsafeHead = w2c . B.unsafeHead -{-# INLINE unsafeHead #-} - -- --------------------------------------------------------------------- -- Things that depend on the encoding @@ -977,130 +983,6 @@ unwords :: [ByteString] -> ByteString unwords = intercalate (singleton ' ') {-# INLINE unwords #-} --- --------------------------------------------------------------------- --- Reading from ByteStrings - --- | Try to read an 'Int' value from the 'ByteString', returning @Just (val, --- str)@ on success, where @val@ is the value read and @str@ is the rest of the --- input string. If the sequence of digits decodes to a value larger than can --- be represented by an 'Int', the returned value will be 'Nothing'. --- --- 'readInt' does not ignore leading whitespace, the value must start --- immediately at the beginning of the input stream. --- --- ==== __Examples__ --- >>> readInt "-1729 = (-10)^3 + (-9)^3 = (-12)^3 + (-1)^3" --- Just (-1729," = (-10)^3 + (-9)^3 = (-12)^3 + (-1)^3") --- >>> readInt "not a decimal number") --- Nothing --- >>> readInt "12345678901234567890 overflows maxBound") --- Nothing --- >>> readInt "-12345678901234567890 underflows minBound") --- Nothing --- -readInt :: ByteString -> Maybe (Int, ByteString) -{-# INLINABLE readInt #-} -readInt bs = case B.uncons bs of - Just (w, rest) | w - 0x30 <= 9 -> readDec True bs -- starts with digit - | w == 0x2d -> readDec False rest -- starts with minus - | w == 0x2b -> readDec True rest -- starts with plus - _ -> Nothing -- not signed decimal - where - -- | Read a decimal 'Int' without overflow. The caller has already - -- read any explicit sign (setting @positive@ to 'False' as needed). - -- Here we just deal with the digits. - {-# INLINE readDec #-} - readDec !positive (B.BS fp len) = B.accursedUnutterablePerformIO $ - unsafeWithForeignPtr fp $ \ptr -> do - let end = ptr `plusPtr` len - (!n, !a, !inRange) <- if positive - then digits intmaxQuot10 intmaxRem10 end ptr 0 0 - else digits intminQuot10 intminRem10 end ptr 0 0 - if inRange - then if n < len - then let rest = B.BS (fp `B.plusForeignPtr` n) (len - n) - in return $! result n a rest - else return $! result n a B.empty - else return Nothing - where - -- | Process as many digits as we can, returning the additional - -- number of digits found, the final accumulator, and whether - -- the input decimal did not overflow prior to processing all - -- the provided digits (end of input or non-digit encountered). - digits !maxq !maxr !e !ptr = go ptr - where - go :: Ptr Word8 -> Int -> Word -> IO (Int, Word, Bool) - go !p !b !a | p == e = return (b, a, True) - go !p !b !a = do - !w <- fromIntegral <$> peek p - let !d = w - 0x30 - if d > 9 -- No more digits - then return (b, a, True) - else if a < maxq -- Look for more - then go (p `plusPtr` 1) (b + 1) (a * 10 + d) - else if a > maxq -- overflow - then return (b, a, False) - else if d <= maxr -- Ideally this will be the last digit - then go (p `plusPtr` 1) (b + 1) (a * 10 + d) - else return (b, a, False) -- overflow - - -- | Plausible success, provided we got at least one digit! - result !nbytes !acc str - | nbytes > 0 = let !i = w2int acc in Just (i, str) - | otherwise = Nothing - - -- This assumes that @negate . fromIntegral@ correctly produces - -- @minBound :: Int@ when given its positive 'Word' value as an - -- input. This is true in both 2s-complement and 1s-complement - -- arithmetic, so seems like a safe bet. Tests cover this case, - -- though the CI may not run on sufficiently exotic CPUs. - w2int !n | positive = fromIntegral n - | otherwise = negate $! fromIntegral n - --- | readInteger reads an Integer from the beginning of the ByteString. If --- there is no integer at the beginning of the string, it returns Nothing, --- otherwise it just returns the int read, and the rest of the string. -readInteger :: ByteString -> Maybe (Integer, ByteString) -readInteger as - | null as = Nothing - | otherwise = - case unsafeHead as of - '-' -> first (B.unsafeTail as) >>= \(n, bs) -> return (-n, bs) - '+' -> first (B.unsafeTail as) - _ -> first as - - where first ps | null ps = Nothing - | otherwise = - case B.unsafeHead ps of - w | w >= 0x30 && w <= 0x39 -> Just $ - loop 1 (fromIntegral w - 0x30) [] (B.unsafeTail ps) - | otherwise -> Nothing - - loop :: Int -> Int -> [Integer] - -> ByteString -> (Integer, ByteString) - loop !d !acc ns !ps - | null ps = combine d acc ns empty - | otherwise = - case B.unsafeHead ps of - w | w >= 0x30 && w <= 0x39 -> - if d == 9 then loop 1 (fromIntegral w - 0x30) - (toInteger acc : ns) - (B.unsafeTail ps) - else loop (d+1) - (10*acc + (fromIntegral w - 0x30)) - ns (B.unsafeTail ps) - | otherwise -> combine d acc ns ps - - combine _ acc [] ps = (toInteger acc, ps) - combine d acc ns ps = - (10^d * combine1 1000000000 ns + toInteger acc, ps) - - combine1 _ [n] = n - combine1 b ns = combine1 (b*b) $ combine2 b ns - - combine2 b (n:m:ns) = let !t = m*b + n in t : combine2 b ns - combine2 _ ns = ns - ------------------------------------------------------------------------ -- For non-binary text processing: diff --git a/Data/ByteString/Internal.hs b/Data/ByteString/Internal.hs index 0efb1b7af..eb2d3a7a0 100644 --- a/Data/ByteString/Internal.hs +++ b/Data/ByteString/Internal.hs @@ -92,13 +92,7 @@ module Data.ByteString.Internal ( -- * Exported compatibility shim plusForeignPtr, - unsafeWithForeignPtr, - - -- * Internal constants - intmaxQuot10, - intmaxRem10, - intminQuot10, - intminRem10 + unsafeWithForeignPtr ) where import Prelude hiding (concat, null) @@ -786,22 +780,6 @@ isSpaceChar8 = isSpaceWord8 . c2w overflowError :: String -> a overflowError fun = error $ "Data.ByteString." ++ fun ++ ": size overflow" --- | Bounds for Word# multiplication by 10 without overflow, and --- absolute values of Int bounds. -intmaxWord, intminWord, intmaxQuot10, intmaxRem10, intminQuot10, intminRem10 :: Word -intmaxWord = fromIntegral (maxBound :: Int) -{-# INLINE intmaxWord #-} -intminWord = fromIntegral (negate (minBound :: Int)) -{-# INLINE intminWord #-} -intmaxQuot10 = intmaxWord `quot` 10 -{-# INLINE intmaxQuot10 #-} -intmaxRem10 = intmaxWord `rem` 10 -{-# INLINE intmaxRem10 #-} -intminQuot10 = intminWord `quot` 10 -{-# INLINE intminQuot10 #-} -intminRem10 = intminWord `rem` 10 -{-# INLINE intminRem10 #-} - ------------------------------------------------------------------------ -- | This \"function\" has a superficial similarity to 'System.IO.Unsafe.unsafePerformIO' but diff --git a/Data/ByteString/Lazy/Char8.hs b/Data/ByteString/Lazy/Char8.hs index 97530e51b..c6f0d6a18 100644 --- a/Data/ByteString/Lazy/Char8.hs +++ b/Data/ByteString/Lazy/Char8.hs @@ -1,6 +1,7 @@ {-# LANGUAGE BangPatterns #-} -{-# OPTIONS_HADDOCK prune #-} {-# LANGUAGE Trustworthy #-} +{-# LANGUAGE TypeApplications #-} +{-# OPTIONS_HADDOCK prune #-} -- | -- Module : Data.ByteString.Lazy.Char8 @@ -177,8 +178,28 @@ module Data.ByteString.Lazy.Char8 ( copy, -- * Reading from ByteStrings + -- | Note that a lazy 'ByteString' may hold an unbounded stream of + -- @\'0\'@ digits, in which case the functions below may never return. + -- If that's a concern, you can use 'take' to first truncate the input + -- to an acceptable length. Non-termination is also possible when + -- reading arbitrary precision numbers via 'readInteger' or + -- 'readNatural', if the input is an unbounded stream of arbitrary + -- decimal digits. + -- readInt, + readInt64, + readInt32, + readInt16, + readInt8, + + readWord, + readWord64, + readWord32, + readWord16, + readWord8, + readInteger, + readNatural, -- * I\/O with 'ByteString's -- | ByteString I/O uses binary mode, without any character decoding @@ -223,19 +244,15 @@ import Data.ByteString.Lazy import qualified Data.ByteString.Lazy as L import qualified Data.ByteString as S (ByteString) -- typename only import qualified Data.ByteString as B -import qualified Data.ByteString.Internal as BI import qualified Data.ByteString.Unsafe as B import Data.ByteString.Lazy.Internal +import Data.ByteString.Lazy.ReadInt +import Data.ByteString.Lazy.ReadNat -import Data.ByteString.Internal (c2w,w2c,isSpaceWord8 - ,intmaxQuot10,intmaxRem10 - ,intminQuot10,intminRem10) +import Data.ByteString.Internal (c2w,w2c,isSpaceWord8) import Data.Int (Int64) -import Data.Word import qualified Data.List as List -import Foreign.Ptr (Ptr, plusPtr) -import Foreign.Storable (peek) import Prelude hiding (reverse,head,tail,last,init,null,length,map,lines,foldl,foldr,unlines @@ -912,156 +929,6 @@ unwords :: [ByteString] -> ByteString unwords = intercalate (singleton ' ') {-# INLINE unwords #-} --- | Try to read an 'Int' value from the 'ByteString', returning @Just (val, --- str)@ on success, where @val@ is the value read and @str@ is the rest of the --- input string. If the sequence of digits decodes to a value larger than can --- be represented by an 'Int', the returned value will be 'Nothing'. --- --- Note that a lazy 'ByteString' may, after an optional plus or minus sign, --- consist of an unbounded stream of @0@ digits, in which case 'readInt' --- would diverge (never return). If that's a concern, you can use 'take' to --- obtain a bounded initial segment to pass to 'readInt' instead. --- --- 'readInt' does not ignore leading whitespace, the value must start --- immediately at the beginning of the input stream. --- --- ==== __Examples__ --- >>> readInt "-1729 = (-10)^3 + (-9)^3 = (-12)^3 + (-1)^3" --- Just (-1729," = (-10)^3 + (-9)^3 = (-12)^3 + (-1)^3") --- >>> readInt "not a decimal number") --- Nothing --- >>> readInt "12345678901234567890 overflows maxBound") --- Nothing --- >>> readInt "-12345678901234567890 underflows minBound") --- Nothing --- -readInt :: ByteString -> Maybe (Int, ByteString) -{-# INLINABLE readInt #-} -readInt bs = case L.uncons bs of - Just (w, rest) | w - 0x30 <= 9 -> readDec True bs -- starts with digit - | w == 0x2d -> readDec False rest -- starts with minus - | w == 0x2b -> readDec True rest -- starts with plus - _ -> Nothing -- not signed decimal - where - - -- | Read a decimal 'Int' without overflow. The caller has already - -- read any explicit sign (setting @positive@ to 'False' as needed). - -- Here we just deal with the digits. - {-# INLINE readDec #-} - readDec !positive = loop 0 0 - where - loop !nbytes !acc = \ str -> case str of - Empty -> result nbytes acc str - Chunk c cs -> case B.length c of - 0 -> loop nbytes acc cs -- skip empty segment - l -> case accumWord acc c of - (0, !_, !inrange) -- no more digits or overflow - | inrange -> result nbytes acc str - | otherwise -> Nothing - (!n, !a, !inrange) - | not inrange -> Nothing - | n < l -- input not entirely digits - -> result (nbytes + n) a $ Chunk (B.drop n c) cs - | otherwise - -- read more digits from the remaining chunks - -> loop (nbytes + n) a cs - - -- | Process as many digits as we can, returning the additional - -- number of digits found, the updated accumulator, and whether - -- the input decimal did not overflow prior to processing all - -- the provided digits (end of input or non-digit encountered). - accumWord acc (BI.BS fp len) = - BI.accursedUnutterablePerformIO $ - BI.unsafeWithForeignPtr fp $ \ptr -> do - let end = ptr `plusPtr` len - x@(!_, !_, !_) <- if positive - then digits intmaxQuot10 intmaxRem10 end ptr 0 acc - else digits intminQuot10 intminRem10 end ptr 0 acc - return x - where - digits !maxq !maxr !e !ptr = go ptr - where - go :: Ptr Word8 -> Int -> Word -> IO (Int, Word, Bool) - go !p !b !a | p == e = return (b, a, True) - go !p !b !a = do - !w <- fromIntegral <$> peek p - let !d = w - 0x30 - if d > 9 -- No more digits - then return (b, a, True) - else if a < maxq -- Look for more - then go (p `plusPtr` 1) (b + 1) (a * 10 + d) - else if a > maxq -- overflow - then return (b, a, False) - else if d <= maxr -- Ideally this will be the last digit - then go (p `plusPtr` 1) (b + 1) (a * 10 + d) - else return (b, a, False) -- overflow - - -- | Plausible success, provided we got at least one digit! - result !nbytes !acc str - | nbytes > 0 = let !i = w2int acc in Just (i, str) - | otherwise = Nothing - - -- This assumes that @negate . fromIntegral@ correctly produces - -- @minBound :: Int@ when given its positive 'Word' value as an - -- input. This is true in both 2s-complement and 1s-complement - -- arithmetic, so seems like a safe bet. Tests cover this case, - -- though the CI may not run on sufficiently exotic CPUs. - w2int !n | positive = fromIntegral n - | otherwise = negate $! fromIntegral n - --- | readInteger reads an Integer from the beginning of the ByteString. If --- there is no integer at the beginning of the string, it returns Nothing, --- otherwise it just returns the int read, and the rest of the string. -readInteger :: ByteString -> Maybe (Integer, ByteString) -readInteger Empty = Nothing -readInteger (Chunk c0 cs0) = - case w2c (B.unsafeHead c0) of - '-' -> first (B.unsafeTail c0) cs0 >>= \(n, cs') -> return (-n, cs') - '+' -> first (B.unsafeTail c0) cs0 - _ -> first c0 cs0 - - where first c cs - | B.null c = case cs of - Empty -> Nothing - (Chunk c' cs') -> first' c' cs' - | otherwise = first' c cs - - first' c cs = case B.unsafeHead c of - w | w >= 0x30 && w <= 0x39 -> Just $ - loop 1 (fromIntegral w - 0x30) [] (B.unsafeTail c) cs - | otherwise -> Nothing - - loop :: Int -> Int -> [Integer] - -> S.ByteString -> ByteString -> (Integer, ByteString) - loop !d !acc ns !c cs - | B.null c = case cs of - Empty -> combine d acc ns c cs - (Chunk c' cs') -> loop d acc ns c' cs' - | otherwise = - case B.unsafeHead c of - w | w >= 0x30 && w <= 0x39 -> - if d < 9 then loop (d+1) - (10*acc + (fromIntegral w - 0x30)) - ns (B.unsafeTail c) cs - else loop 1 (fromIntegral w - 0x30) - (fromIntegral acc : ns) - (B.unsafeTail c) cs - | otherwise -> combine d acc ns c cs - - combine _ acc [] c cs = end (fromIntegral acc) c cs - combine d acc ns c cs = - end (10^d * combine1 1000000000 ns + fromIntegral acc) c cs - - combine1 _ [n] = n - combine1 b ns = combine1 (b*b) $ combine2 b ns - - combine2 b (n:m:ns) = let !t = n+m*b in t : combine2 b ns - combine2 _ ns = ns - - end n c cs = let !c' = chunk c cs - in (n, c') - - -- | Write a ByteString to a handle, appending a newline byte -- hPutStrLn :: Handle -> ByteString -> IO () diff --git a/Data/ByteString/Lazy/ReadInt.hs b/Data/ByteString/Lazy/ReadInt.hs new file mode 100644 index 000000000..234bfe9fe --- /dev/null +++ b/Data/ByteString/Lazy/ReadInt.hs @@ -0,0 +1,264 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE MultiWayIf #-} +{-# LANGUAGE PatternSynonyms #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} + +-- This file is also included by "Data.ByteString.ReadInt", after defining +-- "BYTESTRING_STRICT". The two modules share much of their code, but +-- the lazy version adds an outer loop over the chunks. + +#ifdef BYTESTRING_STRICT +module Data.ByteString.ReadInt +#else +module Data.ByteString.Lazy.ReadInt +#endif + ( readInt + , readInt8 + , readInt16 + , readInt32 + , readWord + , readWord8 + , readWord16 + , readWord32 + , readInt64 + , readWord64 + ) where + +import qualified Data.ByteString.Internal as BI +#ifdef BYTESTRING_STRICT +import Data.ByteString +#else +import Data.ByteString.Lazy +import Data.ByteString.Lazy.Internal +#endif +import Data.Bits (FiniteBits, isSigned) +import Data.ByteString.Internal (pattern BS, plusForeignPtr) +import Data.Int +import Data.Word +import Foreign.ForeignPtr (ForeignPtr) +import Foreign.Ptr (minusPtr, plusPtr) +import Foreign.Storable (Storable(..)) + +----- Public API + +-- | Try to read a signed 'Int' value from the 'ByteString', returning +-- @Just (val, str)@ on success, where @val@ is the value read and @str@ is the +-- rest of the input string. If the sequence of digits decodes to a value +-- larger than can be represented by an 'Int', the returned value will be +-- 'Nothing'. +-- +-- 'readInt' does not ignore leading whitespace, the value must start +-- immediately at the beginning of the input string. +-- +-- ==== __Examples__ +-- >>> readInt "-1729 sum of cubes" +-- Just (-1729," sum of cubes") +-- >>> readInt "+1: readInt also accepts a leading '+'" +-- Just (1, ": readInt also accepts a leading '+'") +-- >>> readInt "not a decimal number" +-- Nothing +-- >>> readInt "12345678901234567890 overflows maxBound" +-- Nothing +-- >>> readInt "-12345678901234567890 underflows minBound" +-- Nothing +-- +readInt :: ByteString -> Maybe (Int, ByteString) +readInt = _read + +-- | A variant of 'readInt' specialised to 'Int32'. +readInt32 :: ByteString -> Maybe (Int32, ByteString) +readInt32 = _read + +-- | A variant of 'readInt' specialised to 'Int16'. +readInt16 :: ByteString -> Maybe (Int16, ByteString) +readInt16 = _read + +-- | A variant of 'readInt' specialised to 'Int8'. +readInt8 :: ByteString -> Maybe (Int8, ByteString) +readInt8 = _read + +-- | Try to read a 'Word' value from the 'ByteString', returning +-- @Just (val, str)@ on success, where @val@ is the value read and @str@ is the +-- rest of the input string. If the sequence of digits decodes to a value +-- larger than can be represented by a 'Word', the returned value will be +-- 'Nothing'. +-- +-- 'readWord' does not ignore leading whitespace, the value must start with a +-- decimal digit immediately at the beginning of the input string. Leading @+@ +-- signs are not accepted. +-- +-- ==== __Examples__ +-- >>> readWord "1729 sum of cubes" +-- Just (1729," sum of cubes") +-- >>> readWord "+1729 has an explicit sign" +-- Nothing +-- >>> readWord "not a decimal number" +-- Nothing +-- >>> readWord "98765432109876543210 overflows maxBound" +-- Nothing +-- +readWord :: ByteString -> Maybe (Word, ByteString) +readWord = _read + +-- | A variant of 'readWord' specialised to 'Word32'. +readWord32 :: ByteString -> Maybe (Word32, ByteString) +readWord32 = _read + +-- | A variant of 'readWord' specialised to 'Word16'. +readWord16 :: ByteString -> Maybe (Word16, ByteString) +readWord16 = _read + +-- | A variant of 'readWord' specialised to 'Word8'. +readWord8 :: ByteString -> Maybe (Word8, ByteString) +readWord8 = _read + +-- | A variant of 'readInt' specialised to 'Int64'. +readInt64 :: ByteString -> Maybe (Int64, ByteString) +readInt64 = _read + +-- | A variant of 'readWord' specialised to 'Word64'. +readWord64 :: ByteString -> Maybe (Word64, ByteString) +readWord64 = _read + +-- | Polymorphic Int*/Word* reader +_read :: forall a. (Integral a, FiniteBits a, Bounded a) + => ByteString -> Maybe (a, ByteString) +{-# INLINE _read #-} +_read + | isSigned @a 0 + = \ bs -> signed bs >>= \ (r, s, d1) -> _readDecimal r s d1 + | otherwise + -- When the input is @16^n-1@, as is the case with 'maxBound' for + -- all the Word* types, the last decimal digit of 'maxBound' is 5. + = \ bs -> unsigned 5 bs >>= \ (r, s, d1) -> _readDecimal r s d1 + where + -- Returns: + -- * Mod 10 min/max bound remainder + -- * 2nd and later digits + -- * 1st digit + -- + -- When the input is @8*16^n-1@, as is the case with 'maxBound' for + -- all the Int* types, the last decimal digit of 'maxBound' is 7. + -- + signed :: ByteString -> Maybe (Word64, ByteString, Word64) + signed bs = do + (w, s) <- uncons bs + let d1 = fromDigit w + if | d1 <= 9 -> Just (7, s, d1) -- leading digit + | w == 0x2d -> unsigned 8 s -- minus sign + | w == 0x2b -> unsigned 7 s -- plus sign + | otherwise -> Nothing -- not a number + + unsigned :: Word64 -> ByteString -> Maybe (Word64, ByteString, Word64) + unsigned r bs = do + (w, s) <- uncons bs + let d1 = fromDigit w + if | d1 <= 9 -> Just (r, s, d1) -- leading digit + | otherwise -> Nothing -- not a number + +----- Fixed-width unsigned reader + +-- | Intermediate result from scanning a chunk, final output is +-- converted to the requested type once all chunks are processed. +-- +data Result = Overflow + | Result !Int -- number of bytes (digits) read + !Word64 -- accumulator value + +_readDecimal :: forall a. (Integral a, Bounded a) + => Word64 -- ^ abs(maxBound/minBound) `mod` 10 + -> ByteString -- ^ Input string + -> Word64 -- ^ First digit value + -> Maybe (a, ByteString) +{-# INLINE _readDecimal #-} +_readDecimal !r = consume + where + consume :: ByteString -> Word64 -> Maybe (a, ByteString) +#ifdef BYTESTRING_STRICT + consume (BS fp len) a = case _digits q r fp len a of + Result used acc + | used == len + -> convert acc empty + | otherwise + -> convert acc $ BS (fp `plusForeignPtr` used) (len - used) + _ -> Nothing +#else + -- All done + consume Empty acc = convert acc Empty + -- Process next chunk + consume (Chunk (BS fp len) cs) acc + = case _digits q r fp len acc of + Result used acc' + | used == len + -- process remaining chunks + -> consume cs acc' + | otherwise + -- ran into a non-digit + -> convert acc' $ + Chunk (BS (fp `plusForeignPtr` used) (len - used)) cs + _ -> Nothing +#endif + convert :: Word64 -> ByteString -> Maybe (a, ByteString) + convert !acc rest = + let !i = case r of + -- minBound @Int* `mod` 10 == 8 + 8 -> negate $ fromIntegral @Word64 @a acc + _ -> fromIntegral @Word64 @a acc + in Just (i, rest) + + -- The quotient of 'maxBound' divided by 10 is needed for + -- overflow checks, once the accumulator exceeds this value + -- no further digits can be added. If equal, the last digit + -- must not exceed the `r` value (max/min bound `mod` 10). + -- + q = fromIntegral @a @Word64 maxBound `div` 10 + +----- Per chunk decoder + +-- | Process as many digits as we can, returning the additional +-- number of digits found and the updated accumulator. If the +-- accumulator would overflow return 'Overflow'. +-- +_digits :: Word64 -- ^ maximum non-overflow value `div` 10 + -> Word64 -- ^ maximum non-overflow vavlue `mod` 10 + -> ForeignPtr Word8 -- ^ Input buffer + -> Int -- ^ Input length + -> Word64 -- ^ Accumulated value of leading digits + -> Result -- ^ Bytes read and final accumulator, + -- or else overflow indication +{-# INLINE _digits #-} +_digits !q !r fp len a = BI.accursedUnutterablePerformIO $ + BI.unsafeWithForeignPtr fp $ \ ptr -> do + let end = ptr `plusPtr` len + go ptr end ptr a + where + go !start !end = loop + where + loop !ptr !acc = getDigit >>= \ !d -> + if | d > 9 + -> return $ Result (ptr `minusPtr` start) acc + | acc < q || acc == q && d <= r + -> loop (ptr `plusPtr` 1) (acc * 10 + d) + | otherwise + -> return Overflow + where + getDigit :: IO Word64 + getDigit + | ptr /= end = fromDigit <$> peek ptr + | otherwise = pure 10 -- End of input + {-# NOINLINE getDigit #-} + -- 'getDigit' makes it possible to implement a single success + -- exit point from the loop. If instead we return 'Result' + -- from multiple places, when '_digits' is inlined we get (at + -- least GHC 8.10 through 9.2) for each exit path a separate + -- join point implementing the continuation code. GHC ticket + -- . + -- + -- The NOINLINE pragma is required to avoid inlining branches + -- that would restore multiple exit points. + +fromDigit :: Word8 -> Word64 +{-# INLINE fromDigit #-} +fromDigit = \ !w -> fromIntegral w - 0x30 -- i.e. w - '0' diff --git a/Data/ByteString/Lazy/ReadNat.hs b/Data/ByteString/Lazy/ReadNat.hs new file mode 100644 index 000000000..3a6030fcc --- /dev/null +++ b/Data/ByteString/Lazy/ReadNat.hs @@ -0,0 +1,257 @@ +{-# LANGUAGE CPP #-} +{-# LANGUAGE BangPatterns #-} +{-# LANGUAGE MultiWayIf #-} +{-# LANGUAGE PatternSynonyms #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} + +-- This file is included by "Data.ByteString.ReadInt", after defining +-- "BYTESTRING_STRICT". The two modules are largely identical, except for the +-- choice of ByteString type and the loops in `readNatural`, where the lazy +-- version needs to nest the inner loop inside a loop over the constituent +-- chunks. + +#ifdef BYTESTRING_STRICT +module Data.ByteString.ReadNat +#else +module Data.ByteString.Lazy.ReadNat +#endif + ( readInteger + , readNatural + ) where + +import qualified Data.ByteString.Internal as BI +#ifdef BYTESTRING_STRICT +import Data.ByteString +#else +import Data.ByteString.Lazy +import Data.ByteString.Lazy.Internal +#endif +import Data.Bits (finiteBitSize) +import Data.ByteString.Internal (pattern BS, plusForeignPtr) +import Data.Word +import Foreign.ForeignPtr (ForeignPtr) +import Foreign.Ptr (Ptr, minusPtr, plusPtr) +import Foreign.Storable (Storable(..)) +import Numeric.Natural (Natural) + +----- Public API + +-- | 'readInteger' reads an 'Integer' from the beginning of the 'ByteString'. +-- If there is no 'Integer' at the beginning of the string, it returns +-- 'Nothing', otherwise it just returns the 'Integer' read, and the rest of +-- the string. +-- +-- 'readInteger' does not ignore leading whitespace, the value must start +-- immediately at the beginning of the input string. +-- +-- ==== __Examples__ +-- >>> readInteger "-000111222333444555666777888999 all done" +-- Just (-111222333444555666777888999," all done") +-- >>> readInteger "+1: readInteger also accepts a leading '+'" +-- Just (1, ": readInteger also accepts a leading '+'") +-- >>> readInteger "not a decimal number" +-- Nothing +-- +readInteger :: ByteString -> Maybe (Integer, ByteString) +readInteger = \ bs -> do + (w, s) <- uncons bs + let d = fromDigit w + if | d <= 9 -> unsigned d s -- leading digit + | w == 0x2d -> negative s -- minus sign + | w == 0x2b -> positive s -- plus sign + | otherwise -> Nothing -- not a number + where + unsigned :: Word -> ByteString -> Maybe (Integer, ByteString) + unsigned d s = + let (!n, rest) = _readDecimal d s + !i = toInteger n + in Just (i, rest) + + positive :: ByteString -> Maybe (Integer, ByteString) + positive bs = do + (w, s) <- uncons bs + let d = fromDigit w + if | d <= 9 -> unsigned d s + | otherwise -> Nothing + + negative :: ByteString -> Maybe (Integer, ByteString) + negative bs = do + (w, s) <- uncons bs + let d = fromDigit w + if | d > 9 -> Nothing + | otherwise -> let (n, rest) = _readDecimal d s + !i = negate $ toInteger n + in Just (i, rest) + +-- | 'readNatural' reads a 'Natural' number from the beginning of the +-- 'ByteString'. If there is no 'Natural' number at the beginning of the +-- string, it returns 'Nothing', otherwise it just returns the number read, and +-- the rest of the string. +-- +-- 'readNatural' does not ignore leading whitespace, the value must start with +-- a decimal digit immediately at the beginning of the input string. Leading +-- @+@ signs are not accepted. +-- +-- ==== __Examples__ +-- >>> readNatural "000111222333444555666777888999 all done" +-- Just (111222333444555666777888999," all done") +-- >>> readNatural "+000111222333444555666777888999 explicit sign" +-- Nothing +-- >>> readNatural "not a decimal number" +-- Nothing +-- +readNatural :: ByteString -> Maybe (Natural, ByteString) +readNatural bs = do + (w, s) <- uncons bs + let d = fromDigit w + if | d <= 9 -> Just $! _readDecimal d s + | otherwise -> Nothing + +----- Internal implementation + +-- | Intermediate result from scanning a chunk, final output is +-- obtained via `convert` after all the chunks are processed. +-- +data Result = Result !Int -- Bytes consumed + !Word -- Value of LSW + !Int -- Digits in LSW + [Natural] -- Little endian MSW list + +_readDecimal :: Word -> ByteString -> (Natural, ByteString) +_readDecimal = + -- Having read one digit, we're about to read the 2nd So the digit count + -- up to 'safeLog' starts at 2. + consume [] 2 + where + consume :: [Natural] -> Int -> Word -> ByteString + -> (Natural, ByteString) +#ifdef BYTESTRING_STRICT + consume ns cnt acc (BS fp len) = + -- Having read one digit, we're about to read the 2nd + -- So the digit count up to 'safeLog' starts at 2. + case natdigits fp len acc cnt ns of + Result used acc' cnt' ns' + | used == len + -> convert acc' cnt' ns' $ empty + | otherwise + -> convert acc' cnt' ns' $ + BS (fp `plusForeignPtr` used) (len - used) +#else + -- All done + consume ns cnt acc Empty = convert acc cnt ns Empty + -- Process next chunk + consume ns cnt acc (Chunk (BS fp len) cs) + = case natdigits fp len acc cnt ns of + Result used acc' cnt' ns' + | used == len -- process more chunks + -> consume ns' cnt' acc' cs + | otherwise -- ran into a non-digit + -> let c = Chunk (BS (fp `plusForeignPtr` used) (len - used)) cs + in convert acc' cnt' ns' c +#endif + convert !acc !cnt !ns rest = + let !n = combine acc cnt ns + in (n, rest) + + -- | Merge least-significant word with reduction of of little-endian tail. + -- + -- The input is: + -- + -- * Least significant digits as a 'Word' (LSW) + -- * The number of digits that went into the LSW + -- * All the remaining digit groups ('safeLog' digits each), + -- in little-endian order + -- + -- The result is obtained by pairwise recursive combining of all the + -- full size digit groups, followed by multiplication by @10^cnt@ and + -- addition of the LSW. + combine :: Word -- ^ value of LSW + -> Int -- ^ count of digits in LSW + -> [Natural] -- ^ tail elements (base @10^'safeLog'@) + -> Natural + {-# INLINE combine #-} + combine !acc !_ [] = wordToNatural acc + combine !acc !cnt ns = + wordToNatural (10^cnt) * combine1 safeBase ns + wordToNatural acc + + -- | Recursive reduction of little-endian sequence of 'Natural'-valued + -- /digits/ in base @base@ (a power of 10). The base is squared after + -- each round. This shows better asymptotic performance than one word + -- at a time multiply-add folds. See: + -- + -- + combine1 :: Natural -> [Natural] -> Natural + combine1 _ [n] = n + combine1 base ns = combine1 (base * base) (combine2 base ns) + + -- | One round pairwise merge of numbers in base @base@. + combine2 :: Natural -> [Natural] -> [Natural] + combine2 base (n:m:ns) = let !t = m * base + n in t : combine2 base ns + combine2 _ ns = ns + +-- The intermediate representation is a little-endian sequence in base +-- @10^'safeLog'@, prefixed by an initial element in base @10^cnt@ for some +-- @cnt@ between 1 and 'safeLog'. The final result is obtained by recursive +-- pairwise merging of the tail followed by a final multiplication by @10^cnt@ +-- and addition of the head. +-- +natdigits :: ForeignPtr Word8 -- ^ Input chunk + -> Int -- ^ Chunk length + -> Word -- ^ accumulated element + -> Int -- ^ partial digit count + -> [Natural] -- ^ accumulated MSB elements + -> Result +{-# INLINE natdigits #-} +natdigits fp len = \ acc cnt ns -> + BI.accursedUnutterablePerformIO $ + BI.unsafeWithForeignPtr fp $ \ ptr -> do + let end = ptr `plusPtr` len + go ptr end acc cnt ns ptr + where + go !start !end = loop + where + loop :: Word -> Int -> [Natural] -> Ptr Word8 -> IO Result + loop !acc !cnt ns !ptr = getDigit >>= \ !d -> + if | d > 9 + -> return $ Result (ptr `minusPtr` start) acc cnt ns + | cnt < safeLog + -> loop (10*acc + d) (cnt+1) ns $ ptr `plusPtr` 1 + | otherwise + -> let !acc' = wordToNatural acc + in loop d 1 (acc' : ns) $ ptr `plusPtr` 1 + where + getDigit | ptr /= end = fromDigit <$> peek ptr + | otherwise = pure 10 -- End of input + {-# NOINLINE getDigit #-} + -- 'getDigit' makes it possible to implement a single success + -- exit point from the loop. If instead we return 'Result' + -- from multiple places, when 'natdigits' is inlined we get (at + -- least GHC 8.10 through 9.2) for each exit path a separate + -- join point implementing the continuation code. GHC ticket + -- . + -- + -- The NOINLINE pragma is required to avoid inlining branches + -- that would restore multiple exit points. + +----- Misc functions + +-- | Largest decimal digit count that never overflows the accumulator +-- The base 10 logarithm of 2 is ~0.30103, therefore 2^n has at least +-- @1 + floor (0.3 n)@ decimal digits. Therefore @floor (0.3 n)@, +-- digits cannot overflow the upper bound of an @n-bit@ word. +-- +safeLog :: Int +safeLog = 3 * finiteBitSize @Word 0 `div` 10 + +-- | 10-power base for little-endian sequence of ~Word-sized "digits" +safeBase :: Natural +safeBase = 10 ^ safeLog + +fromDigit :: Word8 -> Word +{-# INLINE fromDigit #-} +fromDigit = \ !w -> fromIntegral w - 0x30 -- i.e. w - '0' + +wordToNatural :: Word -> Natural +{-# INLINE wordToNatural #-} +wordToNatural = fromIntegral diff --git a/Data/ByteString/ReadInt.hs b/Data/ByteString/ReadInt.hs new file mode 100644 index 000000000..19655ebb8 --- /dev/null +++ b/Data/ByteString/ReadInt.hs @@ -0,0 +1,3 @@ +{-# LANGUAGE CPP #-} +#define BYTESTRING_STRICT +#include "Lazy/ReadInt.hs" diff --git a/Data/ByteString/ReadNat.hs b/Data/ByteString/ReadNat.hs new file mode 100644 index 000000000..4997dddc8 --- /dev/null +++ b/Data/ByteString/ReadNat.hs @@ -0,0 +1,3 @@ +{-# LANGUAGE CPP #-} +#define BYTESTRING_STRICT +#include "Lazy/ReadNat.hs" diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs index c6572f6bb..b56c85c61 100644 --- a/bench/BenchAll.hs +++ b/bench/BenchAll.hs @@ -42,6 +42,7 @@ import BenchBoundsCheckFusion import BenchCount import BenchCSV import BenchIndices +import BenchReadInt ------------------------------------------------------------------------------ -- Benchmark support @@ -467,4 +468,5 @@ main = do , benchCount , benchCSV , benchIndices + , benchReadInt ] diff --git a/bench/BenchReadInt.hs b/bench/BenchReadInt.hs new file mode 100644 index 000000000..32d6bc1b2 --- /dev/null +++ b/bench/BenchReadInt.hs @@ -0,0 +1,144 @@ +-- | +-- Copyright : (c) 2021 Viktor Dukhovni +-- License : BSD3-style (see LICENSE) +-- +-- Maintainer : Viktor Dukhovni +-- +-- Benchmark readInt and variants, readWord and variants, +-- readInteger and readNatural + +{-# LANGUAGE + CPP + , BangPatterns + , OverloadedStrings + , TypeApplications + , ScopedTypeVariables + #-} + +module BenchReadInt (benchReadInt) where + +import qualified Data.ByteString.Builder as B +import qualified Data.ByteString.Char8 as S +import qualified Data.ByteString.Lazy.Char8 as L +import Test.Tasty.Bench +import Data.Int +import Data.Word +import Numeric.Natural +#if !(MIN_VERSION_base(4,11,0)) +import Data.Semigroup (Semigroup((<>))) +#endif +import Data.Monoid (mconcat) + +------------------------------------------------------------------------------ +-- Benchmark +------------------------------------------------------------------------------ + +-- Sum space-separated integers in a ByteString. +loopS :: Integral a + => (S.ByteString -> Maybe (a, S.ByteString)) -> S.ByteString -> a +loopS rd = go 0 + where + go !acc !bs = case rd bs of + Just (i, t) -> case S.uncons t of + Just (_, t') -> go (acc + i) t' + Nothing -> acc + i + Nothing -> acc + +-- Sum space-separated integers in a ByteString. +loopL :: Integral a + => (L.ByteString -> Maybe (a, L.ByteString)) -> L.ByteString -> a +loopL rd = go 0 + where + go !acc !bs = case rd bs of + Just (i, t) -> case L.uncons t of + Just (_, t') -> go (acc + i) t' + Nothing -> acc + i + Nothing -> acc + +benchReadInt :: Benchmark +benchReadInt = bgroup "Read Integral" + [ bgroup "Strict" + [ bench "ReadInt" $ nf (loopS S.readInt) intS + , bench "ReadInt8" $ nf (loopS S.readInt8) int8S + , bench "ReadInt16" $ nf (loopS S.readInt16) int16S + , bench "ReadInt32" $ nf (loopS S.readInt32) int32S + , bench "ReadInt64" $ nf (loopS S.readInt64) int64S + , bench "ReadWord" $ nf (loopS S.readWord) wordS + , bench "ReadWord8" $ nf (loopS S.readWord8) word8S + , bench "ReadWord16" $ nf (loopS S.readWord16) word16S + , bench "ReadWord32" $ nf (loopS S.readWord32) word32S + , bench "ReadWord64" $ nf (loopS S.readWord64) word64S + , bench "ReadInteger" $ nf (loopS S.readInteger) bignatS + , bench "ReadNatural" $ nf (loopS S.readNatural) bignatS + , bench "ReadInteger small" $ nf (loopS S.readInteger) intS + , bench "ReadNatural small" $ nf (loopS S.readNatural) wordS + ] + + , bgroup "Lazy" + [ bench "ReadInt" $ nf (loopL L.readInt) intL + , bench "ReadInt8" $ nf (loopL L.readInt8) int8L + , bench "ReadInt16" $ nf (loopL L.readInt16) int16L + , bench "ReadInt32" $ nf (loopL L.readInt32) int32L + , bench "ReadInt64" $ nf (loopL L.readInt64) int64L + , bench "ReadWord" $ nf (loopL L.readWord) wordL + , bench "ReadWord8" $ nf (loopL L.readWord8) word8L + , bench "ReadWord16" $ nf (loopL L.readWord16) word16L + , bench "ReadWord32" $ nf (loopL L.readWord32) word32L + , bench "ReadWord64" $ nf (loopL L.readWord64) word64L + , bench "ReadInteger" $ nf (loopL L.readInteger) bignatL + , bench "ReadNatural" $ nf (loopL L.readNatural) bignatL + , bench "ReadInteger small" $ nf (loopL L.readInteger) intL + , bench "ReadNatural small" $ nf (loopL L.readNatural) wordL + ] + ] + where + mkWordL :: forall a. (Integral a, Bounded a) + => (a -> B.Builder) -> L.ByteString + mkWordL f = B.toLazyByteString b + where b = mconcat [f i <> B.char8 ' ' | i <- [n-255..n]] + n = maxBound @a + mkWordS f = S.toStrict $ mkWordL f + + mkIntL :: forall a. (Integral a, Bounded a) + => (a -> B.Builder) -> L.ByteString + mkIntL f = B.toLazyByteString b + where b = mconcat [f (i + 128) <> B.char8 ' ' | i <- [n-255..n]] + n = maxBound @a + mkIntS f = S.toStrict $ mkIntL f + + wordS, word8S, word16S, word32S, word64S :: S.ByteString + !wordS = mkWordS B.wordDec + !word8S = mkWordS B.word8Dec + !word16S = mkWordS B.word16Dec + !word32S = mkWordS B.word32Dec + !word64S = mkWordS B.word64Dec + + intS, int8S, int16S, int32S, int64S :: S.ByteString + !intS = mkIntS B.intDec + !int8S = mkIntS B.int8Dec + !int16S = mkIntS B.int16Dec + !int32S = mkIntS B.int32Dec + !int64S = mkIntS B.int64Dec + + word8L, word16L, word32L, word64L :: L.ByteString + !wordL = mkWordL B.wordDec + !word8L = mkWordL B.word8Dec + !word16L = mkWordL B.word16Dec + !word32L = mkWordL B.word32Dec + !word64L = mkWordL B.word64Dec + + intL, int8L, int16L, int32L, int64L :: L.ByteString + !intL = mkIntL B.intDec + !int8L = mkIntL B.int8Dec + !int16L = mkIntL B.int16Dec + !int32L = mkIntL B.int32Dec + !int64L = mkIntL B.int64Dec + + bignatL :: L.ByteString + !bignatL = B.toLazyByteString b + where b = mconcat [B.integerDec (powpow i) <> B.char8 ' ' | i <- [0..13]] + powpow :: Word -> Integer + powpow n = 2^(2^n :: Word) + + bignatS :: S.ByteString + !bignatS = S.toStrict bignatL diff --git a/bytestring.cabal b/bytestring.cabal index ea9fd9d70..262287626 100644 --- a/bytestring.cabal +++ b/bytestring.cabal @@ -100,6 +100,10 @@ library Data.ByteString.Builder.RealFloat.Internal Data.ByteString.Builder.RealFloat.TableGenerator Data.ByteString.Lazy.Internal.Deque + Data.ByteString.Lazy.ReadInt + Data.ByteString.Lazy.ReadNat + Data.ByteString.ReadInt + Data.ByteString.ReadNat default-language: Haskell2010 other-extensions: CPP, @@ -176,6 +180,7 @@ benchmark bytestring-bench BenchCount BenchCSV BenchIndices + BenchReadInt type: exitcode-stdio-1.0 hs-source-dirs: bench default-language: Haskell2010 diff --git a/tests/Properties.hs b/tests/Properties.hs index 3492c3ca8..3dc423a3c 100644 --- a/tests/Properties.hs +++ b/tests/Properties.hs @@ -1,7 +1,16 @@ +{-# LANGUAGE AllowAmbiguousTypes #-} {-# LANGUAGE BangPatterns #-} {-# LANGUAGE MagicHash #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} {-# LANGUAGE UnboxedTuples #-} +-- We need @AllowAmbiguousTypes@ in order to be able to use @TypeApplications@ +-- to disambiguate the desired instance of class methods whose instance cannot +-- be inferred from the caller's context. We would otherwise have to use +-- proxy arguments. Here the 'RdInt' class methods used to generate tests for +-- all the various 'readInt' types require explicit type applications. + module Properties (testSuite) where import Foreign.C.String (withCString) @@ -21,7 +30,7 @@ import qualified Data.List as List import Data.Char import Data.Word import Data.Maybe -import Data.Int (Int64) +import Data.Int (Int8, Int16, Int32, Int64) import Data.Semigroup import GHC.Exts (Int(..), newPinnedByteArray#, unsafeFreezeByteArray#) import GHC.ST (ST(..), runST) @@ -82,36 +91,54 @@ prop_lines_lazy2 = prop_strip x = C.strip x == (C.dropSpace . C.reverse . C.dropSpace . C.reverse) x --- Ensure that readInt and readInteger over lazy ByteStrings are not +class (Bounded a, Integral a, Show a) => RdInt a where + rdIntC :: C.ByteString -> Maybe (a, C.ByteString) + rdIntD :: D.ByteString -> Maybe (a, D.ByteString) + +instance RdInt Int where { rdIntC = C.readInt; rdIntD = D.readInt } +instance RdInt Int8 where { rdIntC = C.readInt8; rdIntD = D.readInt8 } +instance RdInt Int16 where { rdIntC = C.readInt16; rdIntD = D.readInt16 } +instance RdInt Int32 where { rdIntC = C.readInt32; rdIntD = D.readInt32 } +instance RdInt Int64 where { rdIntC = C.readInt64; rdIntD = D.readInt64 } +-- +instance RdInt Word where { rdIntC = C.readWord; rdIntD = D.readWord } +instance RdInt Word8 where { rdIntC = C.readWord8; rdIntD = D.readWord8 } +instance RdInt Word16 where { rdIntC = C.readWord16; rdIntD = D.readWord16 } +instance RdInt Word32 where { rdIntC = C.readWord32; rdIntD = D.readWord32 } +instance RdInt Word64 where { rdIntC = C.readWord64; rdIntD = D.readWord64 } + +smax :: forall a. (Bounded a, Show a) => String +smax = show $ maxBound @a +smax1 :: forall a. (Bounded a, Integral a) => String +smax1 = show $ fromIntegral @a @Integer maxBound + 1 +smax10 :: forall a. (Bounded a, Integral a) => String +smax10 = show $ fromIntegral @a @Integer maxBound + 10 + +smin :: forall a. (Bounded a, Show a) => String +smin = show (minBound @a) +smin1 :: forall a. (Bounded a, Integral a) => String +smin1 = show $ fromIntegral @a @Integer minBound - 1 +smin10 :: forall a. (Bounded a, Integral a) => String +smin10 = show $ fromIntegral @a @Integer minBound - 10 + +-- Ensure that readWord64 and readInteger over lazy ByteStrings are not -- excessively strict. -prop_readIntSafe = (fst . fromJust . D.readInt) (Chunk (C.pack "1z") Empty) == 1 -prop_readIntUnsafe = (fst . fromJust . D.readInt) (Chunk (C.pack "2z") undefined) == 2 +prop_readWordSafe = (fst . fromJust . D.readWord64) (Chunk (C.pack "1z") Empty) == 1 +prop_readWordUnsafe = (fst . fromJust . D.readWord64) (Chunk (C.pack "2z") undefined) == 2 prop_readIntegerSafe = (fst . fromJust . D.readInteger) (Chunk (C.pack "1z") Empty) == 1 prop_readIntegerUnsafe = (fst . fromJust . D.readInteger) (Chunk (C.pack "2z") undefined) == 2 -prop_readIntBoundsCC = let !smax = show (maxBound :: Int) - !smin = show (minBound :: Int) - !smax1 = show (fromIntegral (maxBound :: Int) + 1 :: Integer) - !smin1 = show (fromIntegral (minBound :: Int) - 1 :: Integer) - !smax10 = show (fromIntegral (maxBound :: Int) + 10 :: Integer) - !smin10 = show (fromIntegral (minBound :: Int) - 10 :: Integer) - -- - in C.readInt (spack smax) == good maxBound - && C.readInt (spack smin) == good minBound - -- - && C.readInt (spackPlus smax) == good maxBound - && C.readInt (spackMinus smax) == good (negate maxBound) - -- - && C.readInt (spackZeros smax) == good maxBound - && C.readInt (spackZeros smin) == good minBound - -- - && C.readInt (spack smax1 ) == Nothing - && C.readInt (spack smin1 ) == Nothing - -- - && C.readInt (spack smax10) == Nothing - && C.readInt (spack smin10) == Nothing - -- - && C.readInt (spackLong smax) == Nothing - && C.readInt (spackLong smin) == Nothing +prop_readNaturalSafe = (fst . fromJust . D.readNatural) (Chunk (C.pack "1z") Empty) == 1 +prop_readNaturalUnsafe = (fst . fromJust . D.readNatural) (Chunk (C.pack "2z") undefined) == 2 +prop_readIntBoundsCC = rdWordBounds @Word + && rdWordBounds @Word8 + && rdWordBounds @Word16 + && rdWordBounds @Word32 + && rdWordBounds @Word64 + && rdIntBounds @Int + && rdIntBounds @Int8 + && rdIntBounds @Int16 + && rdIntBounds @Int32 + && rdIntBounds @Int64 where tailStr = " tail" zeroStr = "000000000000000000000000000" @@ -124,36 +151,47 @@ prop_readIntBoundsCC = let !smax = show (maxBound :: Int) '-':num -> C.pack $ '-' : zeroStr ++ num ++ tailStr num -> C.pack $ zeroStr ++ num ++ tailStr good i = Just (i, C.pack tailStr) -prop_readIntBoundsLC = let !smax = show (maxBound :: Int) - !smin = show (minBound :: Int) - !smax1 = show (fromIntegral (maxBound :: Int) + 1 :: Integer) - !smin1 = show (fromIntegral (minBound :: Int) - 1 :: Integer) - !smax10 = show (fromIntegral (maxBound :: Int) + 10 :: Integer) - !smin10 = show (fromIntegral (minBound :: Int) - 10 :: Integer) - -- Plain min/maxBound - in LC.readInt (spack smax) == good maxBound - && LC.readInt (spack smin) == good minBound - -- With explicit [+-] sign for maxBound - && LC.readInt (spackPlus smax) == good maxBound - && LC.readInt (spackMinus smax) == good (negate maxBound) - -- With leading zeros - && LC.readInt (spackZeros smax) == good maxBound - && LC.readInt (spackZeros smin) == good minBound - -- Overflow in last digit - && LC.readInt (spack smax1 ) == Nothing - && LC.readInt (spack smin1 ) == Nothing - -- Overflow in 2nd-last digit - && LC.readInt (spack smax10) == Nothing - && LC.readInt (spack smin10) == Nothing - -- Overflow across chunk boundary - && LC.readInt (spackLong1 smax) == Nothing - && LC.readInt (spackLong1 smin) == Nothing - -- Overflow within chunk - && LC.readInt (spackLong2 smax) == Nothing - && LC.readInt (spackLong2 smin) == Nothing - -- Sign with no digits - && LC.readInt (LC.pack "+ foo") == Nothing - && LC.readInt (LC.pack "-bar") == Nothing + -- + rdWordBounds :: forall a. RdInt a => Bool + rdWordBounds = + -- Upper bound + rdIntC @a (spack (smax @a)) == good maxBound + -- With leading zeros + && rdIntC @a (spackZeros (smax @a)) == good maxBound + -- Overflow in last digit + && rdIntC @a (spack (smax1 @a)) == Nothing + -- Overflow in 2nd-last digit + && rdIntC @a (spack (smax10 @a)) == Nothing + -- Trailing zeros + && rdIntC @a (spackLong (smax @a)) == Nothing + -- + rdIntBounds :: forall a. RdInt a => Bool + rdIntBounds = + rdWordBounds @a + -- Lower bound + && rdIntC @a (spack (smin @a)) == good minBound + -- With leading signs + && rdIntC @a (spackPlus (smax @a)) == good maxBound + && rdIntC @a (spackMinus (smax @a)) == good (negate maxBound) + -- With leading zeros + && rdIntC @a (spackZeros (smax @a)) == good maxBound + -- Underflow in last digit + && rdIntC @a (spack (smin1 @a)) == Nothing + -- Underflow in 2nd-last digit + && rdIntC @a (spack (smin10 @a)) == Nothing + -- Trailing zeros + && rdIntC @a (spackLong (smin @a)) == Nothing + +prop_readIntBoundsLC = rdWordBounds @Word + && rdWordBounds @Word8 + && rdWordBounds @Word16 + && rdWordBounds @Word32 + && rdWordBounds @Word64 + && rdIntBounds @Int + && rdIntBounds @Int8 + && rdIntBounds @Int16 + && rdIntBounds @Int32 + && rdIntBounds @Int64 where tailStr = " tail" zeroStr = "000000000000000000000000000" @@ -167,6 +205,43 @@ prop_readIntBoundsLC = let !smax = show (maxBound :: Int) '-':num -> LC.pack ('-' : zeroStr) `D.append` LC.pack (num ++ tailStr) num -> LC.pack $ zeroStr ++ num ++ tailStr good i = Just (i, LC.pack tailStr) + -- + rdWordBounds :: forall a. RdInt a => Bool + rdWordBounds = + -- Upper bound + rdIntD @a (spack (smax @a)) == good maxBound + -- With leading zeros + && rdIntD @a (spackZeros (smax @a)) == good maxBound + -- Overflow in last digit + && rdIntD @a (spack (smax1 @a)) == Nothing + -- Overflow in 2nd-last digit + && rdIntD @a (spack (smax10 @a)) == Nothing + -- Overflow across chunk boundary + && rdIntD @a (spackLong1 (smax @a)) == Nothing + -- Overflow within chunk + && rdIntD @a (spackLong2 (smax @a)) == Nothing + -- Sign with no digits + && rdIntD @a (LC.pack "+ foo") == Nothing + && rdIntD @a (LC.pack "-bar") == Nothing + -- + rdIntBounds :: forall a. RdInt a => Bool + rdIntBounds = + rdWordBounds @a + -- Lower bound + && rdIntD @a (spack (smin @a)) == good minBound + -- With leading signs + && rdIntD @a (spackPlus (smax @a)) == good maxBound + && rdIntD @a (spackMinus (smax @a)) == good (negate maxBound) + -- With leading zeros + && rdIntD @a (spackZeros (smin @a)) == good minBound + -- Overflow in last digit + && rdIntD @a (spack (smin1 @a)) == Nothing + -- Overflow in 2nd-last digit + && rdIntD @a (spack (smin10 @a)) == Nothing + -- Overflow across chunk boundary + && rdIntD @a (spackLong1 (smin @a)) == Nothing + -- Overflow within chunk + && rdIntD @a (spackLong2 (smin @a)) == Nothing ------------------------------------------------------------------------ @@ -536,12 +611,14 @@ misc_tests = , testProperty "strip" prop_strip , testProperty "isSpace" prop_isSpaceWord8 - , testProperty "readIntSafe" prop_readIntSafe - , testProperty "readIntUnsafe" prop_readIntUnsafe + , testProperty "readWordSafe" prop_readWordSafe + , testProperty "readWordUnsafe" prop_readWordUnsafe , testProperty "readIntBoundsCC" prop_readIntBoundsCC , testProperty "readIntBoundsLC" prop_readIntBoundsLC , testProperty "readIntegerSafe" prop_readIntegerSafe , testProperty "readIntegerUnsafe" prop_readIntegerUnsafe + , testProperty "readNaturalSafe" prop_readNaturalSafe + , testProperty "readNaturalUnsafe" prop_readNaturalUnsafe ] strictness_checks = diff --git a/tests/Properties/ByteString.hs b/tests/Properties/ByteString.hs index d87c9333c..a07c62bbc 100644 --- a/tests/Properties/ByteString.hs +++ b/tests/Properties/ByteString.hs @@ -4,7 +4,16 @@ -- License : BSD-style {-# LANGUAGE CPP #-} +{-# LANGUAGE AllowAmbiguousTypes #-} {-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE ScopedTypeVariables #-} +{-# LANGUAGE TypeApplications #-} + +-- We need @AllowAmbiguousTypes@ in order to be able to use @TypeApplications@ +-- to disambiguate the desired instance of class methods whose instance cannot +-- be inferred from the caller's context. We would otherwise have to use +-- proxy arguments. Here the 'RdInt' class methods used to generate tests for +-- all the various 'readInt' types require explicit type applications. -- We are happy to sacrifice optimizations in exchange for faster compilation, -- but need to test rewrite rules. As one can check using -ddump-rule-firings, @@ -30,8 +39,6 @@ import qualified Data.ByteString.Lazy as B import qualified Data.ByteString.Lazy.Internal as B (invariant) #endif -import Data.Word - #else #ifndef BYTESTRING_LAZY @@ -43,6 +50,9 @@ import qualified Data.ByteString.Lazy.Char8 as B import qualified Data.ByteString.Lazy.Internal as B (invariant) #endif +import Data.Int +import Numeric.Natural (Natural) + import Text.Read #endif @@ -54,6 +64,7 @@ import qualified Data.List as List import Data.Semigroup import Data.String import Data.Tuple +import Data.Word import Test.Tasty import Test.Tasty.QuickCheck import QuickCheckUtils @@ -64,6 +75,42 @@ toElem = id #else toElem :: Char8 -> Char toElem (Char8 c) = c + +class (Integral a, Show a) => RdInt a where + bread :: B.ByteString -> Maybe (a, B.ByteString) + sread :: String -> Maybe (a, String) + +instance RdInt Int where { bread = B.readInt; sread = readInt } +instance RdInt Int8 where { bread = B.readInt8; sread = readInt8 } +instance RdInt Int16 where { bread = B.readInt16; sread = readInt16 } +instance RdInt Int32 where { bread = B.readInt32; sread = readInt32 } +instance RdInt Int64 where { bread = B.readInt64; sread = readInt64 } +-- +instance RdInt Word where { bread = B.readWord; sread = readWord } +instance RdInt Word8 where { bread = B.readWord8; sread = readWord8 } +instance RdInt Word16 where { bread = B.readWord16; sread = readWord16 } +instance RdInt Word32 where { bread = B.readWord32; sread = readWord32 } +instance RdInt Word64 where { bread = B.readWord64; sread = readWord64 } +-- +instance RdInt Integer where { bread = B.readInteger; sread = readInteger } +instance RdInt Natural where { bread = B.readNatural; sread = readNatural } + +instance Arbitrary Natural where + arbitrary = i2n <$> arbitrary + where i2n :: Integer -> Natural + i2n i | i >= 0 = fromIntegral i + | otherwise = fromIntegral $ negate i + +testRdInt :: forall a. (Arbitrary a, RdInt a) => String -> TestTree +testRdInt s = testGroup s $ + [ testProperty "from string" $ \ prefix value suffix -> + let si = show @a value + b = prefix <> B.pack si <> suffix + in fmap (second B.unpack) (bread @a b) + === sread @a (B.unpack prefix ++ si ++ B.unpack suffix) + , testProperty "from number" $ \n -> + bread @a (B.pack (show n)) === Just (n, B.empty) + ] #endif tests :: [TestTree] @@ -520,14 +567,18 @@ tests = #ifdef BYTESTRING_CHAR8 , testProperty "isString" $ \x -> x === fromString (B.unpack x) - , testProperty "readInt 1" $ - \x -> fmap (second B.unpack) (B.readInt x) === readInt (B.unpack x) - , testProperty "readInt 2" $ - \n -> B.readInt (B.pack (show n)) === Just (n, B.empty) - , testProperty "readInteger 1" $ - \x -> fmap (second B.unpack) (B.readInteger x) === readInteger (B.unpack x) - , testProperty "readInteger 2" $ - \n -> B.readInteger (B.pack (show n)) === Just (n, B.empty) + , testRdInt @Int "readInt" + , testRdInt @Int8 "readInt8" + , testRdInt @Int16 "readInt16" + , testRdInt @Int32 "readInt32" + , testRdInt @Int64 "readInt64" + , testRdInt @Word "readWord" + , testRdInt @Word8 "readWord8" + , testRdInt @Word16 "readWord16" + , testRdInt @Word32 "readWord32" + , testRdInt @Word64 "readWord64" + , testRdInt @Integer "readInteger" + , testRdInt @Natural "readNatural" , testProperty "lines" $ \x -> map B.unpack (B.lines x) === lines (B.unpack x) , testProperty "lines \\n" $ once $ @@ -608,11 +659,71 @@ readInt xs = case readInteger xs of | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) otherwise -> Nothing +readWord :: String -> Maybe (Word, String) +readWord xs = case readIntegerUnsigned xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readInt8 :: String -> Maybe (Int8, String) +readInt8 xs = case readInteger xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readWord8 :: String -> Maybe (Word8, String) +readWord8 xs = case readIntegerUnsigned xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readInt16 :: String -> Maybe (Int16, String) +readInt16 xs = case readInteger xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readWord16 :: String -> Maybe (Word16, String) +readWord16 xs = case readIntegerUnsigned xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readInt32 :: String -> Maybe (Int32, String) +readInt32 xs = case readInteger xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readWord32 :: String -> Maybe (Word32, String) +readWord32 xs = case readIntegerUnsigned xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readInt64 :: String -> Maybe (Int64, String) +readInt64 xs = case readInteger xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + +readWord64 :: String -> Maybe (Word64, String) +readWord64 xs = case readIntegerUnsigned xs of + Just (y, zs) + | y' <- fromInteger y, toInteger y' == y -> Just (y', zs) + otherwise -> Nothing + readInteger :: String -> Maybe (Integer, String) readInteger ('+' : xs) = readIntegerUnsigned xs readInteger ('-' : xs) = fmap (first negate) (readIntegerUnsigned xs) readInteger xs = readIntegerUnsigned xs +readNatural :: String -> Maybe (Natural, String) +readNatural xs = case readIntegerUnsigned xs of + Just (y, zs) + | y >= 0 -> Just (fromIntegral @Integer @Natural y, zs) + _ -> Nothing + readIntegerUnsigned :: String -> Maybe (Integer, String) readIntegerUnsigned xs = case readMaybe ys of Just y -> Just (y, zs)