From c4e44bdc5d5017f2bdc8be61d9876a5ddbae1ba5 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Sun, 25 Oct 2020 20:29:37 +0100 Subject: [PATCH 1/4] Optimize isSpace functions --- Data/ByteString/Internal.hs | 18 +++++++++--------- bench/BenchAll.hs | 7 +++++++ 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/Data/ByteString/Internal.hs b/Data/ByteString/Internal.hs index b70829078..4f8be6383 100644 --- a/Data/ByteString/Internal.hs +++ b/Data/ByteString/Internal.hs @@ -671,16 +671,16 @@ c2w = fromIntegral . ord {-# INLINE c2w #-} -- | Selects words corresponding to white-space characters in the Latin-1 range --- ordered by frequency. isSpaceWord8 :: Word8 -> Bool -isSpaceWord8 w = - w == 0x20 || - w == 0x0A || -- LF, \n - w == 0x09 || -- HT, \t - w == 0x0C || -- FF, \f - w == 0x0D || -- CR, \r - w == 0x0B || -- VT, \v - w == 0xA0 -- spotted by QC.. +isSpaceWord8 w8 = + -- Avoid the cost of narrowing arithmetic results to Word8, + -- the conversion from Word8 to Word is free. + let w :: Word + !w = fromIntegral w8 + in w - 0x21 > 0x7e -- not [x21..0x9f] + && ( w == 0x20 -- SP + || w - 0x09 < 5 -- HT, NL, VT, FF, CR + || w == 0xa0 ) -- NBSP {-# INLINE isSpaceWord8 #-} -- | Selects white-space characters in the Latin-1 range diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs index 203491dc5..a573d76cb 100644 --- a/bench/BenchAll.hs +++ b/bench/BenchAll.hs @@ -101,6 +101,9 @@ byteStringChunksData = map (S.pack . replicate (4 ) . fromIntegral) intData oldByteStringChunksData :: [OldS.ByteString] oldByteStringChunksData = map (OldS.pack . replicate (4 ) . fromIntegral) intData +{-# NOINLINE loremIpsum #-} +loremIpsum :: S.ByteString +loremIpsum = S8.pack "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\nSed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?\n" -- benchmark wrappers --------------------- @@ -397,6 +400,10 @@ main = do ] ] , bgroup "sort" $ map (\s -> bench (S8.unpack s) $ nf S.sort s) sortInputs + , bgroup "words" + [ bench "lorem ipsum" $ nf S8.words loremIpsum + , bench "one huge word" $ nf S8.words byteStringData + ] , bgroup "folds" [ bgroup "foldl'" $ map (\s -> bench (show $ S.length s) $ nf (S.foldl' (\acc x -> acc + fromIntegral x) (0 :: Int)) s) foldInputs From 9127d6e1d018e1f3a1df0f7d4a8ef1fa0cbe3166 Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Mon, 26 Oct 2020 21:46:25 +0100 Subject: [PATCH 2/4] Additional quick filter in isSpaceWord8 --- Data/ByteString/Internal.hs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/Data/ByteString/Internal.hs b/Data/ByteString/Internal.hs index 4f8be6383..94e7c0d60 100644 --- a/Data/ByteString/Internal.hs +++ b/Data/ByteString/Internal.hs @@ -126,6 +126,7 @@ import Data.String (IsString(..)) import Control.Exception (assert) +import Data.Bits ((.&.)) import Data.Char (ord) import Data.Word (Word8) @@ -677,10 +678,11 @@ isSpaceWord8 w8 = -- the conversion from Word8 to Word is free. let w :: Word !w = fromIntegral w8 - in w - 0x21 > 0x7e -- not [x21..0x9f] - && ( w == 0x20 -- SP - || w - 0x09 < 5 -- HT, NL, VT, FF, CR - || w == 0xa0 ) -- NBSP + in w .&. 0x50 == 0 -- Quick non-whitespace filter + && w - 0x21 > 0x7e -- Second non-whitespace filter + && ( w == 0x20 -- SP + || w == 0xa0 -- NBSP + || w - 0x09 < 5) -- HT, NL, VT, FF, CR {-# INLINE isSpaceWord8 #-} -- | Selects white-space characters in the Latin-1 range From c1024c7be659b8980907d6b7440e1ed44e91fb5f Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Tue, 27 Oct 2020 19:25:35 +0100 Subject: [PATCH 3/4] Split lorem ipsum test string into more lines --- Data/ByteString/Internal.hs | 9 +-------- bench/BenchAll.hs | 9 ++++++++- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Data/ByteString/Internal.hs b/Data/ByteString/Internal.hs index 94e7c0d60..1d8e58b86 100644 --- a/Data/ByteString/Internal.hs +++ b/Data/ByteString/Internal.hs @@ -687,14 +687,7 @@ isSpaceWord8 w8 = -- | Selects white-space characters in the Latin-1 range isSpaceChar8 :: Char -> Bool -isSpaceChar8 c = - c == ' ' || - c == '\t' || - c == '\n' || - c == '\r' || - c == '\f' || - c == '\v' || - c == '\xa0' +isSpaceChar8 = isSpaceWord8 . c2w {-# INLINE isSpaceChar8 #-} overflowError :: String -> a diff --git a/bench/BenchAll.hs b/bench/BenchAll.hs index a573d76cb..18741894b 100644 --- a/bench/BenchAll.hs +++ b/bench/BenchAll.hs @@ -103,7 +103,14 @@ oldByteStringChunksData = map (OldS.pack . replicate (4 ) . fromIntegral) intDat {-# NOINLINE loremIpsum #-} loremIpsum :: S.ByteString -loremIpsum = S8.pack "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\nSed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?\n" +loremIpsum = S8.unlines $ map S8.pack + [ " Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor" + , "incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis" + , "nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat." + , "Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu" + , "fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in" + , "culpa qui officia deserunt mollit anim id est laborum." + ] -- benchmark wrappers --------------------- From 8be362729f3d1784b734fc19ca1ac9645110aa9a Mon Sep 17 00:00:00 2001 From: Dmitry Ivanov Date: Wed, 28 Oct 2020 19:33:27 +0100 Subject: [PATCH 4/4] Fix build with GHC<7.10 --- Data/ByteString/Internal.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Data/ByteString/Internal.hs b/Data/ByteString/Internal.hs index 1d8e58b86..432bd1c02 100644 --- a/Data/ByteString/Internal.hs +++ b/Data/ByteString/Internal.hs @@ -128,7 +128,7 @@ import Control.Exception (assert) import Data.Bits ((.&.)) import Data.Char (ord) -import Data.Word (Word8) +import Data.Word (Word8, Word) import Data.Typeable (Typeable) import Data.Data (Data(..), mkNoRepType)