From a70c46a740e5e98903603907a701b4a40faf9916 Mon Sep 17 00:00:00 2001 From: Jappie Klooster Date: Wed, 17 Aug 2022 15:47:26 +0200 Subject: [PATCH 1/8] Adds a conduit interface. 1. I want to compare it for speed against the callback interface 2. This allows you to close a conduit early. --- src/Codec/Xlsx/Parser/Stream.hs | 80 +++++++++++++++++++++++++++------ 1 file changed, 67 insertions(+), 13 deletions(-) diff --git a/src/Codec/Xlsx/Parser/Stream.hs b/src/Codec/Xlsx/Parser/Stream.hs index 4ee5cc7..27ddab2 100644 --- a/src/Codec/Xlsx/Parser/Stream.hs +++ b/src/Codec/Xlsx/Parser/Stream.hs @@ -43,7 +43,9 @@ module Codec.Xlsx.Parser.Stream , wiSheets , getWorkbookInfo , CellRow + -- * using a sheet , readSheet + , getSheetConduit , countRowsInSheet , collectItems -- ** Index @@ -74,6 +76,7 @@ import Codec.Xlsx.Types.Internal.Relationships (Relationship (..), Relationships (..)) import Conduit (PrimMonad, (.|)) import qualified Conduit as C +import qualified Data.Conduit.Combinators as CC import qualified Data.Vector as V #ifdef USE_MICROLENS import Lens.Micro @@ -85,6 +88,7 @@ import Lens.Micro.TH import Control.Lens #endif import Codec.Xlsx.Parser.Internal +import Control.Exception(throwIO) import Control.Monad.Catch import Control.Monad.Except import Control.Monad.Reader @@ -109,6 +113,7 @@ import Data.XML.Types import GHC.Generics import Control.DeepSeq import Codec.Xlsx.Parser.Internal.Memoize +import qualified Data.ByteString.Lazy as BL import qualified Codec.Xlsx.Parser.Stream.HexpatInternal as HexpatInternal import Control.Monad.Base @@ -281,7 +286,7 @@ parseSharedStringss = do else do let state0 = initialSharedStrings byteSrc <- Zip.getEntrySource sharedStrsSel - st <- liftIO $ runExpat state0 byteSrc $ \evs -> forM_ evs $ \ev -> do + st <- liftIO $ runCallbackExpat state0 byteSrc $ \evs -> forM_ evs $ \ev -> do mTxt <- parseSharedStrings ev for_ mTxt $ \txt -> ss_list %= (`DL.snoc` txt) @@ -295,7 +300,7 @@ readWorkbookInfo :: Zip.ZipArchive WorkbookInfo readWorkbookInfo = do sel <- Zip.mkEntrySelector "xl/workbook.xml" src <- Zip.getEntrySource sel - sheets <- liftIO $ runExpat [] src $ \evs -> forM_ evs $ \case + sheets <- liftIO $ runCallbackExpat [] src $ \evs -> forM_ evs $ \case StartElement ("sheet" :: ByteString) attrs -> do nm <- lookupBy "name" attrs sheetId <- lookupBy "sheetId" attrs @@ -318,7 +323,7 @@ readWorkbookRelationships :: Zip.ZipArchive Relationships readWorkbookRelationships = do sel <- Zip.mkEntrySelector "xl/_rels/workbook.xml.rels" src <- Zip.getEntrySource sel - liftIO $ fmap Relationships $ runExpat mempty src $ \evs -> forM_ evs $ \case + liftIO $ fmap Relationships $ runCallbackExpat mempty src $ \evs -> forM_ evs $ \case StartElement ("Relationship" :: ByteString) attrs -> do rId <- lookupBy "Id" attrs rTarget <- lookupBy "Target" attrs @@ -378,14 +383,43 @@ getSheetXmlSource sheetId = do Just <$> liftZip (Zip.getEntrySource sheetSel) _ -> pure Nothing -{-# SCC runExpat #-} -runExpat :: forall state tag text. +getSheetConduit :: (MonadIO m, PrimMonad m, MonadThrow m, C.MonadResource m) + => SheetIndex -> + XlsxM (Maybe (ConduitT () SheetItem m ())) +getSheetConduit (MkSheetIndex sheetId) = do + msource <- getSheetXmlSource sheetId + initState <- makeInitialSheetState (MkSheetIndex sheetId) + pure $ msource <&> \source -> + C.evalStateC initState $ source + .| expatConduit + .| saxRowConduit + .| CC.map (MkSheetItem sheetId) + +expatConduit :: + (GenericXMLString tag, GenericXMLString text, Monad m) => + ConduitT ByteString (SAXEvent tag text) m () +expatConduit = + CC.concatMap (Hexpat.parse (ParseOptions Nothing Nothing) . BL.fromStrict) + +saxRowConduit :: + (MonadIO m, HasSheetState m) => + ConduitT (SAXEvent ByteString Text) Row m () +saxRowConduit = + CC.concatMapM $ \sax -> do + row <- saxToRow sax + case row of + RowError err -> liftIO $ throwIO err -- crash + RowInProgress -> pure $ Nothing -- filter + RowCompleted completed -> pure $ Just completed -- result + +{-# SCC runCallbackExpat #-} +runCallbackExpat :: forall state tag text. (GenericXMLString tag, GenericXMLString text) => state -> ConduitT () ByteString (C.ResourceT IO) () -> ([SAXEvent tag text] -> StateT state IO ()) -> IO state -runExpat initialState byteSource handler = do +runCallbackExpat initialState byteSource handler = do -- Set up state ref <- newIORef initialState -- Set up parser and callbacks @@ -416,19 +450,32 @@ runExpatForSheet :: (SheetItem -> IO ()) -> XlsxM () runExpatForSheet initState byteSource inner = - void $ liftIO $ runExpat initState byteSource handler + void $ liftIO $ runCallbackExpat initState byteSource handler where - sheetName = _ps_sheet_index initState + sheetIndex = _ps_sheet_index initState handler evs = forM_ evs $ \ev -> do + si <- saxToRow ev + liftIO $ case si of + RowError err -> throwIO err + RowInProgress -> pure () + RowCompleted completed -> inner $ MkSheetItem sheetIndex $ completed + +data SaxToRowResult = RowError SheetErrors -- ^ something went wrong + | RowInProgress -- ^ hasn't finished a row + | RowCompleted Row + +saxToRow :: + (HasSheetState m) => + SAXEvent ByteString Text -> m SaxToRowResult +saxToRow ev = do parseRes <- runExceptT $ matchHexpatEvent ev case parseRes of - Left err -> throwM err + Left err -> pure $ RowError err Right (Just cellRow) | not (IntMap.null cellRow) -> do rowNum <- use ps_cell_row_index - liftIO $ inner $ MkSheetItem sheetName $ MkRow rowNum cellRow - _ -> pure () - + pure $ RowCompleted $ MkRow rowNum cellRow + _ -> pure RowInProgress -- | this will collect the sheetitems in a list. -- useful for cases were memory is of no concern but a sheetitem -- type in a list is needed. @@ -483,6 +530,13 @@ readSheet (MkSheetIndex sheetId) inner = do runExpatForSheet sheetState0 sourceSheetXml inner pure True +makeInitialSheetState :: SheetIndex -> XlsxM SheetState +makeInitialSheetState (MkSheetIndex sheetId) = do + sharedStrs <- getOrParseSharedStringss + pure $ initialSheetState + & ps_shared_strings .~ sharedStrs + & ps_sheet_index .~ sheetId + -- | Returns number of rows in the given sheet (identified by the -- sheet's ID, AKA the sheetId attribute, AKA 'sheetInfoSheetId'), or Nothing -- if the sheet does not exist. Does not perform a full parse of the @@ -493,7 +547,7 @@ countRowsInSheet (MkSheetIndex sheetId) = do mSrc :: Maybe (ConduitT () ByteString (C.ResourceT IO) ()) <- getSheetXmlSource sheetId for mSrc $ \sourceSheetXml -> do - liftIO $ runExpat @Int @ByteString @ByteString 0 sourceSheetXml $ \evs -> + liftIO $ runCallbackExpat @Int @ByteString @ByteString 0 sourceSheetXml $ \evs -> forM_ evs $ \case StartElement "row" _ -> modify' (+1) _ -> pure () From 9f713a479deb28dc5832aa6079b12863ad3f723b Mon Sep 17 00:00:00 2001 From: Jappie Klooster Date: Wed, 17 Aug 2022 15:50:27 +0200 Subject: [PATCH 2/8] use state creation function --- src/Codec/Xlsx/Parser/Stream.hs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Codec/Xlsx/Parser/Stream.hs b/src/Codec/Xlsx/Parser/Stream.hs index 27ddab2..b90c150 100644 --- a/src/Codec/Xlsx/Parser/Stream.hs +++ b/src/Codec/Xlsx/Parser/Stream.hs @@ -523,10 +523,7 @@ readSheet (MkSheetIndex sheetId) inner = do case mSrc of Nothing -> pure False Just sourceSheetXml -> do - sharedStrs <- getOrParseSharedStringss - let sheetState0 = initialSheetState - & ps_shared_strings .~ sharedStrs - & ps_sheet_index .~ sheetId + sheetState0 <- makeInitialSheetState (MkSheetIndex sheetId) runExpatForSheet sheetState0 sourceSheetXml inner pure True From c3fa989bf34a5b2dcd427bc00224b4c7ed96bb80 Mon Sep 17 00:00:00 2001 From: Jappie Klooster Date: Mon, 22 Aug 2022 17:30:13 +0200 Subject: [PATCH 3/8] break the conduit interface more. I tried rexpressing the parsing true the approach taken by the callback method, but this made things worse. --- src/Codec/Xlsx/Parser/Stream.hs | 63 +++++++++++++++++++++------------ test/StreamTests.hs | 19 ++++++++++ 2 files changed, 59 insertions(+), 23 deletions(-) diff --git a/src/Codec/Xlsx/Parser/Stream.hs b/src/Codec/Xlsx/Parser/Stream.hs index b90c150..78fa2f3 100644 --- a/src/Codec/Xlsx/Parser/Stream.hs +++ b/src/Codec/Xlsx/Parser/Stream.hs @@ -111,9 +111,10 @@ import qualified Data.Text.Read as Read import Data.Traversable (for) import Data.XML.Types import GHC.Generics +import GHC.Stack import Control.DeepSeq import Codec.Xlsx.Parser.Internal.Memoize -import qualified Data.ByteString.Lazy as BL +import Control.Lens.Extras(is) import qualified Codec.Xlsx.Parser.Stream.HexpatInternal as HexpatInternal import Control.Monad.Base @@ -389,17 +390,23 @@ getSheetConduit :: (MonadIO m, PrimMonad m, MonadThrow m, C.MonadResource m) getSheetConduit (MkSheetIndex sheetId) = do msource <- getSheetXmlSource sheetId initState <- makeInitialSheetState (MkSheetIndex sheetId) - pure $ msource <&> \source -> - C.evalStateC initState $ source - .| expatConduit - .| saxRowConduit + (parseChunk, _getLoc) <- liftIO $ Hexpat.hexpatNewParser Nothing Nothing False + pure $ msource <&> \source -> source + .| expatConduit parseChunk + .| C.evalStateC initState saxRowConduit .| CC.map (MkSheetItem sheetId) expatConduit :: - (GenericXMLString tag, GenericXMLString text, Monad m) => + forall tag text m . + (GenericXMLString tag, GenericXMLString text, MonadIO m, HasCallStack) => + HParser -> ConduitT ByteString (SAXEvent tag text) m () -expatConduit = - CC.concatMap (Hexpat.parse (ParseOptions Nothing Nothing) . BL.fromStrict) +expatConduit parseChunk = do + mUpstream <- C.await + res <- liftIO $ case mUpstream of + Just upstreamBs -> processChunk @tag @text parseChunk False upstreamBs + Nothing -> processChunk @tag @text parseChunk True BS.empty + traverse_ C.yield res saxRowConduit :: (MonadIO m, HasSheetState m) => @@ -424,26 +431,36 @@ runCallbackExpat initialState byteSource handler = do ref <- newIORef initialState -- Set up parser and callbacks (parseChunk, _getLoc) <- Hexpat.hexpatNewParser Nothing Nothing False - let noExtra _ offset = pure ((), offset) - {-# SCC processChunk #-} - {-# INLINE processChunk #-} - processChunk isFinalChunk chunk = do - (buf, len, mError) <- parseChunk chunk isFinalChunk - saxen <- HexpatInternal.parseBuf buf len noExtra - case mError of - Just err -> error $ "expat error: " <> show err - Nothing -> do - state0 <- liftIO $ readIORef ref - state1 <- - {-# SCC "runExpat_runStateT_call" #-} - execStateT (handler $ map fst saxen) state0 + let callHandlerState hexpat = do + -- you'd say you could factor this out completly + -- dealing with state shouldn't be part at all of this function + state0 <- readIORef ref + state1 <- execStateT (handler hexpat) state0 writeIORef ref state1 + C.runConduitRes $ byteSource .| - C.awaitForever (liftIO . processChunk False) - processChunk True BS.empty + C.awaitForever (\x -> liftIO $ do + callHandlerState =<< processChunk @tag @text parseChunk False x + + ) + callHandlerState =<< processChunk @tag @text parseChunk True BS.empty readIORef ref +{-# SCC processChunk #-} +{-# INLINE processChunk #-} +processChunk :: forall tag text. + (GenericXMLString tag, GenericXMLString text, HasCallStack) => + HParser -> Bool -> ByteString -> IO [SAXEvent tag text] +processChunk parseChunk isFinalChunk chunk = do + (buf, len, mError) <- parseChunk chunk isFinalChunk + saxen <- HexpatInternal.parseBuf buf len noExtra + case mError of + Just err -> error $ "expat error: " <> show err + Nothing -> do + pure $ map fst saxen + where + noExtra _ offset = pure ((), offset) runExpatForSheet :: SheetState -> ConduitT () ByteString (C.ResourceT IO) () -> diff --git a/test/StreamTests.hs b/test/StreamTests.hs index bb8ffc4..b5eb2f8 100644 --- a/test/StreamTests.hs +++ b/test/StreamTests.hs @@ -25,6 +25,8 @@ import Codec.Xlsx import Codec.Xlsx.Parser.Stream import Conduit ((.|)) import qualified Conduit as C +import qualified Data.Conduit.Combinators as CC +import Control.Exception (bracket) import Control.Lens hiding (indexed) import Data.Set.Lens import qualified Data.ByteString.Lazy as LB @@ -67,6 +69,9 @@ tests = testGroup "Reader/Writer" [ testCase "Write as stream, see if memory based implementation can read it" $ readWrite simpleWorkbook , testCase "Write as stream, see if memory based implementation can read it" $ readWrite simpleWorkbookRow + , testCase "Write as stream, using conduit parser (simpleWorkbook)" $ readWriteConduit simpleWorkbook + , testCase "Write as stream, using conduit parser (simpleWorkbookRow)" $ readWriteConduit simpleWorkbookRow + , testCase "Write as stream, using conduit parser (bigWorkbook)" $ readWriteConduit bigWorkbook , testCase "Test a small workbook which has a fullblown sqaure" $ readWrite smallWorkbook , testCase "Test a big workbook as a full square which caused issues with zipstream \ The buffer of zipstream maybe 1kb, this workbook is big enough \ @@ -98,6 +103,20 @@ readWrite input = do Left x -> do throwIO x +readWriteConduit :: Xlsx -> IO () +readWriteConduit input = do + BS.writeFile "testinput.xlsx" (toBs input) + bs <- runXlsxM "testinput.xlsx" $ do + mConduit <- getSheetConduit $ makeIndex 1 + case mConduit of + Nothing -> error "sheet should exist" + Just conduit -> liftIO $ runConduitRes $ void (SW.writeXlsx SW.defaultSettings (conduit .| CC.map (view si_row))) .| C.foldC + + case toXlsxEither $ LB.fromStrict bs of + Right result -> + input @==? result + Left x -> do + throwIO x -- test if the input text is also the result (a property we use for convenience) sharedStringInputSameAsOutput :: Text -> Either String String sharedStringInputSameAsOutput someText = From 42c523703ab5280ffd9d53e0e0c4fa9a2fac15a2 Mon Sep 17 00:00:00 2001 From: Jappie Klooster Date: Wed, 19 Oct 2022 17:08:16 -0400 Subject: [PATCH 4/8] split of contuit in a testgroup --- test/StreamTests.hs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/StreamTests.hs b/test/StreamTests.hs index b5eb2f8..71995b7 100644 --- a/test/StreamTests.hs +++ b/test/StreamTests.hs @@ -69,9 +69,6 @@ tests = testGroup "Reader/Writer" [ testCase "Write as stream, see if memory based implementation can read it" $ readWrite simpleWorkbook , testCase "Write as stream, see if memory based implementation can read it" $ readWrite simpleWorkbookRow - , testCase "Write as stream, using conduit parser (simpleWorkbook)" $ readWriteConduit simpleWorkbook - , testCase "Write as stream, using conduit parser (simpleWorkbookRow)" $ readWriteConduit simpleWorkbookRow - , testCase "Write as stream, using conduit parser (bigWorkbook)" $ readWriteConduit bigWorkbook , testCase "Test a small workbook which has a fullblown sqaure" $ readWrite smallWorkbook , testCase "Test a big workbook as a full square which caused issues with zipstream \ The buffer of zipstream maybe 1kb, this workbook is big enough \ @@ -81,6 +78,12 @@ tests = $ readWrite bigWorkbook -- , testCase "Write as stream, see if memory based implementation can read it" $ readWrite testXlsx -- TODO forall SheetItem write that can be read + + , testGroup "Conduit" [ + testCase "Write as stream, using conduit parser (simpleWorkbook)" $ readWriteConduit simpleWorkbook + , testCase "Write as stream, using conduit parser (simpleWorkbookRow)" $ readWriteConduit simpleWorkbookRow + , testCase "Write as stream, using conduit parser (bigWorkbook)" $ readWriteConduit bigWorkbook + ] ], testGroup "Reader/inline strings" From 65554f737ea5c2d4ca821ded5b8d2c8c9a9b57e4 Mon Sep 17 00:00:00 2001 From: Jappie Klooster Date: Wed, 19 Oct 2022 19:56:55 -0400 Subject: [PATCH 5/8] Make it function --- src/Codec/Xlsx/Parser/Stream.hs | 27 +++++++++++++++++++-------- test/StreamTests.hs | 3 ++- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/Codec/Xlsx/Parser/Stream.hs b/src/Codec/Xlsx/Parser/Stream.hs index 78fa2f3..c82e46b 100644 --- a/src/Codec/Xlsx/Parser/Stream.hs +++ b/src/Codec/Xlsx/Parser/Stream.hs @@ -105,6 +105,7 @@ import qualified Data.IntMap.Strict as IntMap import qualified Data.Map.Strict as M import Data.Text (Text) import qualified Data.Text as T +import qualified Data.Text.Encoding as T import qualified Data.Text.Lazy as LT import qualified Data.Text.Lazy.Builder as TB import qualified Data.Text.Read as Read @@ -391,9 +392,10 @@ getSheetConduit (MkSheetIndex sheetId) = do msource <- getSheetXmlSource sheetId initState <- makeInitialSheetState (MkSheetIndex sheetId) (parseChunk, _getLoc) <- liftIO $ Hexpat.hexpatNewParser Nothing Nothing False + stateRef <- liftIO $ newIORef initState pure $ msource <&> \source -> source .| expatConduit parseChunk - .| C.evalStateC initState saxRowConduit + .| saxRowConduit stateRef .| CC.map (MkSheetItem sheetId) expatConduit :: @@ -403,17 +405,25 @@ expatConduit :: ConduitT ByteString (SAXEvent tag text) m () expatConduit parseChunk = do mUpstream <- C.await - res <- liftIO $ case mUpstream of - Just upstreamBs -> processChunk @tag @text parseChunk False upstreamBs - Nothing -> processChunk @tag @text parseChunk True BS.empty - traverse_ C.yield res + case mUpstream of + Just upstreamBs -> do + liftIO $ putStrLn "expatConduit" + liftIO $ putStrLn $ T.unpack $ T.decodeUtf8 upstreamBs + traverse_ C.yield =<< liftIO (processChunk @tag @text parseChunk False upstreamBs) + expatConduit parseChunk + Nothing -> do + liftIO $ putStrLn "expatConduit nothing" + traverse_ C.yield =<< liftIO (processChunk @tag @text parseChunk True BS.empty) saxRowConduit :: - (MonadIO m, HasSheetState m) => + (MonadIO m) => + IORef SheetState -> ConduitT (SAXEvent ByteString Text) Row m () -saxRowConduit = +saxRowConduit sheetStateRef = CC.concatMapM $ \sax -> do - row <- saxToRow sax + curState <- liftIO $ readIORef sheetStateRef + (row, nextState) <- flip runStateT curState $ saxToRow sax + liftIO $ writeIORef sheetStateRef nextState case row of RowError err -> liftIO $ throwIO err -- crash RowInProgress -> pure $ Nothing -- filter @@ -441,6 +451,7 @@ runCallbackExpat initialState byteSource handler = do C.runConduitRes $ byteSource .| C.awaitForever (\x -> liftIO $ do + putStrLn $ T.unpack $ T.decodeUtf8 x callHandlerState =<< processChunk @tag @text parseChunk False x ) diff --git a/test/StreamTests.hs b/test/StreamTests.hs index 71995b7..ad235b2 100644 --- a/test/StreamTests.hs +++ b/test/StreamTests.hs @@ -113,7 +113,8 @@ readWriteConduit input = do mConduit <- getSheetConduit $ makeIndex 1 case mConduit of Nothing -> error "sheet should exist" - Just conduit -> liftIO $ runConduitRes $ void (SW.writeXlsx SW.defaultSettings (conduit .| CC.map (view si_row))) .| C.foldC + -- TODO make this work with SW.writeXlsx, we the conduit variant can't do replays right now + Just conduit -> liftIO $ runConduitRes $ void (SW.writeXlsxWithSharedStrings SW.defaultSettings mempty (conduit .| CC.map (view si_row))) .| C.foldC case toXlsxEither $ LB.fromStrict bs of Right result -> From ff48974ea408b28765a87df5758d0754bea2ea2b Mon Sep 17 00:00:00 2001 From: Jappie Klooster Date: Wed, 19 Oct 2022 20:07:27 -0400 Subject: [PATCH 6/8] make the conduit replayable. add the tests --- src/Codec/Xlsx/Parser/Stream.hs | 15 +++++---------- test/StreamTests.hs | 23 +++++++++++++++++++++-- 2 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/Codec/Xlsx/Parser/Stream.hs b/src/Codec/Xlsx/Parser/Stream.hs index c82e46b..6e44508 100644 --- a/src/Codec/Xlsx/Parser/Stream.hs +++ b/src/Codec/Xlsx/Parser/Stream.hs @@ -105,7 +105,6 @@ import qualified Data.IntMap.Strict as IntMap import qualified Data.Map.Strict as M import Data.Text (Text) import qualified Data.Text as T -import qualified Data.Text.Encoding as T import qualified Data.Text.Lazy as LT import qualified Data.Text.Lazy.Builder as TB import qualified Data.Text.Read as Read @@ -115,7 +114,6 @@ import GHC.Generics import GHC.Stack import Control.DeepSeq import Codec.Xlsx.Parser.Internal.Memoize -import Control.Lens.Extras(is) import qualified Codec.Xlsx.Parser.Stream.HexpatInternal as HexpatInternal import Control.Monad.Base @@ -391,9 +389,10 @@ getSheetConduit :: (MonadIO m, PrimMonad m, MonadThrow m, C.MonadResource m) getSheetConduit (MkSheetIndex sheetId) = do msource <- getSheetXmlSource sheetId initState <- makeInitialSheetState (MkSheetIndex sheetId) - (parseChunk, _getLoc) <- liftIO $ Hexpat.hexpatNewParser Nothing Nothing False - stateRef <- liftIO $ newIORef initState - pure $ msource <&> \source -> source + pure $ msource <&> \source -> do + (parseChunk, _getLoc) <- liftIO $ Hexpat.hexpatNewParser Nothing Nothing False + stateRef <- liftIO $ newIORef initState + source .| expatConduit parseChunk .| saxRowConduit stateRef .| CC.map (MkSheetItem sheetId) @@ -407,12 +406,9 @@ expatConduit parseChunk = do mUpstream <- C.await case mUpstream of Just upstreamBs -> do - liftIO $ putStrLn "expatConduit" - liftIO $ putStrLn $ T.unpack $ T.decodeUtf8 upstreamBs traverse_ C.yield =<< liftIO (processChunk @tag @text parseChunk False upstreamBs) expatConduit parseChunk Nothing -> do - liftIO $ putStrLn "expatConduit nothing" traverse_ C.yield =<< liftIO (processChunk @tag @text parseChunk True BS.empty) saxRowConduit :: @@ -450,8 +446,7 @@ runCallbackExpat initialState byteSource handler = do C.runConduitRes $ byteSource .| - C.awaitForever (\x -> liftIO $ do - putStrLn $ T.unpack $ T.decodeUtf8 x + C.awaitForever (\x -> liftIO $ callHandlerState =<< processChunk @tag @text parseChunk False x ) diff --git a/test/StreamTests.hs b/test/StreamTests.hs index ad235b2..124dd4c 100644 --- a/test/StreamTests.hs +++ b/test/StreamTests.hs @@ -83,7 +83,12 @@ tests = testCase "Write as stream, using conduit parser (simpleWorkbook)" $ readWriteConduit simpleWorkbook , testCase "Write as stream, using conduit parser (simpleWorkbookRow)" $ readWriteConduit simpleWorkbookRow , testCase "Write as stream, using conduit parser (bigWorkbook)" $ readWriteConduit bigWorkbook - ] + , testGroup "No sst" [ + testCase "Write as stream, using conduit parser (simpleWorkbook)" $ readWriteConduitNoSst simpleWorkbook + , testCase "Write as stream, using conduit parser (simpleWorkbookRow)" $ readWriteConduitNoSst simpleWorkbookRow + , testCase "Write as stream, using conduit parser (bigWorkbook)" $ readWriteConduitNoSst bigWorkbook + ] + ] ], testGroup "Reader/inline strings" @@ -113,7 +118,21 @@ readWriteConduit input = do mConduit <- getSheetConduit $ makeIndex 1 case mConduit of Nothing -> error "sheet should exist" - -- TODO make this work with SW.writeXlsx, we the conduit variant can't do replays right now + Just conduit -> liftIO $ runConduitRes $ void (SW.writeXlsx SW.defaultSettings (conduit .| CC.map (view si_row))) .| C.foldC + + case toXlsxEither $ LB.fromStrict bs of + Right result -> + input @==? result + Left x -> do + throwIO x + +readWriteConduitNoSst :: Xlsx -> IO () +readWriteConduitNoSst input = do + BS.writeFile "testinput.xlsx" (toBs input) + bs <- runXlsxM "testinput.xlsx" $ do + mConduit <- getSheetConduit $ makeIndex 1 + case mConduit of + Nothing -> error "sheet should exist" Just conduit -> liftIO $ runConduitRes $ void (SW.writeXlsxWithSharedStrings SW.defaultSettings mempty (conduit .| CC.map (view si_row))) .| C.foldC case toXlsxEither $ LB.fromStrict bs of From 6c88335077d47ef1cfab6eaa794cbccb03924560 Mon Sep 17 00:00:00 2001 From: Jappie Klooster Date: Wed, 19 Oct 2022 20:10:19 -0400 Subject: [PATCH 7/8] add comment on why this test --- test/StreamTests.hs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/StreamTests.hs b/test/StreamTests.hs index 124dd4c..bcea367 100644 --- a/test/StreamTests.hs +++ b/test/StreamTests.hs @@ -126,6 +126,12 @@ readWriteConduit input = do Left x -> do throwIO x +-- No sst behaves differently frmo the normal writexlsx because +-- the sst table isn't first constructed. +-- this results in a single pass instead of a double pass. +-- it turns out that in certain cases this test would pass +-- but the writeXlsx wouldn't, which indicates brittleness within +-- the statefull hexpat parser. readWriteConduitNoSst :: Xlsx -> IO () readWriteConduitNoSst input = do BS.writeFile "testinput.xlsx" (toBs input) From 54c03a04d47e2e1a4aeb0af101491683d02c0ba3 Mon Sep 17 00:00:00 2001 From: Mauricio Fierro Date: Wed, 15 Feb 2023 12:52:30 -0500 Subject: [PATCH 8/8] Add benchmark for partial data reading The benchmark group covers reading a certain number of rows using the readSheet and getSheetConduit functions. --- benchmarks/Main.hs | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/benchmarks/Main.hs b/benchmarks/Main.hs index 72c9cee..f0e0d53 100644 --- a/benchmarks/Main.hs +++ b/benchmarks/Main.hs @@ -6,12 +6,16 @@ import Codec.Xlsx.Parser.Stream import Codec.Xlsx.Writer.Stream import Control.DeepSeq import Control.Lens -import Control.Monad (void) +import Control.Monad (unless, void) +import Control.Monad.IO.Class (liftIO) import Criterion.Main import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as LB +import Data.Conduit ((.|)) import qualified Data.Conduit as C import qualified Data.Conduit.Combinators as C +import Data.Foldable (for_) +import Data.IORef import Data.Maybe main :: IO () @@ -33,16 +37,30 @@ main = do , bench "with stream (counting)" $ nfIO $ runXlsxM filename $ countRowsInSheet idx , bench "with stream (reading)" $ nfIO $ runXlsxM filename $ readSheet idx (pure . rwhnf) ] + , bgroup + "read partial data (100 rows)" + [ bench "with readSheet" $ nfIO $ runXlsxM filename $ do + rowsRef <- liftIO $ newIORef [] + readSheet idx $ \ sheetItem -> do + existing <- readIORef rowsRef + unless (length existing > 100) $ + writeIORef rowsRef $ sheetItem:existing + liftIO $ readIORef rowsRef + , bench "with stream (conduit)" $ nfIO $ runXlsxM filename $ do + mConduit <- getSheetConduit idx + for_ mConduit $ \conduit -> + liftIO $ C.runConduitRes $ conduit .| C.take 100 .| C.sinkList + ] , bgroup "writeFile" [ bench "with xlsx" $ nf (fromXlsx 0) parsed , bench "with stream (no sst)" $ nfIO $ C.runConduit $ void (writeXlsxWithSharedStrings defaultSettings mempty $ C.yieldMany $ view si_row <$> items) - C..| C.fold + .| C.fold , bench "with stream (sst)" $ nfIO $ C.runConduit $ void (writeXlsx defaultSettings $ C.yieldMany $ view si_row <$> items) - C..| C.fold + .| C.fold ] ]