Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse base64-encoded data URIs more efficiently #10434

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandoc.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,7 @@ library
Text.Pandoc.Class.Sandbox,
Text.Pandoc.Filter.Environment,
Text.Pandoc.Filter.JSON,
Text.Pandoc.Parsing.Base64,
Text.Pandoc.Parsing.Capabilities,
Text.Pandoc.Parsing.Citations,
Text.Pandoc.Parsing.General,
Expand Down
4 changes: 3 additions & 1 deletion src/Text/Pandoc/Parsing.hs
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ module Text.Pandoc.Parsing ( module Text.Pandoc.Sources,
Column,
ParseError,
errorMessages,
messageString
messageString,
parseBase64String
)
where

Expand Down Expand Up @@ -341,3 +342,4 @@ import Text.Pandoc.Parsing.State
SubstTable )
import Text.Pandoc.Parsing.Future
( askF, asksF, returnF, runF, Future(..) )
import Text.Pandoc.Parsing.Base64 (parseBase64String)
39 changes: 39 additions & 0 deletions src/Text/Pandoc/Parsing/Base64.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Parsing.Base64
Copyright : © 2024 Evan Silberman
License : GPL-2.0-or-later
Maintainer : John MacFarlane <[email protected]>

Parse large base64 strings efficiently within Pandoc's
normal parsing environment
-}

module Text.Pandoc.Parsing.Base64
( parseBase64String )

where

import Data.Text as T
import Data.Attoparsec.Text as A
import Text.Parsec (ParsecT, getInput, setInput, incSourceColumn)
import Text.Pandoc.Sources
import Control.Monad (mzero)

parseBase64String :: Monad m => ParsecT Sources u m Text
parseBase64String = do
Sources ((pos, txt):rest) <- getInput
let r = A.parse pBase64 txt
case r of
Done remaining consumed -> do
let pos' = incSourceColumn pos (T.length consumed)
setInput $ Sources ((pos', remaining):rest)
return consumed
_ -> mzero

pBase64 :: A.Parser Text
pBase64 = do
most <- A.takeWhile1 (A.inClass "A-Za-z0-9+/")
rest <- A.takeWhile (== '=')
return $ most <> rest
Comment on lines +25 to +39
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Two thoughts on this:

  1. Is attoparsec really necessary? Why not just Data.Text.takeWhile?
  2. My experience is that parsers like this, which just manipulate the input directly using getInput and setInput, are problematic in parsec because parsec doesn't realize that input has been consumed. I've had to use a regular parsec parser somewhere in there to make it realize this. One option is just something like count charsConsumed anyChar, and then you don't need to compute the end position manually...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. In this spot I can possibly just borrow the fast inClass function from attoparsec and use it with regular text takeWhile, will have to fiddle with it.
  2. will investigate, I took it for granted that I would make the parsec state happy by fiddling with the input as seen here but that was not based on deep understanding or rigorous analysis.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably fine to use attoparsec, but there might be a slight speedup if you can avoid it.

On 2: you could try putting this parser under many and see if parsec complains.

14 changes: 8 additions & 6 deletions src/Text/Pandoc/Readers/HTML.hs
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Shared (
addMetaField, extractSpaces, htmlSpanLikeElements, renderTags',
safeRead, tshow, formatCode)
import Text.Pandoc.URI (escapeURI)
import Text.Pandoc.URI (escapeURI, isBase64DataURI)
import Text.Pandoc.Walk
import Text.TeXMath (readMathML, writeTeX)
import qualified Data.Sequence as Seq
Expand Down Expand Up @@ -1219,8 +1219,10 @@ htmlTag f = try $ do

-- | Adjusts a url according to the document's base URL.
canonicalizeUrl :: PandocMonad m => Text -> TagParser m Text
canonicalizeUrl url = do
mbBaseHref <- baseHref <$> getState
return $ case (parseURIReference (T.unpack url), mbBaseHref) of
(Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
_ -> url
canonicalizeUrl url
| isBase64DataURI url = return url
| otherwise = do
mbBaseHref <- baseHref <$> getState
return $ case (parseURIReference (T.unpack url), mbBaseHref) of
(Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs)
_ -> url
26 changes: 24 additions & 2 deletions src/Text/Pandoc/Readers/Markdown.hs
Original file line number Diff line number Diff line change
Expand Up @@ -1822,13 +1822,35 @@ parenthesizedChars = do
result <- charsInBalanced '(' ')' litChar
return $ "(" <> result <> ")"

pBase64DataURI :: PandocMonad m => MarkdownParser m Text
pBase64DataURI = mconcat <$> sequence
[ (textStr "data:")
, (T.singleton <$> (letter <|> digit))
, restrictedName
, (T.singleton <$> char '/')
, restrictedName
, textStr ";" <* trace "cool"
, (mconcat <$> many mediaParam)
, textStr "base64," <* trace "fine"
, parseBase64String
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What if we replace l.1835 with

   many1Char (satisfy (A.inClass "A-Za-z0-9+/"))
,  manyChar (char '=')

and get rid of the special T.P.Parsing.Base64 module. What is the impact on performance on the sorts of files that were problematic before? I'd like to try the simplest possible thing before worrying about possible optimizations.

]
where
restrictedName = manyChar (alphaNum <|> oneOf "!#$&^_.+-")
mediaParam = mconcat <$> sequence
[ notFollowedBy (textStr "base64,") *> mempty -- XXX ???
, restrictedName
, textStr "="
, manyChar (noneOf ";")
, textStr ";"
]

-- source for a link, with optional title
source :: PandocMonad m => MarkdownParser m (Text, Text)
source = do
char '('
skipSpaces
let urlChunk =
try parenthesizedChars
let urlChunk = try pBase64DataURI
<|> try parenthesizedChars
<|> (notFollowedBy (oneOf " )") >> litChar)
<|> try (many1Char spaceChar <* notFollowedBy (oneOf "\"')"))
let sourceURL = T.unwords . T.words . T.concat <$> many urlChunk
Expand Down
40 changes: 36 additions & 4 deletions src/Text/Pandoc/URI.hs
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,26 @@ module Text.Pandoc.URI ( urlEncode
, isURI
, schemes
, uriPathToPath
, isBase64DataURI
) where
import qualified Network.HTTP.Types as HTTP
import qualified Text.Pandoc.UTF8 as UTF8
import qualified Data.Text as T
import qualified Data.Attoparsec.Text as A
import qualified Data.Set as Set
import Data.Char (isSpace, isAscii)
import Data.Either (isRight)
import Control.Applicative ((<|>))
import Network.URI (URI (uriScheme), parseURI, escapeURIString)

urlEncode :: T.Text -> T.Text
urlEncode = UTF8.toText . HTTP.urlEncode True . UTF8.fromText

-- | Escape whitespace and some punctuation characters in URI.
escapeURI :: T.Text -> T.Text
escapeURI = T.pack . escapeURIString (not . needsEscaping) . T.unpack
escapeURI t
| isBase64DataURI t = t
| otherwise = (T.pack . escapeURIString (not . needsEscaping) . T.unpack) t
where needsEscaping c = isSpace c || T.any (== c) "<>|\"{}[]^`"

--
Expand Down Expand Up @@ -87,12 +93,38 @@ schemes = Set.fromList
, "doi", "gemini", "isbn", "javascript", "pmid"
]

isBase64DataURI :: T.Text -> Bool
isBase64DataURI =
isRight . A.parseOnly pBase64DataURI

pBase64DataURI :: A.Parser T.Text
pBase64DataURI = mconcat <$> sequence
[ (A.string "data:")
, (T.singleton <$> (A.letter <|> A.digit))
, restrictedName
, (T.singleton <$> A.char '/')
, restrictedName
, (mconcat <$> A.many' mediaParam)
, A.string ";base64,"
, A.takeWhile1 (A.inClass "A-Za-z0-9+/")
, A.takeWhile (== '=')
]
where
restrictedName = A.takeWhile1 (A.inClass "0-9a-zA-Z!#$&^_.+-")
mediaParam = mconcat <$> sequence
[ (T.singleton <$> A.char ';')
, restrictedName
, (T.singleton <$> A.char '=')
, A.takeWhile (/= ';')
]

-- | Check if the string is a valid URL with a IANA or frequently used but
-- unofficial scheme (see @schemes@).
isURI :: T.Text -> Bool
isURI =
-- we URI-escape non-ASCII characters because otherwise parseURI will choke:
maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack
isURI t =
isBase64DataURI t ||
-- we URI-escape non-ASCII characters because otherwise parseURI will choke:
(maybe False hasKnownScheme . parseURI . escapeURIString isAscii . T.unpack) t
where
hasKnownScheme = (`Set.member` schemes) . T.toLower .
T.filter (/= ':') . T.pack . uriScheme
Expand Down
Loading