Skip to content

Commit

Permalink
HTML reader: improved table parsing.
Browse files Browse the repository at this point in the history
We now check explicitly for non-1 rowspan or colspan
attributes, and fail when we encounter them. Previously
we checked that each row had the same number of cells,
but that could be true even with rowspans/colspans.
And there are cases where it isn't true in tables that
we can handle fine -- e.g. when a tr element is empty.
So now we just pad rows with empty cells when needed.

Closes #3027.
  • Loading branch information
jgm committed Nov 26, 2016
1 parent 7b4a12a commit 5222572
Showing 1 changed file with 24 additions and 11 deletions.
35 changes: 24 additions & 11 deletions src/Text/Pandoc/Readers/HTML.hs
Original file line number Diff line number Diff line change
Expand Up @@ -435,17 +435,20 @@ pTable = try $ do
rowsLs <- many pTBody
rows' <- pOptInTag "tfoot" $ many pTr
TagClose _ <- pSatisfy (~== TagClose "table")
let rows = (concat rowsLs) ++ rows'
let rows'' = (concat rowsLs) ++ rows'
-- fail on empty table
guard $ not $ null head' && null rows
guard $ not $ null head' && null rows''
let isSinglePlain x = case B.toList x of
[] -> True
[Plain _] -> True
_ -> False
let isSimple = all isSinglePlain $ concat (head':rows)
let cols = length $ if null head' then head rows else head'
-- fail if there are colspans or rowspans
guard $ all (\r -> length r == cols) rows
let isSimple = all isSinglePlain $ concat (head':rows'')
let cols = length $ if null head' then head rows'' else head'
-- add empty cells to short rows
let addEmpties r = case length r - cols of
n | n > 1 -> r ++ replicate n []
| otherwise -> r
let rows = addEmpties rows''
let aligns = replicate cols AlignDefault
let widths = if null widths'
then if isSimple
Expand All @@ -471,10 +474,17 @@ pColgroup = try $ do
skipMany pBlank
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank

noColOrRowSpans :: Tag String -> Bool
noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"
where isNullOrOne x = case fromAttrib x t of
"" -> True
"1" -> True
_ -> False

pCell :: String -> TagParser [Blocks]
pCell celltype = try $ do
skipMany pBlank
res <- pInTags celltype block
res <- pInTags' celltype noColOrRowSpans block
skipMany pBlank
return [res]

Expand Down Expand Up @@ -695,10 +705,13 @@ pInlinesInTags :: String -> (Inlines -> Inlines)
-> TagParser Inlines
pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline

pInTags :: (Monoid a) => String -> TagParser a
-> TagParser a
pInTags tagtype parser = try $ do
pSatisfy (~== TagOpen tagtype [])
pInTags :: (Monoid a) => String -> TagParser a -> TagParser a
pInTags tagtype parser = pInTags' tagtype (const True) parser

pInTags' :: (Monoid a) => String -> (Tag String -> Bool) -> TagParser a
-> TagParser a
pInTags' tagtype tagtest parser = try $ do
pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
mconcat <$> manyTill parser (pCloses tagtype <|> eof)

-- parses p, preceeded by an optional opening tag
Expand Down

0 comments on commit 5222572

Please sign in to comment.