Skip to content

Commit

Permalink
Fix unpaired close tags and self-closing tags
Browse files Browse the repository at this point in the history
google#251 assumed that all tags are closed properly.

This assumption doesn't stand for cases like:
1. Self-closing tags such as `<img>` don't have corresponding close tags.
2. Unpaired close tags are still valid HTML.

This patch supports these cases by assuming that all open tags that don't
nest correctly or that don't close are automatically closed.

This isn't the full HTML "adoption agency algorithm", but it should be
good enough for the needs of BudouX.

Fixes google#355
  • Loading branch information
kojiishi committed Nov 10, 2023
1 parent 0d812b2 commit 0df2fb2
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 3 deletions.
28 changes: 25 additions & 3 deletions budoux/html_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,18 @@
SKIP_NODES: typing.Set[str] = set(json.load(f))


class ElementState:
    """Represents the state for an element.

    Instances are pushed onto the parser's element stack when a start tag is
    seen, so the skip state in effect before that tag can be restored when the
    tag is closed.

    Attributes:
      tag (str): The tag name.
      to_skip (bool): Whether the content should be skipped or not.
    """

    def __init__(self, tag: str, to_skip: bool) -> None:
        self.tag = tag
        self.to_skip = to_skip

    def __repr__(self) -> str:
        # Debug-friendly form; useful when inspecting the element stack.
        return '%s(tag=%r, to_skip=%r)' % (
            type(self).__name__, self.tag, self.to_skip)


class TextContentExtractor(HTMLParser):
"""An HTML parser to extract text content.
Expand Down Expand Up @@ -61,7 +73,7 @@ def __init__(self, chunks: typing.List[str], separator: str):
self.separator = separator
self.to_skip = False
self.scan_index = 0
self.element_stack: queue.LifoQueue[bool] = queue.LifoQueue()
self.element_stack: queue.LifoQueue[ElementState] = queue.LifoQueue()

def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
attr_pairs = []
Expand All @@ -71,7 +83,7 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
else:
attr_pairs.append(' %s="%s"' % (attr[0], attr[1]))
encoded_attrs = ''.join(attr_pairs)
self.element_stack.put(self.to_skip)
self.element_stack.put(ElementState(tag, self.to_skip))
if tag.upper() in SKIP_NODES:
if not self.to_skip and self.chunks_joined[self.scan_index] == SEP:
self.scan_index += 1
Expand All @@ -81,7 +93,17 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:

def handle_endtag(self, tag: str) -> None:
    """Emits the close tag and restores the skip state of its open tag.

    The element stack is searched from the top for the open tag matching
    `tag`. Entries above the match are discarded, i.e. open tags that were
    never closed (such as `<img>`) or that are nested incorrectly are treated
    as implicitly closed by this close tag. When a match is found, `to_skip`
    is reset to the value saved when that tag was opened.

    If no open tag matches, the close tag is unpaired: it is still written to
    the output, but the element stack and `to_skip` are left untouched so the
    state of the elements that are still open is not lost.

    Args:
      tag (str): The name of the tag being closed.
    """
    self.output += '</%s>' % (tag)
    # Entries popped while searching; kept so they can be put back if the
    # close tag turns out to have no matching open tag.
    implicitly_closed = []
    while not self.element_stack.empty():
        state = self.element_stack.get_nowait()
        if state.tag == tag:
            self.to_skip = state.to_skip
            return
        # If the close tag doesn't match the open tag, remove it and keep
        # looking. This means that close tags close their corresponding open
        # tags. e.g., `<span>abc<img>def</span>` or `<p>abc<span>def</p>` are
        # both valid HTML as per the HTML spec.
        # Note the HTML "adoption agency algorithm" isn't fully supported.
        # See https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
        implicitly_closed.append(state)
    # Unpaired close tag (e.g. the second `</p>` in `<p>abc</p></p>`, or a
    # stray `</span>` inside `<nobr>`): restore the stack in its original
    # order so enclosing elements can still be closed correctly later.
    for state in reversed(implicitly_closed):
        self.element_stack.put(state)

def handle_data(self, data: str) -> None:
for char in data:
Expand Down
32 changes: 32 additions & 0 deletions tests/test_html_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,38 @@ def test_output(self) -> None:
self.assertEqual(resolver.output, expected,
'WBR tags should be inserted as specified by chunks.')

def test_unpaired(self) -> None:
    """Feeds markup with a surplus `</p>` and checks the resolved output."""
    html = '<p>abcdef</p></p>'
    resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '<wbr>')
    resolver.feed(html)
    # The extra close tag is expected to be echoed as-is after the paired one.
    self.assertEqual(resolver.output, '<p>abc<wbr>def</p></p>',
                     'Unpaired close tag should not cause errors.')

def test_nobr(self) -> None:
    """Checks that a chunk boundary falling inside `<nobr>` emits no WBR."""
    html = '<p>ab<nobr>cde</nobr>f</p>'
    resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '<wbr>')
    resolver.feed(html)
    # The 'abc' | 'def' boundary lies within the NOBR element, so the output
    # is expected to be identical to the input.
    self.assertEqual(resolver.output, '<p>ab<nobr>cde</nobr>f</p>',
                     'WBR tags should not be inserted if in NOBR.')

def test_after_nobr(self) -> None:
    """Checks that WBR insertion resumes once a `<nobr>` has been closed."""
    html = '<p>ab<nobr>xy</nobr>abcdef</p>'
    resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '<wbr>')
    resolver.feed(html)
    # The chunk boundary lies after `</nobr>`, so a separator is expected.
    self.assertEqual(resolver.output, '<p>ab<nobr>xy</nobr>abc<wbr>def</p>',
                     'WBR tags should be inserted if after NOBR.')

def test_img_in_nobr(self) -> None:
    """Checks resolution when an unclosed `<img>` sits inside `<nobr>`."""
    html = '<p>ab<nobr>x<img>y</nobr>abcdef</p>'
    resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '<wbr>')
    resolver.feed(html)
    # `<img>` has no close tag; the separator is still expected at the chunk
    # boundary that follows `</nobr>`.
    self.assertEqual(resolver.output,
                     '<p>ab<nobr>x<img>y</nobr>abc<wbr>def</p>',
                     'WBR tags should not be inserted if NOBR.')


class TestResolve(unittest.TestCase):

Expand Down

0 comments on commit 0df2fb2

Please sign in to comment.