Fix unpaired close tags and self-closing tags

google#251 assumed that all tags are closed properly. This assumption doesn't hold in cases such as: 1. Self-closing (void) tags such as `<img>`, which have no corresponding close tags. 2. Unpaired close tags, which are still valid HTML. This patch supports these cases by assuming that all open tags that don't nest correctly or that aren't closed are closed automatically. This isn't the full HTML "adoption agency algorithm", but it should be good enough for the needs of BudouX. Fixes google#355
kojiishi · Nov 10, 2023 · 0df2fb2 · 0df2fb2
1 parent 0d812b2
commit 0df2fb2
Show file tree

Hide file tree

Showing 2 changed files with 57 additions and 3 deletions.
diff --git a/budoux/html_processor.py b/budoux/html_processor.py
@@ -29,6 +29,18 @@
   SKIP_NODES: typing.Set[str] = set(json.load(f))
 
 
+class ElementState(object):
+  """Represents the state for an element.
+
+  Attributes:
+    tag (str): The tag name.
+    to_skip (bool): Whether the content should be skipped or not.
+  """
+  def __init__(self, tag: str, to_skip: bool) -> None:
+    self.tag = tag
+    self.to_skip = to_skip
+
+
 class TextContentExtractor(HTMLParser):
   """An HTML parser to extract text content.
 
@@ -61,7 +73,7 @@ def __init__(self, chunks: typing.List[str], separator: str):
     self.separator = separator
     self.to_skip = False
     self.scan_index = 0
-    self.element_stack: queue.LifoQueue[bool] = queue.LifoQueue()
+    self.element_stack: queue.LifoQueue[ElementState] = queue.LifoQueue()
 
   def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
     attr_pairs = []
@@ -71,7 +83,7 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
       else:
         attr_pairs.append(' %s="%s"' % (attr[0], attr[1]))
     encoded_attrs = ''.join(attr_pairs)
-    self.element_stack.put(self.to_skip)
+    self.element_stack.put(ElementState(tag, self.to_skip))
     if tag.upper() in SKIP_NODES:
       if not self.to_skip and self.chunks_joined[self.scan_index] == SEP:
         self.scan_index += 1
@@ -81,7 +93,17 @@ def handle_starttag(self, tag: str, attrs: HTMLAttr) -> None:
 
   def handle_endtag(self, tag: str) -> None:
     self.output += '</%s>' % (tag)
-    self.to_skip = self.element_stack.get_nowait()
+    while not self.element_stack.empty():
+      state = self.element_stack.get_nowait()
+      if state.tag == tag:
+        self.to_skip = state.to_skip
+        break
+      # If the close tag doesn't match the open tag, remove it and keep looking.
+      # This means that close tags close their corresponding open tags.
+      # e.g., `<span>abc<img>def</span>` or `<p>abc<span>def</p>` are both valid
+      # HTML as per the HTML spec.
+      # Note the HTML "adoption agency algorithm" isn't fully supported.
+      # See https://html.spec.whatwg.org/multipage/parsing.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
 
   def handle_data(self, data: str) -> None:
     for char in data:

diff --git a/tests/test_html_processor.py b/tests/test_html_processor.py
@@ -46,6 +46,38 @@ def test_output(self) -> None:
     self.assertEqual(resolver.output, expected,
                      'WBR tags should be inserted as specified by chunks.')
 
+  def test_unpaired(self) -> None:
+    input = '<p>abcdef</p></p>'
+    expected = '<p>abc<wbr>def</p></p>'
+    resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '<wbr>')
+    resolver.feed(input)
+    self.assertEqual(resolver.output, expected,
+                     'Unpaired close tag should not cause errors.')
+
+  def test_nobr(self) -> None:
+    input = '<p>ab<nobr>cde</nobr>f</p>'
+    expected = '<p>ab<nobr>cde</nobr>f</p>'
+    resolver = html_processor.HTMLChunkResolver(['abc', 'def'], '<wbr>')
+    resolver.feed(input)
+    self.assertEqual(resolver.output, expected,
+                     'WBR tags should not be inserted if in NOBR.')
+
+  def test_after_nobr(self) -> None:
+    input = '<p>ab<nobr>xy</nobr>abcdef</p>'
+    expected = '<p>ab<nobr>xy</nobr>abc<wbr>def</p>'
+    resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '<wbr>')
+    resolver.feed(input)
+    self.assertEqual(resolver.output, expected,
+                     'WBR tags should be inserted if after NOBR.')
+
+  def test_img_in_nobr(self) -> None:
+    input = '<p>ab<nobr>x<img>y</nobr>abcdef</p>'
+    expected = '<p>ab<nobr>x<img>y</nobr>abc<wbr>def</p>'
+    resolver = html_processor.HTMLChunkResolver(['abxyabc', 'def'], '<wbr>')
+    resolver.feed(input)
+    self.assertEqual(resolver.output, expected,
+                     'WBR tags should not be inserted if NOBR.')
+
 
 class TestResolve(unittest.TestCase):