Account for Etree Elements in HTML Stash

By calling str on all stash elements we ensure they don't raise an error. Worst case, something like `<Element 'div' at 0x000001B2DAE94900>` gets inserted into the output. However, with the override in the md_in_html extension, we actually serialize and reinsert the original HTML. Worst case, an HTML block which should be parsed as Markdown gets skipped by the extension (`<div markdown="block"></div>` gets inserted into the output). The tricky part is testing as there should be no known cases where this ever occurs. Therefore, we forcefully pass an etree Element directly to the method in the test. That said, as Python-Markdown#1040 is unresolved at this point, I have tested locally with a real existing case and it works well. Related to Python-Markdown#1040.
waylan · Oct 14, 2020 · 7b099ad · 7b099ad
1 parent b4a399c
commit 7b099ad
Show file tree

Hide file tree

Showing 3 changed files with 33 additions and 2 deletions.
diff --git a/markdown/extensions/md_in_html.py b/markdown/extensions/md_in_html.py
@@ -17,6 +17,7 @@
 from . import Extension
 from ..blockprocessors import BlockProcessor
 from ..preprocessors import Preprocessor
+from ..postprocessors import RawHtmlPostprocessor
 from .. import util
 from ..htmlparser import HTMLExtractor
 import xml.etree.ElementTree as etree
@@ -263,6 +264,15 @@ def run(self, parent, blocks):
         return False
 
 
+class MarkdownInHTMLPostprocessor(RawHtmlPostprocessor):
+    def stash_to_string(self, text):
+        """ Override default to handle any etree elements still in the stash. """
+        if isinstance(text, etree.Element):
+            return self.md.serializer(text)
+        else:
+            return str(text)
+
+
 class MarkdownInHtmlExtension(Extension):
     """Add Markdown parsing in HTML to Markdown class."""
 
@@ -275,6 +285,8 @@ def extendMarkdown(self, md):
         md.parser.blockprocessors.register(
             MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105
         )
+        # Replace raw HTML postprocessor
+        md.postprocessors.register(MarkdownInHTMLPostprocessor(md), 'raw_html', 30)
 
 
 def makeExtension(**kwargs):  # pragma: no cover

diff --git a/markdown/postprocessors.py b/markdown/postprocessors.py
@@ -69,7 +69,7 @@ def run(self, text):
         """ Iterate over html stash and restore html. """
         replacements = OrderedDict()
         for i in range(self.md.htmlStash.html_counter):
-            html = self.md.htmlStash.rawHtmlBlocks[i]
+            html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[i])
             if self.isblocklevel(html):
                 replacements["<p>{}</p>".format(
                     self.md.htmlStash.get_placeholder(i))] = html
@@ -95,6 +95,10 @@ def isblocklevel(self, html):
             return self.md.is_block_level(m.group(1))
         return False
 
+    def stash_to_string(self, text):
+        """ Convert a stashed object to a string. """
+        return str(text)
+
 
 class AndSubstitutePostprocessor(Postprocessor):
     """ Restore valid entities """

diff --git a/tests/test_syntax/extensions/test_md_in_html.py b/tests/test_syntax/extensions/test_md_in_html.py
@@ -23,6 +23,21 @@
 from unittest import TestSuite
 from markdown.test_tools import TestCase
 from ..blocks.test_html_blocks import TestHTMLBlocks
+from markdown import Markdown
+from xml.etree.ElementTree import Element
+
+
+class TestMarkdownInHTMLPostProcessor(TestCase):
+    """ Ensure any remaining elements in HTML stash are properly serialized. """
+
+    def test_stash_to_string(self):
+        # There should be no known cases where this actually happens so we need to
+        # forcefully pass an etree Element to the method to ensure proper behavior.
+        element = Element('div')
+        element.text = 'Foo bar.'
+        md = Markdown(extensions=['md_in_html'])
+        result = md.postprocessors['raw_html'].stash_to_string(element)
+        self.assertEqual(result, '<div>Foo bar.</div>')
 
 
 class TestDefaultwMdInHTML(TestHTMLBlocks):
@@ -758,7 +773,7 @@ def test_md1_nested_footnote_ref(self):
 def load_tests(loader, tests, pattern):
     ''' Ensure TestHTMLBlocks doesn't get run twice by excluding it here. '''
     suite = TestSuite()
-    for test_class in [TestDefaultwMdInHTML, TestMdInHTML]:
+    for test_class in [TestDefaultwMdInHTML, TestMdInHTML, TestMarkdownInHTMLPostProcessor]:
         tests = loader.loadTestsFromTestCase(test_class)
         suite.addTests(tests)
     return suite