Skip to content

Commit

Permalink
Fix html5lib#6: dom2sax crash by replacing dom2sax with a generic to_sax
Browse files Browse the repository at this point in the history
This moves the functionality to a new treeadapters module (where
later the adapters from test_treewalker.py will get moved) and
removes the previous dom2sax function.
  • Loading branch information
gsnedders committed Jun 16, 2013
1 parent 9bc70e6 commit 3ff5e72
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 78 deletions.
4 changes: 4 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ Released on May 17, 2013
longer supported. ``html5lib.treebuilders.getTreeBuilder("dom")`` will
return the default DOM treebuilder, which uses ``xml.dom.minidom``.

* Removed ``dom2sax`` from DOM treebuilders. It has been replaced by
``treeadapters.sax.to_sax`` which is generic and supports any
treewalker; it also resolves all known bugs with ``dom2sax``.

* Optional heuristic character encoding detection now based on
``charade`` for Python 2.6 - 3.3 compatibility.

Expand Down
18 changes: 18 additions & 0 deletions html5lib/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,24 @@
(namespaces["mathml"], "mtext")
))

adjustForeignAttributes = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
"xlink:href": ("xlink", "href", namespaces["xlink"]),
"xlink:role": ("xlink", "role", namespaces["xlink"]),
"xlink:show": ("xlink", "show", namespaces["xlink"]),
"xlink:title": ("xlink", "title", namespaces["xlink"]),
"xlink:type": ("xlink", "type", namespaces["xlink"]),
"xml:base": ("xml", "base", namespaces["xml"]),
"xml:lang": ("xml", "lang", namespaces["xml"]),
"xml:space": ("xml", "space", namespaces["xml"]),
"xmlns": (None, "xmlns", namespaces["xmlns"]),
"xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
}

unadjustForeignAttributes = dict([((ns, local), qname) for qname, (prefix, local, ns) in
adjustForeignAttributes.items()])

spaceCharacters = frozenset((
"\t",
"\n",
Expand Down
16 changes: 2 additions & 14 deletions html5lib/html5parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .constants import cdataElements, rcdataElements
from .constants import tokenTypes, ReparseException, namespaces
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
from .constants import adjustForeignAttributes as adjustForeignAttributesMap


def parse(doc, treebuilder="etree", encoding=None,
Expand Down Expand Up @@ -333,20 +334,7 @@ def adjustSVGAttributes(self, token):
del token["data"][originalName]

def adjustForeignAttributes(self, token):
replacements = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
"xlink:href": ("xlink", "href", namespaces["xlink"]),
"xlink:role": ("xlink", "role", namespaces["xlink"]),
"xlink:show": ("xlink", "show", namespaces["xlink"]),
"xlink:title": ("xlink", "title", namespaces["xlink"]),
"xlink:type": ("xlink", "type", namespaces["xlink"]),
"xml:base": ("xml", "base", namespaces["xml"]),
"xml:lang": ("xml", "lang", namespaces["xml"]),
"xml:space": ("xml", "space", namespaces["xml"]),
"xmlns": (None, "xmlns", namespaces["xmlns"]),
"xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
}
replacements = adjustForeignAttributesMap

for originalName in token["data"].keys():
if originalName in replacements:
Expand Down
Empty file.
44 changes: 44 additions & 0 deletions html5lib/treeadapters/sax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from __future__ import absolute_import, division, unicode_literals

from xml.sax.xmlreader import AttributesNSImpl

from ..constants import adjustForeignAttributes, unadjustForeignAttributes

prefix_mapping = {}
for prefix, localName, namespace in adjustForeignAttributes.values():
if prefix is not None:
prefix_mapping[prefix] = namespace


def to_sax(walker, handler):
"""Call SAX-like content handler based on treewalker walker"""
handler.startDocument()
for prefix, namespace in prefix_mapping.items():
handler.startPrefixMapping(prefix, namespace)

for token in walker:
type = token["type"]
if type == "Doctype":
continue
elif type in ("StartTag", "EmptyTag"):
attrs = AttributesNSImpl(token["data"],
unadjustForeignAttributes)
handler.startElementNS((token["namespace"], token["name"]),
token["name"],
attrs)
if type == "EmptyTag":
handler.endElementNS((token["namespace"], token["name"]),
token["name"])
elif type == "EndTag":
handler.endElementNS((token["namespace"], token["name"]),
token["name"])
elif type in ("Characters", "SpaceCharacters"):
handler.characters(token["data"])
elif type == "Comment":
pass
else:
assert False, "Unknown token type"

for prefix, namespace in prefix_mapping.items():
handler.endPrefixMapping(prefix)
handler.endDocument()
65 changes: 1 addition & 64 deletions html5lib/treebuilders/dom.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import absolute_import, division, unicode_literals


from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
from xml.dom import minidom, Node
import weakref

from . import _base
Expand Down Expand Up @@ -220,69 +220,6 @@ def serializeElement(element, indent=0):

return "\n".join(rv)

def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
handler.startElement(node.nodeName, node.attributes)
for child in node.childNodes:
dom2sax(child, handler, nsmap)
handler.endElement(node.nodeName)
else:
attributes = dict(node.attributes.itemsNS())

# gather namespace declarations
prefixes = []
for attrname in list(node.attributes.keys()):
attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI is None and attr.nodeName.startswith('xmlns'))):
prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
handler.startPrefixMapping(prefix, attr.nodeValue)
prefixes.append(prefix)
nsmap = nsmap.copy()
nsmap[prefix] = attr.nodeValue
del attributes[(attr.namespaceURI, attr.nodeName)]

# apply namespace declarations
for attrname in list(node.attributes.keys()):
attr = node.getAttributeNode(attrname)
if attr.namespaceURI is None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0]
if prefix in nsmap:
del attributes[(attr.namespaceURI, attr.nodeName)]
attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue

# SAX events
ns = node.namespaceURI or nsmap.get(None, None)
handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
for child in node.childNodes:
dom2sax(child, handler, nsmap)
handler.endElementNS((ns, node.nodeName), node.nodeName)
for prefix in prefixes:
handler.endPrefixMapping(prefix)

elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
handler.characters(node.nodeValue)

elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes:
dom2sax(child, handler, nsmap)
handler.endDocument()

elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
for child in node.childNodes:
dom2sax(child, handler, nsmap)

else:
# ATTRIBUTE_NODE
# ENTITY_NODE
# PROCESSING_INSTRUCTION_NODE
# COMMENT_NODE
# DOCUMENT_TYPE_NODE
# NOTATION_NODE
pass

return locals()


Expand Down

0 comments on commit 3ff5e72

Please sign in to comment.