Skip to content

Commit

Permalink
Fix html5lib#6: dom2sax crash by rewriting dom2sax, now using a treewalker
Browse files Browse the repository at this point in the history

This moves the function to a new treeadapters module (the adapters from
test_treewalker.py will be moved there later). dom2sax remains for
backwards compatibility and calls the new function.
  • Loading branch information
gsnedders committed May 5, 2013
1 parent b4a8a6f commit f49a37c
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 77 deletions.
18 changes: 18 additions & 0 deletions html5lib/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,24 @@
(namespaces["mathml"], "mtext")
))

# Lookup table for foreign attributes: qualified attribute name ->
# (prefix, local name, namespace URI). The bare "xmlns" attribute has no
# prefix, so its prefix slot is None.
adjustForeignAttributes = dict((
    ("xlink:actuate", ("xlink", "actuate", namespaces["xlink"])),
    ("xlink:arcrole", ("xlink", "arcrole", namespaces["xlink"])),
    ("xlink:href", ("xlink", "href", namespaces["xlink"])),
    ("xlink:role", ("xlink", "role", namespaces["xlink"])),
    ("xlink:show", ("xlink", "show", namespaces["xlink"])),
    ("xlink:title", ("xlink", "title", namespaces["xlink"])),
    ("xlink:type", ("xlink", "type", namespaces["xlink"])),
    ("xml:base", ("xml", "base", namespaces["xml"])),
    ("xml:lang", ("xml", "lang", namespaces["xml"])),
    ("xml:space", ("xml", "space", namespaces["xml"])),
    ("xmlns", (None, "xmlns", namespaces["xmlns"])),
    ("xmlns:xlink", ("xmlns", "xlink", namespaces["xmlns"])),
))

# Reverse lookup of adjustForeignAttributes:
# (namespace URI, local name) -> qualified attribute name.
unadjustForeignAttributes = dict(
    ((namespace, localName), qualifiedName)
    for qualifiedName, (_, localName, namespace)
    in adjustForeignAttributes.items()
)

spaceCharacters = frozenset((
"\t",
"\n",
Expand Down
16 changes: 2 additions & 14 deletions html5lib/html5parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from .constants import cdataElements, rcdataElements
from .constants import tokenTypes, ReparseException, namespaces
from .constants import htmlIntegrationPointElements, mathmlTextIntegrationPointElements
from .constants import adjustForeignAttributes as adjustForeignAttributesMap


def parse(doc, treebuilder="simpletree", encoding=None,
Expand Down Expand Up @@ -333,20 +334,7 @@ def adjustSVGAttributes(self, token):
del token["data"][originalName]

def adjustForeignAttributes(self, token):
replacements = {
"xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
"xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
"xlink:href": ("xlink", "href", namespaces["xlink"]),
"xlink:role": ("xlink", "role", namespaces["xlink"]),
"xlink:show": ("xlink", "show", namespaces["xlink"]),
"xlink:title": ("xlink", "title", namespaces["xlink"]),
"xlink:type": ("xlink", "type", namespaces["xlink"]),
"xml:base": ("xml", "base", namespaces["xml"]),
"xml:lang": ("xml", "lang", namespaces["xml"]),
"xml:space": ("xml", "space", namespaces["xml"]),
"xmlns": (None, "xmlns", namespaces["xmlns"]),
"xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
}
replacements = adjustForeignAttributesMap

for originalName in token["data"].keys():
if originalName in replacements:
Expand Down
Empty file.
44 changes: 44 additions & 0 deletions html5lib/treeadapters/sax.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from __future__ import absolute_import, division, unicode_literals

from xml.sax.xmlreader import AttributesNSImpl

from ..constants import adjustForeignAttributes, unadjustForeignAttributes

# Prefix -> namespace URI, collected from every foreign attribute that has a
# prefix (entries whose prefix is None are left out).
prefix_mapping = dict(
    (prefix, namespace)
    for prefix, _, namespace in adjustForeignAttributes.values()
    if prefix is not None
)


def to_sax(walker, handler):
    """Call SAX-like content handler based on treewalker walker

    ``walker`` is an iterable of token dicts. Every token has a "type" key;
    tag tokens also carry "namespace" and "name", start/empty tags and
    character tokens carry "data".

    ``handler`` receives startDocument/endDocument, one prefix mapping per
    entry of ``prefix_mapping`` around the whole document, namespaced
    start/end element events, and character data. Doctype and Comment tokens
    produce no events. Any other unknown token type trips an assertion.
    """
    handler.startDocument()
    for prefix, namespace in prefix_mapping.items():
        handler.startPrefixMapping(prefix, namespace)

    for token in walker:
        token_type = token["type"]
        if token_type == "Doctype":
            continue
        elif token_type in ("StartTag", "EmptyTag"):
            element_name = (token["namespace"], token["name"])
            attributes = token["data"]
            # AttributesNSImpl expects a qname for each attribute actually
            # present. The keys are (namespace, local name) pairs, as
            # AttributesNSImpl requires. Foreign attributes get their
            # prefixed qname back; everything else uses the local name.
            # Passing the global table instead made getQNameByName() raise
            # KeyError for ordinary attributes and made getQNames() list
            # attributes the element does not have.
            qnames = dict(
                (attr_name,
                 unadjustForeignAttributes.get(attr_name, attr_name[1]))
                for attr_name in attributes)
            handler.startElementNS(element_name,
                                   token["name"],
                                   AttributesNSImpl(attributes, qnames))
            if token_type == "EmptyTag":
                # Empty tags have no matching EndTag token, so close them
                # immediately.
                handler.endElementNS(element_name, token["name"])
        elif token_type == "EndTag":
            handler.endElementNS((token["namespace"], token["name"]),
                                 token["name"])
        elif token_type in ("Characters", "SpaceCharacters"):
            handler.characters(token["data"])
        elif token_type == "Comment":
            pass
        else:
            assert False, "Unknown token type"

    for prefix in prefix_mapping:
        handler.endPrefixMapping(prefix)
    handler.endDocument()
70 changes: 7 additions & 63 deletions html5lib/treebuilders/dom.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
from __future__ import absolute_import, division, unicode_literals


from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
from xml.dom import minidom, Node
import weakref

from . import _base
from .. import constants
from ..constants import namespaces
from ..treeadapters import sax
from ..treewalkers import getTreeWalker
from ..utils import moduleFactoryFactory


Expand Down Expand Up @@ -219,68 +221,10 @@ def serializeElement(element, indent=0):

return "\n".join(rv)

# Recursively replay a DOM node and its descendants as SAX events on
# `handler`. `nsmap` maps namespace prefixes to namespace URIs; a falsy nsmap
# selects the plain startElement/endElement calls instead of the *NS variants.
# NOTE(review): the commit message describes this version as crashing, and a
# later definition of dom2sax in this listing replaces it with a call to
# sax.to_sax.
def dom2sax(node, handler, nsmap={'xml': XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
# No namespace handling requested: emit non-namespaced element events.
handler.startElement(node.nodeName, node.attributes)
for child in node.childNodes:
dom2sax(child, handler, nsmap)
handler.endElement(node.nodeName)
else:
# Working copy of the attributes, edited below before being handed on.
attributes = dict(node.attributes.itemsNS())

# gather namespace declarations
prefixes = []
for attrname in list(node.attributes.keys()):
attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI is None and attr.nodeName.startswith('xmlns'))):
# A bare "xmlns" attribute yields prefix None; any other declaration uses
# the whole nodeName as the prefix.
# NOTE(review): that keeps any "xmlns:" part in the prefix - confirm intent.
prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
handler.startPrefixMapping(prefix, attr.nodeValue)
prefixes.append(prefix)
# Copy before adding so the caller's dict (and the shared default argument)
# is never mutated.
nsmap = nsmap.copy()
nsmap[prefix] = attr.nodeValue
# NOTE(review): `attributes` comes from itemsNS(); confirm its keys are
# (namespaceURI, nodeName) and not (namespaceURI, localName), otherwise this
# del raises KeyError.
del attributes[(attr.namespaceURI, attr.nodeName)]

# apply namespace declarations
for attrname in list(node.attributes.keys()):
attr = node.getAttributeNode(attrname)
if attr.namespaceURI is None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0]
if prefix in nsmap:
# Re-key the attribute under the namespace URI its prefix maps to.
del attributes[(attr.namespaceURI, attr.nodeName)]
attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue

# SAX events
# Fall back to the namespace stored under the None prefix when the node has
# no namespaceURI of its own.
ns = node.namespaceURI or nsmap.get(None, None)
handler.startElementNS((ns, node.nodeName), node.nodeName, attributes)
for child in node.childNodes:
dom2sax(child, handler, nsmap)
handler.endElementNS((ns, node.nodeName), node.nodeName)
# End only the mappings that were started on this element.
for prefix in prefixes:
handler.endPrefixMapping(prefix)

elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
handler.characters(node.nodeValue)

elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes:
dom2sax(child, handler, nsmap)
handler.endDocument()

elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
# A fragment emits no events of its own; only its children are replayed.
for child in node.childNodes:
dom2sax(child, handler, nsmap)

else:
# All remaining node types are ignored:
# ATTRIBUTE_NODE
# ENTITY_NODE
# PROCESSING_INSTRUCTION_NODE
# COMMENT_NODE
# DOCUMENT_TYPE_NODE
# NOTATION_NODE
pass
def dom2sax(node, handler, nsmap=None):
    """Emit SAX events for the DOM ``node`` on ``handler``.

    Kept for backwards compatibility: the work is done by sax.to_sax, fed
    from a "dom" treewalker over ``node``. ``nsmap`` is accepted but not
    used.
    """
    walkerClass = getTreeWalker("dom", implementation=Dom)
    sax.to_sax(walkerClass(node), handler)

return locals()

Expand Down

0 comments on commit f49a37c

Please sign in to comment.