# Lookup table for namespaced ("foreign") attributes.  Each key is an
# attribute name as it appears on a token; each value is the triple
# (prefix, local name, namespace URI) that the name is adjusted to.
adjustForeignAttributes = {
    # XLink attributes
    "xlink:actuate": ("xlink", "actuate", namespaces["xlink"]),
    "xlink:arcrole": ("xlink", "arcrole", namespaces["xlink"]),
    "xlink:href": ("xlink", "href", namespaces["xlink"]),
    "xlink:role": ("xlink", "role", namespaces["xlink"]),
    "xlink:show": ("xlink", "show", namespaces["xlink"]),
    "xlink:title": ("xlink", "title", namespaces["xlink"]),
    "xlink:type": ("xlink", "type", namespaces["xlink"]),
    # XML attributes
    "xml:base": ("xml", "base", namespaces["xml"]),
    "xml:lang": ("xml", "lang", namespaces["xml"]),
    "xml:space": ("xml", "space", namespaces["xml"]),
    # Namespace declarations (the bare ``xmlns`` has no prefix)
    "xmlns": (None, "xmlns", namespaces["xmlns"]),
    "xmlns:xlink": ("xmlns", "xlink", namespaces["xmlns"])
}

# Inverse of the table above: (namespace URI, local name) -> original
# qualified attribute name.  The prefix is not part of the key.
unadjustForeignAttributes = dict(
    ((namespace_uri, local_name), qualified_name)
    for qualified_name, (_prefix, local_name, namespace_uri)
    in adjustForeignAttributes.items())
def to_sax(walker, handler):
    """Call SAX-like content handler based on treewalker walker

    Replays the token stream produced by ``walker`` as namespace-aware SAX
    events on ``handler``.

    :arg walker: iterable of token dicts.  Every token has a ``"type"``
        key; tag tokens also carry ``"namespace"`` and ``"name"``, and
        start/empty tags and character tokens carry ``"data"``.
    :arg handler: object providing ``startDocument``, ``endDocument``,
        ``startPrefixMapping``, ``endPrefixMapping``, ``startElementNS``,
        ``endElementNS`` and ``characters``.

    :raises AssertionError: if a token has a type this function does not
        know how to translate.

    ``Doctype`` and ``Comment`` tokens produce no events.  Every prefix in
    the module-level ``prefix_mapping`` is declared once for the whole
    document, right after ``startDocument`` and released right before
    ``endDocument``.
    """
    handler.startDocument()
    for prefix, namespace in prefix_mapping.items():
        handler.startPrefixMapping(prefix, namespace)

    for token in walker:
        token_type = token["type"]

        if token_type in ("Doctype", "Comment"):
            # Nothing is emitted for these token types.
            continue

        if token_type in ("StartTag", "EmptyTag"):
            element_name = (token["namespace"], token["name"])
            attributes = token["data"]
            # AttributesNSImpl expects ``qnames`` to describe exactly the
            # attributes of *this* element.  Passing the shared foreign
            # attribute table instead makes getQNames() report attributes
            # the element does not have and makes every qname lookup of an
            # ordinary attribute raise KeyError.  Build a per-element map:
            # known foreign attributes get their prefixed name back, all
            # others use their local name.
            # NOTE(review): assumes attribute keys are
            # (namespace, local name) tuples -- confirm against the
            # treewalkers.
            qnames = dict(
                (attr_name,
                 unadjustForeignAttributes.get(attr_name, attr_name[1]))
                for attr_name in attributes)
            handler.startElementNS(element_name,
                                   token["name"],
                                   AttributesNSImpl(attributes, qnames))
            if token_type == "EmptyTag":
                # An empty tag has no matching EndTag token, so close the
                # element immediately.
                handler.endElementNS(element_name, token["name"])
        elif token_type == "EndTag":
            handler.endElementNS((token["namespace"], token["name"]),
                                 token["name"])
        elif token_type in ("Characters", "SpaceCharacters"):
            handler.characters(token["data"])
        else:
            # Raised explicitly (rather than ``assert False``) so the check
            # is not stripped when Python runs with -O.
            raise AssertionError("Unknown token type")

    for prefix in prefix_mapping:
        handler.endPrefixMapping(prefix)
    handler.endDocument()
node.getAttributeNode(attrname) - if attr.namespaceURI is None and ':' in attr.nodeName: - prefix = attr.nodeName.split(':')[0] - if prefix in nsmap: - del attributes[(attr.namespaceURI, attr.nodeName)] - attributes[(nsmap[prefix], attr.nodeName)] = attr.nodeValue - - # SAX events - ns = node.namespaceURI or nsmap.get(None, None) - handler.startElementNS((ns, node.nodeName), node.nodeName, attributes) - for child in node.childNodes: - dom2sax(child, handler, nsmap) - handler.endElementNS((ns, node.nodeName), node.nodeName) - for prefix in prefixes: - handler.endPrefixMapping(prefix) - - elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]: - handler.characters(node.nodeValue) - - elif node.nodeType == Node.DOCUMENT_NODE: - handler.startDocument() - for child in node.childNodes: - dom2sax(child, handler, nsmap) - handler.endDocument() - - elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE: - for child in node.childNodes: - dom2sax(child, handler, nsmap) - - else: - # ATTRIBUTE_NODE - # ENTITY_NODE - # PROCESSING_INSTRUCTION_NODE - # COMMENT_NODE - # DOCUMENT_TYPE_NODE - # NOTATION_NODE - pass - return locals()