From 156f2eae94e290192bc75b412a05952bea9d0895 Mon Sep 17 00:00:00 2001 From: Leonid Dubinsky Date: Fri, 10 Sep 2021 14:50:32 -0400 Subject: [PATCH] XML allows for comments and processing instructions to be present before the start and after the end of the root element. Currently, `FactoryAdapter` does not capture those nodes, and `XMLLoader.loadXML` does not provide access to anything other than the root element anyway. This pull request addresses the issue. Note: at least with the JDK's Xerces, whitespace in the prolog and epilogue gets lost in parsing: the parser does not fire any white-space related events. --- jvm/src/test/scala/scala/xml/XMLTest.scala | 31 +++++++++++----- .../scala/scala/xml/factory/XMLLoader.scala | 35 +++++++++++++++---- .../scala/xml/parsing/FactoryAdapter.scala | 31 +++++++++++----- .../scala/xml/parsing/MarkupParser.scala | 2 ++ 4 files changed, 75 insertions(+), 24 deletions(-) diff --git a/jvm/src/test/scala/scala/xml/XMLTest.scala b/jvm/src/test/scala/scala/xml/XMLTest.scala index 23dae29a..e7f3624f 100644 --- a/jvm/src/test/scala/scala/xml/XMLTest.scala +++ b/jvm/src/test/scala/scala/xml/XMLTest.scala @@ -586,7 +586,7 @@ class XMLTestJVM { def issue508commentParsing: Unit = { // confirm that comments are processed correctly now roundtrip(" suffix") - roundtrip("prefix suffix") + roundtrip("prefix suffix") roundtrip("prefix suffix") roundtrip("prefix suffix") roundtrip("""prefix text") - //check("text") + roundtrip("prefix suffix") } @UnitTest @@ -613,7 +607,26 @@ class XMLTestJVM { roundtrip("""prefix suffix""".stripMargin) + | section]]> suffix""".stripMargin) + } + + def roundtripNodes(xml: String): Unit = assertEquals(xml, XML.loadStringNodes(xml).map(_.toString).mkString("")) + + @UnitTest + def xmlLoaderLoadNodes: Unit = { + roundtripNodes("text") + roundtripNodes("text") + roundtripNodes("""text""".stripMargin) + + roundtripNodes("text") + roundtripNodes("text") + + // Note: at least with the JDK's Xerces, whitespace in the prolog and epilogue gets lost in parsing: + // the parser does not fire any white-space related events, so: + // does not work: roundtripNodes(" ") + // does not work: roundtripNodes(" ") } @UnitTest diff --git a/shared/src/main/scala/scala/xml/factory/XMLLoader.scala b/shared/src/main/scala/scala/xml/factory/XMLLoader.scala index 7562c0c1..620e1b6e 100644 --- a/shared/src/main/scala/scala/xml/factory/XMLLoader.scala +++ b/shared/src/main/scala/scala/xml/factory/XMLLoader.scala @@ -51,19 +51,29 @@ trait XMLLoader[T <: Node] { * The methods available in scala.xml.XML use the XML parser in the JDK. */ def loadXML(source: InputSource, parser: SAXParser): T = { - val newAdapter = adapter + val result: FactoryAdapter = parse(source, parser) + result.rootElem.asInstanceOf[T] + } + + def loadXMLNodes(source: InputSource, parser: SAXParser): Seq[Node] = { + val result: FactoryAdapter = parse(source, parser) + result.prolog ++ (result.rootElem :: result.epilogue) + } + + private def parse(source: InputSource, parser: SAXParser): FactoryAdapter = { + val result: FactoryAdapter = adapter try { - parser.setProperty("http://xml.org/sax/properties/lexical-handler", newAdapter) + parser.setProperty("http://xml.org/sax/properties/lexical-handler", result) } catch { case _: SAXNotRecognizedException => } - newAdapter.scopeStack = TopScope :: newAdapter.scopeStack - parser.parse(source, newAdapter) - newAdapter.scopeStack = newAdapter.scopeStack.tail + result.scopeStack = TopScope :: result.scopeStack + parser.parse(source, result) + result.scopeStack = result.scopeStack.tail - newAdapter.rootElem.asInstanceOf[T] + result } /** Loads XML from the given file, file descriptor, or filename. */ @@ -80,4 +90,15 @@ trait XMLLoader[T <: Node] { /** Loads XML from the given String. */ def loadString(string: String): T = loadXML(fromString(string), parser) -} \ No newline at end of file + + /** Load XML nodes, including comments and processing instructions that precede and follow the root element. */ + def loadFileNodes(file: File): Seq[Node] = loadXMLNodes(fromFile(file), parser) + def loadFileNodes(fd: FileDescriptor): Seq[Node] = loadXMLNodes(fromFile(fd), parser) + def loadFileNodes(name: String): Seq[Node] = loadXMLNodes(fromFile(name), parser) + def loadNodes(is: InputStream): Seq[Node] = loadXMLNodes(fromInputStream(is), parser) + def loadNodes(reader: Reader): Seq[Node] = loadXMLNodes(fromReader(reader), parser) + def loadNodes(sysID: String): Seq[Node] = loadXMLNodes(fromSysId(sysID), parser) + def loadNodes(source: InputSource): Seq[Node] = loadXMLNodes(source, parser) + def loadNodes(url: URL): Seq[Node] = loadXMLNodes(fromInputStream(url.openStream()), parser) + def loadStringNodes(string: String): Seq[Node] = loadXMLNodes(fromString(string), parser) +} diff --git a/shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala b/shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala index 1ac7e9f0..18d32633 100644 --- a/shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala +++ b/shared/src/main/scala/scala/xml/parsing/FactoryAdapter.scala @@ -40,9 +40,11 @@ trait ConsoleErrorHandler extends DefaultHandler2 { * underlying SAX parser. */ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Node] { + var prolog: List[Node] = List.empty var rootElem: Node = _ + var epilogue: List[Node] = List.empty - val buffer = new StringBuilder() + val buffer: StringBuilder = new StringBuilder() private var inCDATA: Boolean = false /** List of attributes @@ -51,28 +53,28 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod * * @since 2.0.0 */ - var attribStack = List.empty[MetaData] + var attribStack: List[MetaData] = List.empty /** List of elements * * Previously was a mutable [[scala.collection.mutable.Stack Stack]], but is now a mutable reference to an immutable [[scala.collection.immutable.List List]]. * * @since 2.0.0 */ - var hStack = List.empty[Node] // [ element ] contains siblings + var hStack: List[Node] = List.empty // [ element ] contains siblings /** List of element names * * Previously was a mutable [[scala.collection.mutable.Stack Stack]], but is now a mutable reference to an immutable [[scala.collection.immutable.List List]]. * * @since 2.0.0 */ - var tagStack = List.empty[String] + var tagStack: List[String] = List.empty /** List of namespaces * * Previously was a mutable [[scala.collection.mutable.Stack Stack]], but is now a mutable reference to an immutable [[scala.collection.immutable.List List]]. * * @since 2.0.0 */ - var scopeStack = List.empty[NamespaceBinding] + var scopeStack: List[NamespaceBinding] = List.empty var curTag: String = _ var capture: Boolean = false @@ -123,7 +125,7 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod // ContentHandler methods // - val normalizeWhitespace = false + val normalizeWhitespace: Boolean = false /** * Capture characters, possibly normalizing whitespace. @@ -177,13 +179,20 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod attributes: Attributes): Unit = { captureText() + + // capture the prolog at the start of the root element + if (tagStack.isEmpty) { + prolog = hStack.reverse + hStack = List.empty + } + tagStack = curTag :: tagStack curTag = qname val localName = splitName(qname)._2 capture = nodeContainsText(localName) - hStack = null :: hStack + hStack = null :: hStack var m: MetaData = Null var scpe: NamespaceBinding = if (scopeStack.isEmpty) TopScope @@ -193,7 +202,7 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod val qname = attributes getQName i val value = attributes getValue i val (pre, key) = splitName(qname) - def nullIfEmpty(s: String) = if (s == "") null else s + def nullIfEmpty(s: String): String = if (s == "") null else s if (pre == "xmlns" || (pre == null && qname == "xmlns")) { val arg = if (pre == null) null else key @@ -250,6 +259,12 @@ abstract class FactoryAdapter extends DefaultHandler2 with factory.XMLLoader[Nod capture = curTag != null && nodeContainsText(curTag) // root level } + override def endDocument(): Unit = { + // capture the epilogue at the end of the document + epilogue = hStack.init.reverse + hStack = hStack.last :: Nil + } + /** * Processing instruction. */ diff --git a/shared/src/main/scala/scala/xml/parsing/MarkupParser.scala b/shared/src/main/scala/scala/xml/parsing/MarkupParser.scala index acd19485..60b7b5a4 100755 --- a/shared/src/main/scala/scala/xml/parsing/MarkupParser.scala +++ b/shared/src/main/scala/scala/xml/parsing/MarkupParser.scala @@ -98,6 +98,8 @@ trait MarkupParser extends MarkupParserCommon with TokenTests { var extIndex = -1 /** holds temporary values of pos */ + // Note: this is clearly an override, but if marked as such it causes a "...cannot override a mutable variable" + // error with Scala 3; does it work with Scala 3 if not explicitly marked as an override remains to be seen... var tmppos: Int = _ /** holds the next character */