diff --git a/readme.md b/readme.md index 1da2753..076bba3 100644 --- a/readme.md +++ b/readme.md @@ -5,6 +5,7 @@ [![Downloads](https://img.shields.io/nuget/dt/Devlooped.Web.svg?color=green)](https://www.nuget.org/packages/Devlooped.Web) [![License](https://img.shields.io/github/license/devlooped/web.svg?color=blue)](https://github.com/devlooped/web/blob/main/license.txt) + Read HTML as XML and query it with CSS over XLinq. No need to learn an entirely new object model for a page 🤘. @@ -32,6 +33,29 @@ receiving an `HtmlReaderSettings`. The underlying parsing is performed by the amazing [SgmlReader](https://www.nuget.org/packages/Microsoft.Xml.SgmlReader) library by Microsoft's [Chris Lovett](http://lovettsoftware.com/). +In addition, the following extension methods make it easier to work +with XML documents where you want to query with CSS or XPath without +having to deal with XML namespaces: + +```csharp +using System.Xml; +using System.Xml.Linq; +using Devlooped.Web; + +var doc = XDocument.Load("doc.xml"); +// Will remove all xmlns declarations, and allow querying elements +// as if none had namespaces, returns the root element +XElement nons = doc.RemoveNamespaces(); + +// Alternatively, you can also ignore at the XmlReader level +using var reader = XmlReader.Create("doc.xml").IgnoreNamespaces(); +doc = XDocument.Load(reader); + +// Finally, you can also skip elements at the reader level +using var skipReader = XmlReader.Create("doc.xml").SkipElements("foo", "bar"); +doc = XDocument.Load(skipReader); +``` + ## CSS At the moment, supports the following CSS selector features: @@ -76,6 +100,8 @@ Non-CSS features: * `[text()*=val]`: Represents an element whose text contents contains at least one instance of the substring "val". If "val" is the empty string then the selector does not represent anything. 
+ + # Dogfooding [![CI Version](https://img.shields.io/endpoint?url=https://shields.kzu.io/vpre/Devlooped.Web/main&label=nuget.ci&color=brightgreen)](https://pkg.kzu.io/index.json) @@ -91,6 +117,7 @@ The versioning scheme for packages is: - Branch builds: *42.42.42-*`[BRANCH]`.`[COMMITS]` + # Sponsors diff --git a/src/Tests/XmlTests.cs b/src/Tests/XmlTests.cs new file mode 100644 index 0000000..c2c1178 --- /dev/null +++ b/src/Tests/XmlTests.cs @@ -0,0 +1,43 @@ +using System.Xml; +using System.Xml.Linq; +using System.Xml.XPath; + +namespace Tests; + +public record XmlTests(ITestOutputHelper Output) +{ + [Fact] + public void RemoveNamespacesFromElement() + { + var doc = XDocument.Load("package.opf"); + var nons = doc.Root!.RemoveNamespaces(); + + var xmlns = new XmlNamespaceManager(new NameTable()); + xmlns.AddNamespace("opf", "http://www.idpf.org/2007/opf"); + xmlns.AddNamespace("dc", "http://purl.org/dc/elements/1.1/"); + + var yearns = doc.XPathSelectElement("/opf:package/opf:metadata/opf:meta[@property='dcterms:date']", xmlns)?.Value; + + // NOTE: since we're at the element level now, we don't need to reference the root element + var year = nons.XPathSelectElement("/metadata/meta[@property='dcterms:date']")?.Value; + + //Output.WriteLine(doc.Root!.Elements().First().ToString()); + //Output.WriteLine(nons.Elements().First().ToString()); + + Assert.NotNull(yearns); + Assert.NotNull(year); + + Assert.Equal(yearns, year); + } + + [Fact] + public void RemoveElementsFromReader() + { + using var reader = XmlReader.Create("package.opf").SkipElements("manifest"); + var doc = XDocument.Load(reader); + + var all = XDocument.Load("package.opf"); + + Assert.NotEqual(doc.Root!.Elements().Count(), all.Root!.Elements().Count()); + } +} diff --git a/src/Tests/package.opf b/src/Tests/package.opf new file mode 100644 index 0000000..2cfb694 --- /dev/null +++ b/src/Tests/package.opf @@ -0,0 +1,229 @@ + + + + Gideon the Ninth + 9781250313171 + 15 + en-US + Tamsyn Muir + Tom Doherty 
Associates + urn:isbn:9781250313188 + 15 + All rights reserved + main + 2020-03-11T20:41:25Z + Tamsyn Muir + 2020 + auto + auto + + textual + textual,visual + structuralNavigation + This publication conforms to the EPUB Accessibility specification at WCAG Level A. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/Web/HtmlDocument.cs b/src/Web/HtmlDocument.cs index abc3260..5cd4044 100644 --- a/src/Web/HtmlDocument.cs +++ b/src/Web/HtmlDocument.cs @@ -197,84 +197,5 @@ static XmlReader Configure(SgmlReader reader, HtmlReaderSettings settings) return result; } - - /// - /// Removes all XML namespaces, since for HTML content it's typically - /// irrelevant. - /// - class IgnoreXmlNsReader : XmlWrappingReader - { - const string XmlNsNamespace = "http://www.w3.org/2000/xmlns/"; - - public IgnoreXmlNsReader(XmlReader baseReader) : base(baseReader) { } - - public override int AttributeCount - { - get - { - var count = 0; - for (var go = MoveToFirstAttribute(); go; go = MoveToNextAttribute()) - count++; - - return count; - } - } - - public override bool MoveToFirstAttribute() - { - var moved = base.MoveToFirstAttribute(); - while (moved && (IsXmlNs || IsLocalXmlNs)) - moved = MoveToNextAttribute(); - - if (!moved) - base.MoveToElement(); - - return moved; - } - - public override bool MoveToNextAttribute() - { - var moved = base.MoveToNextAttribute(); - while (moved && (IsXmlNs || IsLocalXmlNs)) - moved = MoveToNextAttribute(); - - return moved; - } - - /// - /// We only support the xml prefix, used for xml:lang and xml:space - /// built-in text handling in XHTML. 
- /// - public override string Prefix => base.Prefix == "xml" ? "xml" : ""; - - public override string NamespaceURI => Prefix == "xml" ? base.NamespaceURI : ""; - - bool IsXmlNs => base.NamespaceURI == XmlNsNamespace; - - bool IsLocalXmlNs => Prefix == "xmlns"; - } - - /// - /// Removes all XML namespaces, since for HTML content it's typically - /// irrelevant. - /// - class SkipElementsReader : XmlWrappingReader - { - readonly HashSet skipElements; - - public SkipElementsReader(XmlReader baseReader, string[] skipElements) : base(baseReader) - { - this.skipElements = new HashSet(skipElements, StringComparer.OrdinalIgnoreCase); - } - - public override bool Read() - { - var read = base.Read(); - if (read && base.NodeType == XmlNodeType.Element && skipElements.Contains(LocalName)) - base.Skip(); - - return read; - } - } } diff --git a/src/Web/IgnoreXmlNsReader.cs b/src/Web/IgnoreXmlNsReader.cs new file mode 100644 index 0000000..1b9c491 --- /dev/null +++ b/src/Web/IgnoreXmlNsReader.cs @@ -0,0 +1,60 @@ +using System.Xml; + +namespace Devlooped.Web; + +/// +/// Removes all XML namespaces, since for HTML content it's typically +/// irrelevant. 
+/// +class IgnoreXmlNsReader : XmlWrappingReader +{ + const string XmlNsNamespace = "http://www.w3.org/2000/xmlns/"; + + public IgnoreXmlNsReader(XmlReader baseReader) : base(baseReader) { } + + public override int AttributeCount + { + get + { + var count = 0; + for (var go = MoveToFirstAttribute(); go; go = MoveToNextAttribute()) + count++; + + return count; + } + } + + public override bool MoveToFirstAttribute() + { + var moved = base.MoveToFirstAttribute(); + while (moved && (IsXmlNs || IsLocalXmlNs)) + moved = MoveToNextAttribute(); + + if (!moved) + base.MoveToElement(); + + return moved; + } + + public override bool MoveToNextAttribute() + { + var moved = base.MoveToNextAttribute(); + while (moved && (IsXmlNs || IsLocalXmlNs)) + moved = MoveToNextAttribute(); + + return moved; + } + + /// + /// We only support the xml prefix, used for xml:lang and xml:space + /// built-in text handling in XHTML. + /// + public override string Prefix => base.Prefix == "xml" ? "xml" : ""; + + public override string NamespaceURI => Prefix == "xml" ? base.NamespaceURI : ""; + + bool IsXmlNs => base.NamespaceURI == XmlNsNamespace; + + bool IsLocalXmlNs => Prefix == "xmlns"; +} + diff --git a/src/Web/SkipElementsReader.cs b/src/Web/SkipElementsReader.cs new file mode 100644 index 0000000..bdbb029 --- /dev/null +++ b/src/Web/SkipElementsReader.cs @@ -0,0 +1,27 @@ +using System; +using System.Collections.Generic; +using System.Xml; + +namespace Devlooped.Web; + +/// +/// Ignores specific elements from the input XML. 
+/// +class SkipElementsReader : XmlWrappingReader +{ + readonly HashSet skipElements; + + public SkipElementsReader(XmlReader baseReader, string[] skipElements) : base(baseReader) + { + this.skipElements = new HashSet(skipElements, StringComparer.OrdinalIgnoreCase); + } + + public override bool Read() + { + var read = base.Read(); + if (read && base.NodeType == XmlNodeType.Element && skipElements.Contains(LocalName)) + base.Skip(); + + return read; + } +} diff --git a/src/Web/Web.csproj b/src/Web/Web.csproj index 302c892..b69d996 100644 --- a/src/Web/Web.csproj +++ b/src/Web/Web.csproj @@ -20,7 +20,6 @@ - \ No newline at end of file diff --git a/src/Web/XmlExtensions.cs b/src/Web/XmlExtensions.cs new file mode 100644 index 0000000..dd6da5a --- /dev/null +++ b/src/Web/XmlExtensions.cs @@ -0,0 +1,48 @@ +using System.ComponentModel; +using Devlooped.Web; + +namespace System.Xml +{ + /// + /// Extension methods for . + /// + [EditorBrowsable(EditorBrowsableState.Never)] + public static class XmlReaderExtensions + { + /// + /// Creates a wrapping reader that ignores all XML namespace declarations, + /// so that all resulting elements and attributes have no namespaces. + /// + public static XmlReader IgnoreNamespaces(this XmlReader reader) + => new IgnoreXmlNsReader(reader); + + /// + /// Creates a wrapping reader that skips elements (and their child nodes) with + /// the given local names (without namespace, if any). + /// + public static XmlReader SkipElements(this XmlReader reader, params string[] localNames) + => new SkipElementsReader(reader, localNames); + } +} + +namespace System.Xml.Linq +{ + /// + /// Extension methods for . + /// + [EditorBrowsable(EditorBrowsableState.Never)] + public static class XElementExtensions + { + /// + /// Returns a clone of the node, with XML namespaces removed. 
+ /// + public static XElement RemoveNamespaces(this XElement element) + => XElement.Load(element.CreateReader().IgnoreNamespaces()); + + /// + /// Returns a clone of the root node, with XML namespaces removed. + /// + public static XElement? RemoveNamespaces(this XDocument document) + => document.Root == null ? null : XElement.Load(document.Root.CreateReader().IgnoreNamespaces()); + } +} \ No newline at end of file diff --git a/src/Web/readme.md b/src/Web/readme.md new file mode 100644 index 0000000..8104118 --- /dev/null +++ b/src/Web/readme.md @@ -0,0 +1,2 @@ + + \ No newline at end of file