diff --git a/readme.md b/readme.md
index 1da2753..076bba3 100644
--- a/readme.md
+++ b/readme.md
@@ -5,6 +5,7 @@
[![Downloads](https://img.shields.io/nuget/dt/Devlooped.Web.svg?color=green)](https://www.nuget.org/packages/Devlooped.Web)
[![License](https://img.shields.io/github/license/devlooped/web.svg?color=blue)](https://github.com/devlooped/web/blob/main/license.txt)
+
Read HTML as XML and query it with CSS over XLinq.
No need to learn an entirely new object model for a page 🤘.
@@ -32,6 +33,29 @@ receiving an `HtmlReaderSettings`.
The underlying parsing is performed by the amazing [SgmlReader](https://www.nuget.org/packages/Microsoft.Xml.SgmlReader)
library by Microsoft's [Chris Lovett](http://lovettsoftware.com/).
+In addition, the following extension methods make it easier to work
+with XML documents where you want to query with CSS or XPath without
+having to deal with XML namespaces:
+
+```csharp
+using System.Xml;
+using System.Xml.Linq;
+using Devlooped.Web;
+
+var doc = XDocument.Load("doc.xml");
+// Removes all xmlns declarations so elements can be queried as if none
+// had namespaces. Returns a namespace-free clone of the root element.
+XElement nons = doc.RemoveNamespaces();
+
+// Alternatively, you can also ignore at the XmlReader level
+using var reader = XmlReader.Create("doc.xml").IgnoreNamespaces();
+doc = XDocument.Load(reader);
+
+// Finally, you can also skip elements at the reader level
+using var skipping = XmlReader.Create("doc.xml").SkipElements("foo", "bar");
+doc = XDocument.Load(skipping);
+```
+
## CSS
At the moment, supports the following CSS selector features:
@@ -76,6 +100,8 @@ Non-CSS features:
* `[text()*=val]`: Represents an element whose text contents contains at least one instance of the
substring "val". If "val" is the empty string then the selector does not represent anything.
+
+
# Dogfooding
[![CI Version](https://img.shields.io/endpoint?url=https://shields.kzu.io/vpre/Devlooped.Web/main&label=nuget.ci&color=brightgreen)](https://pkg.kzu.io/index.json)
@@ -91,6 +117,7 @@ The versioning scheme for packages is:
- Branch builds: *42.42.42-*`[BRANCH]`.`[COMMITS]`
+
# Sponsors
diff --git a/src/Tests/XmlTests.cs b/src/Tests/XmlTests.cs
new file mode 100644
index 0000000..c2c1178
--- /dev/null
+++ b/src/Tests/XmlTests.cs
@@ -0,0 +1,43 @@
+using System.Xml;
+using System.Xml.Linq;
+using System.Xml.XPath;
+
+namespace Tests;
+
+public record XmlTests(ITestOutputHelper Output)
+{
+    [Fact]
+    public void RemoveNamespacesFromElement()
+    {
+        var doc = XDocument.Load("package.opf");
+        var nons = doc.Root!.RemoveNamespaces();
+
+        var xmlns = new XmlNamespaceManager(new NameTable());
+        xmlns.AddNamespace("opf", "http://www.idpf.org/2007/opf");
+        xmlns.AddNamespace("dc", "http://purl.org/dc/elements/1.1/");
+
+        var yearns = doc.XPathSelectElement("/opf:package/opf:metadata/opf:meta[@property='dcterms:date']", xmlns)?.Value;
+
+        // NOTE: since we're at the element level now, we don't need to reference the root element
+        var year = nons.XPathSelectElement("/metadata/meta[@property='dcterms:date']")?.Value;
+
+        Assert.NotNull(yearns);
+        Assert.NotNull(year);
+
+        Assert.Equal(yearns, year);
+    }
+
+    [Fact]
+    public void RemoveElementsFromReader()
+    {
+        using var reader = XmlReader.Create("package.opf").SkipElements("manifest");
+        var doc = XDocument.Load(reader);
+
+        var all = XDocument.Load("package.opf");
+
+        // A bare count comparison also passes if the wrong element is dropped: check manifest is gone and siblings remain.
+        Assert.Contains(all.Root!.Elements(), e => e.Name.LocalName == "manifest");
+        Assert.DoesNotContain(doc.Root!.Elements(), e => e.Name.LocalName == "manifest");
+        Assert.Equal(all.Root!.Elements().Select(e => e.Name).Where(n => n.LocalName != "manifest"), doc.Root!.Elements().Select(e => e.Name));
+    }
+}
diff --git a/src/Tests/package.opf b/src/Tests/package.opf
new file mode 100644
index 0000000..2cfb694
--- /dev/null
+++ b/src/Tests/package.opf
@@ -0,0 +1,229 @@
+
+
+
+ Gideon the Ninth
+ 9781250313171
+ 15
+ en-US
+ Tamsyn Muir
+ Tom Doherty Associates
+ urn:isbn:9781250313188
+ 15
+ All rights reserved
+ main
+ 2020-03-11T20:41:25Z
+ Tamsyn Muir
+ 2020
+ auto
+ auto
+
+ textual
+ textual,visual
+ structuralNavigation
+ This publication conforms to the EPUB Accessibility specification at WCAG Level A.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/src/Web/HtmlDocument.cs b/src/Web/HtmlDocument.cs
index abc3260..5cd4044 100644
--- a/src/Web/HtmlDocument.cs
+++ b/src/Web/HtmlDocument.cs
@@ -197,84 +197,5 @@ static XmlReader Configure(SgmlReader reader, HtmlReaderSettings settings)
return result;
}
-
- ///
- /// Removes all XML namespaces, since for HTML content it's typically
- /// irrelevant.
- ///
- class IgnoreXmlNsReader : XmlWrappingReader
- {
- const string XmlNsNamespace = "http://www.w3.org/2000/xmlns/";
-
- public IgnoreXmlNsReader(XmlReader baseReader) : base(baseReader) { }
-
- public override int AttributeCount
- {
- get
- {
- var count = 0;
- for (var go = MoveToFirstAttribute(); go; go = MoveToNextAttribute())
- count++;
-
- return count;
- }
- }
-
- public override bool MoveToFirstAttribute()
- {
- var moved = base.MoveToFirstAttribute();
- while (moved && (IsXmlNs || IsLocalXmlNs))
- moved = MoveToNextAttribute();
-
- if (!moved)
- base.MoveToElement();
-
- return moved;
- }
-
- public override bool MoveToNextAttribute()
- {
- var moved = base.MoveToNextAttribute();
- while (moved && (IsXmlNs || IsLocalXmlNs))
- moved = MoveToNextAttribute();
-
- return moved;
- }
-
- ///
- /// We only support the xml prefix, used for xml:lang and xml:space
- /// built-in text handling in XHTML.
- ///
- public override string Prefix => base.Prefix == "xml" ? "xml" : "";
-
- public override string NamespaceURI => Prefix == "xml" ? base.NamespaceURI : "";
-
- bool IsXmlNs => base.NamespaceURI == XmlNsNamespace;
-
- bool IsLocalXmlNs => Prefix == "xmlns";
- }
-
- ///
- /// Removes all XML namespaces, since for HTML content it's typically
- /// irrelevant.
- ///
- class SkipElementsReader : XmlWrappingReader
- {
- readonly HashSet skipElements;
-
- public SkipElementsReader(XmlReader baseReader, string[] skipElements) : base(baseReader)
- {
- this.skipElements = new HashSet(skipElements, StringComparer.OrdinalIgnoreCase);
- }
-
- public override bool Read()
- {
- var read = base.Read();
- if (read && base.NodeType == XmlNodeType.Element && skipElements.Contains(LocalName))
- base.Skip();
-
- return read;
- }
- }
}
diff --git a/src/Web/IgnoreXmlNsReader.cs b/src/Web/IgnoreXmlNsReader.cs
new file mode 100644
index 0000000..1b9c491
--- /dev/null
+++ b/src/Web/IgnoreXmlNsReader.cs
@@ -0,0 +1,60 @@
+using System.Xml;
+
+namespace Devlooped.Web;
+
+/// <summary>
+/// Removes all XML namespaces, since for HTML content it's typically
+/// irrelevant. Only the built-in <c>xml</c> prefix is preserved.
+/// </summary>
+class IgnoreXmlNsReader : XmlWrappingReader
+{
+    const string XmlNsNamespace = "http://www.w3.org/2000/xmlns/";
+
+    public IgnoreXmlNsReader(XmlReader baseReader) : base(baseReader) { }
+
+    public override int AttributeCount
+    {
+        get
+        {
+            var count = 0;
+            for (var go = MoveToFirstAttribute(); go; go = MoveToNextAttribute())
+                count++;
+            // Counting walks the cursor onto the attributes; return to the owning node so a caller that
+            // checks the count and then iterates with MoveToNextAttribute starts from the first attribute.
+            base.MoveToElement();
+            return count;
+        }
+    }
+
+    public override bool MoveToFirstAttribute()
+    {
+        var moved = base.MoveToFirstAttribute();
+        while (moved && (IsXmlNs || IsLocalXmlNs))
+            moved = MoveToNextAttribute();
+
+        if (!moved)
+            base.MoveToElement();
+        return moved;
+    }
+
+    public override bool MoveToNextAttribute()
+    {
+        var moved = base.MoveToNextAttribute();
+        while (moved && (IsXmlNs || IsLocalXmlNs))
+            moved = MoveToNextAttribute();
+        return moved;
+    }
+
+    /// <summary>
+    /// We only support the xml prefix, used for xml:lang and xml:space
+    /// built-in text handling in XHTML.
+    /// </summary>
+    public override string Prefix => base.Prefix == "xml" ? "xml" : "";
+
+    public override string NamespaceURI => Prefix == "xml" ? base.NamespaceURI : "";
+
+    bool IsXmlNs => base.NamespaceURI == XmlNsNamespace;
+    // Must look at the underlying prefix: the overridden Prefix above only ever yields "xml" or "".
+    bool IsLocalXmlNs => base.Prefix == "xmlns";
+}
+
diff --git a/src/Web/SkipElementsReader.cs b/src/Web/SkipElementsReader.cs
new file mode 100644
index 0000000..bdbb029
--- /dev/null
+++ b/src/Web/SkipElementsReader.cs
@@ -0,0 +1,27 @@
+using System;
+using System.Collections.Generic;
+using System.Xml;
+
+namespace Devlooped.Web;
+
+/// <summary>
+/// Ignores specific elements (and their content) from the input XML, matching local names case-insensitively.
+/// </summary>
+class SkipElementsReader : XmlWrappingReader
+{
+    readonly HashSet<string> skipElements;
+
+    public SkipElementsReader(XmlReader baseReader, string[] skipElements) : base(baseReader)
+        => this.skipElements = new HashSet<string>(skipElements, StringComparer.OrdinalIgnoreCase);
+
+    public override bool Read()
+    {
+        var read = base.Read();
+        while (read && base.NodeType == XmlNodeType.Element && skipElements.Contains(LocalName))
+        {
+            base.Skip(); // lands on the node after the skipped subtree, which may itself need skipping
+            read = !EOF; // ...unless the skipped element was the last node, so there is nothing to report
+        }
+        return read;
+    }
+}
diff --git a/src/Web/Web.csproj b/src/Web/Web.csproj
index 302c892..b69d996 100644
--- a/src/Web/Web.csproj
+++ b/src/Web/Web.csproj
@@ -20,7 +20,6 @@
-
\ No newline at end of file
diff --git a/src/Web/XmlExtensions.cs b/src/Web/XmlExtensions.cs
new file mode 100644
index 0000000..dd6da5a
--- /dev/null
+++ b/src/Web/XmlExtensions.cs
@@ -0,0 +1,48 @@
+using System.ComponentModel;
+using Devlooped.Web;
+
+namespace System.Xml
+{
+    /// <summary>Extension methods for <see cref="XmlReader"/>.</summary>
+    [EditorBrowsable(EditorBrowsableState.Never)]
+    public static class XmlReaderExtensions
+    {
+        /// <summary>
+        /// Creates a wrapping reader that ignores all XML namespace declarations,
+        /// so that all resulting elements and attributes have no namespaces.
+        /// </summary>
+        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is null.</exception>
+        public static XmlReader IgnoreNamespaces(this XmlReader reader)
+            => new IgnoreXmlNsReader(reader ?? throw new ArgumentNullException(nameof(reader)));
+
+        /// <summary>
+        /// Creates a wrapping reader that skips elements (and their child nodes) with
+        /// the given local names (without namespace, if any).
+        /// </summary>
+        /// <exception cref="ArgumentNullException"><paramref name="reader"/> or <paramref name="localNames"/> is null.</exception>
+        public static XmlReader SkipElements(this XmlReader reader, params string[] localNames)
+            => new SkipElementsReader(reader ?? throw new ArgumentNullException(nameof(reader)), localNames);
+    }
+}
+
+namespace System.Xml.Linq
+{
+    /// <summary>Extension methods for <see cref="XElement"/> and <see cref="XDocument"/>.</summary>
+    [EditorBrowsable(EditorBrowsableState.Never)]
+    public static class XElementExtensions
+    {
+        /// <summary>
+        /// Returns a clone of the element, with XML namespaces removed.
+        /// </summary>
+        /// <exception cref="ArgumentNullException"><paramref name="element"/> is null.</exception>
+        public static XElement RemoveNamespaces(this XElement element)
+            => XElement.Load((element ?? throw new ArgumentNullException(nameof(element))).CreateReader().IgnoreNamespaces());
+
+        /// <summary>
+        /// Returns a clone of the root element with XML namespaces removed, or null if the document has no root.
+        /// </summary>
+        /// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
+        public static XElement? RemoveNamespaces(this XDocument document)
+            => (document ?? throw new ArgumentNullException(nameof(document))).Root?.RemoveNamespaces();
+    }
+}
\ No newline at end of file
diff --git a/src/Web/readme.md b/src/Web/readme.md
new file mode 100644
index 0000000..8104118
--- /dev/null
+++ b/src/Web/readme.md
@@ -0,0 +1,2 @@
+
+
\ No newline at end of file