Skip to content

Commit

Permalink
Add XML namespace removal and element skipping extensions
Browse files Browse the repository at this point in the history
Fixes #64
  • Loading branch information
kzu committed Sep 3, 2022
1 parent fdfc55c commit 871278b
Show file tree
Hide file tree
Showing 9 changed files with 436 additions and 80 deletions.
27 changes: 27 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
[![Downloads](https://img.shields.io/nuget/dt/Devlooped.Web.svg?color=green)](https://www.nuget.org/packages/Devlooped.Web)
[![License](https://img.shields.io/github/license/devlooped/web.svg?color=blue)](https://github.com/devlooped/web/blob/main/license.txt)

<!-- #content -->
Read HTML as XML and query it with CSS over XLinq.

No need to learn an entirely new object model for a page 🤘.
Expand Down Expand Up @@ -32,6 +33,29 @@ receiving an `HtmlReaderSettings`.
The underlying parsing is performed by the amazing [SgmlReader](https://www.nuget.org/packages/Microsoft.Xml.SgmlReader)
library by Microsoft's [Chris Lovett](http://lovettsoftware.com/).

In addition, the following extension methods make it easier to work
with XML documents where you want to query with CSS or XPath without
having to deal with XML namespaces:

```csharp
using System.Xml;
using System.Xml.Linq;
using Devlooped.Web;

var doc = XDocument.Load("doc.xml");
// Will remove all xmlns declarations, and allow querying elements
// as if none had namespaces, returns the root element
XElement nons = doc.RemoveNamespaces();

// Alternatively, you can also ignore at the XmlReader level
using var reader = XmlReader.Create("doc.xml").IgnoreNamespaces();
doc = XDocument.Load(reader);

// Finally, you can also skip elements at the reader level
using var skipping = XmlReader.Create("doc.xml").SkipElements("foo", "bar");
doc = XDocument.Load(skipping);
```

## CSS

At the moment, supports the following CSS selector features:
Expand Down Expand Up @@ -76,6 +100,8 @@ Non-CSS features:
* `[text()*=val]`: Represents an element whose text contents contains at least one instance of the
substring "val". If "val" is the empty string then the selector does not represent anything.

<!-- #content -->

# Dogfooding

[![CI Version](https://img.shields.io/endpoint?url=https://shields.kzu.io/vpre/Devlooped.Web/main&label=nuget.ci&color=brightgreen)](https://pkg.kzu.io/index.json)
Expand All @@ -91,6 +117,7 @@ The versioning scheme for packages is:
- Branch builds: *42.42.42-*`[BRANCH]`.`[COMMITS]`


<!-- #sponsors -->
<!-- include docs/footer.md -->
# Sponsors

Expand Down
43 changes: 43 additions & 0 deletions src/Tests/XmlTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
using System.Xml;
using System.Xml.Linq;
using System.Xml.XPath;

namespace Tests;

/// <summary>
/// Tests for the namespace-removal and element-skipping XML extensions,
/// run against the <c>package.opf</c> sample file.
/// </summary>
public record XmlTests(ITestOutputHelper Output)
{
    /// <summary>
    /// The same value must be found with a namespace-qualified XPath on the
    /// original document and with an unqualified XPath on the stripped clone.
    /// </summary>
    [Fact]
    public void RemoveNamespacesFromElement()
    {
        var original = XDocument.Load("package.opf");
        var stripped = original.Root!.RemoveNamespaces();

        var resolver = new XmlNamespaceManager(new NameTable());
        resolver.AddNamespace("opf", "http://www.idpf.org/2007/opf");
        resolver.AddNamespace("dc", "http://purl.org/dc/elements/1.1/");

        // Namespace-qualified lookup against the untouched document.
        var qualified = original.XPathSelectElement(
            "/opf:package/opf:metadata/opf:meta[@property='dcterms:date']",
            resolver);
        var expected = qualified?.Value;

        // NOTE: the stripped clone is an element rather than a document, so the
        // path starts below the root element instead of naming it.
        var unqualified = stripped.XPathSelectElement("/metadata/meta[@property='dcterms:date']");
        var actual = unqualified?.Value;

        //Output.WriteLine(original.Root!.Elements().First().ToString());
        //Output.WriteLine(stripped.Elements().First().ToString());

        Assert.NotNull(expected);
        Assert.NotNull(actual);

        Assert.Equal(expected, actual);
    }

    /// <summary>
    /// Loading through a reader that skips <c>manifest</c> must yield a different
    /// number of root children than loading the file directly.
    /// </summary>
    [Fact]
    public void RemoveElementsFromReader()
    {
        using var filtered = XmlReader.Create("package.opf").SkipElements("manifest");
        var partial = XDocument.Load(filtered);

        var complete = XDocument.Load("package.opf");

        var partialCount = partial.Root!.Elements().Count();
        var completeCount = complete.Root!.Elements().Count();

        Assert.NotEqual(partialCount, completeCount);
    }
}
229 changes: 229 additions & 0 deletions src/Tests/package.opf

Large diffs are not rendered by default.

79 changes: 0 additions & 79 deletions src/Web/HtmlDocument.cs
Original file line number Diff line number Diff line change
Expand Up @@ -197,84 +197,5 @@ static XmlReader Configure(SgmlReader reader, HtmlReaderSettings settings)

return result;
}

/// <summary>
/// Removes all XML namespaces, since for HTML content it's typically
/// irrelevant. Namespace declaration attributes are skipped during attribute
/// navigation, and every node reports an empty prefix and namespace URI,
/// except for the built-in <c>xml</c> prefix.
/// </summary>
class IgnoreXmlNsReader : XmlWrappingReader
{
    const string XmlNsNamespace = "http://www.w3.org/2000/xmlns/";

    public IgnoreXmlNsReader(XmlReader baseReader) : base(baseReader) { }

    /// <summary>
    /// Number of attributes on the current node, not counting namespace declarations.
    /// </summary>
    public override int AttributeCount
    {
        get
        {
            // Counting means walking the attributes, which moves the reader.
            // Remember the starting attribute (if any) so reading this property
            // does not leave the reader somewhere else.
            var origin = base.NodeType == XmlNodeType.Attribute ? base.Name : null;

            var count = 0;
            for (var go = MoveToFirstAttribute(); go; go = MoveToNextAttribute())
                count++;

            base.MoveToElement();
            if (origin != null)
                MoveToAttribute(origin);

            return count;
        }
    }

    /// <summary>
    /// Moves to the first attribute that is not a namespace declaration.
    /// </summary>
    /// <returns><see langword="true"/> if such an attribute exists; otherwise the
    /// reader is moved back to the owning element and <see langword="false"/> is returned.</returns>
    public override bool MoveToFirstAttribute()
    {
        var moved = base.MoveToFirstAttribute();
        while (moved && (IsXmlNs || IsLocalXmlNs))
            moved = base.MoveToNextAttribute();

        if (!moved)
            base.MoveToElement();

        return moved;
    }

    /// <summary>
    /// Moves to the next attribute that is not a namespace declaration.
    /// </summary>
    /// <returns><see langword="true"/> if such an attribute was found.</returns>
    public override bool MoveToNextAttribute()
    {
        var moved = base.MoveToNextAttribute();
        while (moved && (IsXmlNs || IsLocalXmlNs))
            moved = base.MoveToNextAttribute();

        return moved;
    }

    /// <summary>
    /// We only support the <c>xml</c> prefix, used for <c>xml:lang</c> and <c>xml:space</c>
    /// built-in text handling in XHTML.
    /// </summary>
    public override string Prefix => base.Prefix == "xml" ? "xml" : "";

    /// <summary>
    /// Namespace URI of the underlying node when it uses the <c>xml</c> prefix; empty otherwise.
    /// </summary>
    public override string NamespaceURI => Prefix == "xml" ? base.NamespaceURI : "";

    bool IsXmlNs => base.NamespaceURI == XmlNsNamespace;

    // Must look at the underlying prefix: the overridden Prefix above only ever
    // returns "xml" or "", so comparing it against "xmlns" could never match.
    bool IsLocalXmlNs => base.Prefix == "xmlns";
}

/// <summary>
/// Ignores specific elements (and everything inside them) from the input XML.
/// Element local names are matched case-insensitively.
/// </summary>
class SkipElementsReader : XmlWrappingReader
{
    readonly HashSet<string> skipElements;

    /// <summary>
    /// Wraps <paramref name="baseReader"/>, hiding elements whose local name is
    /// listed in <paramref name="skipElements"/>.
    /// </summary>
    public SkipElementsReader(XmlReader baseReader, string[] skipElements) : base(baseReader)
    {
        this.skipElements = new HashSet<string>(skipElements, StringComparer.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Advances to the next node that is not an element to be skipped.
    /// </summary>
    /// <returns><see langword="false"/> when there are no more nodes to read,
    /// including when skipping consumed the rest of the input.</returns>
    public override bool Read()
    {
        if (!base.Read())
            return false;

        // Skip() leaves the reader on the node that follows the skipped element,
        // which can itself be an element to skip (adjacent siblings with no
        // whitespace in between), so keep going until we land on something else.
        while (base.NodeType == XmlNodeType.Element && skipElements.Contains(LocalName))
            base.Skip();

        // Skipping may have run into the end of the input.
        return !EOF;
    }
}
}

60 changes: 60 additions & 0 deletions src/Web/IgnoreXmlNsReader.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
using System.Xml;

namespace Devlooped.Web;

/// <summary>
/// Removes all XML namespaces, since for HTML content it's typically
/// irrelevant. Namespace declaration attributes are skipped during attribute
/// navigation, and every node reports an empty prefix and namespace URI,
/// except for the built-in <c>xml</c> prefix.
/// </summary>
class IgnoreXmlNsReader : XmlWrappingReader
{
    const string XmlNsNamespace = "http://www.w3.org/2000/xmlns/";

    public IgnoreXmlNsReader(XmlReader baseReader) : base(baseReader) { }

    /// <summary>
    /// Number of attributes on the current node, not counting namespace declarations.
    /// </summary>
    public override int AttributeCount
    {
        get
        {
            // Counting means walking the attributes, which moves the reader.
            // Remember the starting attribute (if any) so reading this property
            // does not leave the reader somewhere else.
            var origin = base.NodeType == XmlNodeType.Attribute ? base.Name : null;

            var count = 0;
            for (var go = MoveToFirstAttribute(); go; go = MoveToNextAttribute())
                count++;

            base.MoveToElement();
            if (origin != null)
                MoveToAttribute(origin);

            return count;
        }
    }

    /// <summary>
    /// Moves to the first attribute that is not a namespace declaration.
    /// </summary>
    /// <returns><see langword="true"/> if such an attribute exists; otherwise the
    /// reader is moved back to the owning element and <see langword="false"/> is returned.</returns>
    public override bool MoveToFirstAttribute()
    {
        var moved = base.MoveToFirstAttribute();
        while (moved && (IsXmlNs || IsLocalXmlNs))
            moved = base.MoveToNextAttribute();

        if (!moved)
            base.MoveToElement();

        return moved;
    }

    /// <summary>
    /// Moves to the next attribute that is not a namespace declaration.
    /// </summary>
    /// <returns><see langword="true"/> if such an attribute was found.</returns>
    public override bool MoveToNextAttribute()
    {
        var moved = base.MoveToNextAttribute();
        while (moved && (IsXmlNs || IsLocalXmlNs))
            moved = base.MoveToNextAttribute();

        return moved;
    }

    /// <summary>
    /// We only support the <c>xml</c> prefix, used for <c>xml:lang</c> and <c>xml:space</c>
    /// built-in text handling in XHTML.
    /// </summary>
    public override string Prefix => base.Prefix == "xml" ? "xml" : "";

    /// <summary>
    /// Namespace URI of the underlying node when it uses the <c>xml</c> prefix; empty otherwise.
    /// </summary>
    public override string NamespaceURI => Prefix == "xml" ? base.NamespaceURI : "";

    bool IsXmlNs => base.NamespaceURI == XmlNsNamespace;

    // Must look at the underlying prefix: the overridden Prefix above only ever
    // returns "xml" or "", so comparing it against "xmlns" could never match.
    bool IsLocalXmlNs => base.Prefix == "xmlns";
}

27 changes: 27 additions & 0 deletions src/Web/SkipElementsReader.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
using System;
using System.Collections.Generic;
using System.Xml;

namespace Devlooped.Web;

/// <summary>
/// Ignores specific elements (and everything inside them) from the input XML.
/// Element local names are matched case-insensitively.
/// </summary>
class SkipElementsReader : XmlWrappingReader
{
    readonly HashSet<string> skipElements;

    /// <summary>
    /// Wraps <paramref name="baseReader"/>, hiding elements whose local name is
    /// listed in <paramref name="skipElements"/>.
    /// </summary>
    public SkipElementsReader(XmlReader baseReader, string[] skipElements) : base(baseReader)
    {
        this.skipElements = new HashSet<string>(skipElements, StringComparer.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Advances to the next node that is not an element to be skipped.
    /// </summary>
    /// <returns><see langword="false"/> when there are no more nodes to read,
    /// including when skipping consumed the rest of the input.</returns>
    public override bool Read()
    {
        if (!base.Read())
            return false;

        // Skip() leaves the reader on the node that follows the skipped element,
        // which can itself be an element to skip (adjacent siblings with no
        // whitespace in between), so keep going until we land on something else.
        while (base.NodeType == XmlNodeType.Element && skipElements.Contains(LocalName))
            base.Skip();

        // Skipping may have run into the end of the input.
        return !EOF;
    }
}
1 change: 0 additions & 1 deletion src/Web/Web.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

<ItemGroup>
<InternalsVisibleTo Include="Devlooped.Tests" />
<None Include="..\..\readme.md" PackagePath="readme.md" />
</ItemGroup>

</Project>
48 changes: 48 additions & 0 deletions src/Web/XmlExtensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
using System.ComponentModel;
using Devlooped.Web;

namespace System.Xml
{
    /// <summary>
    /// Extension methods for <see cref="XmlReader"/>.
    /// </summary>
    [EditorBrowsable(EditorBrowsableState.Never)]
    public static class XmlReaderExtensions
    {
        /// <summary>
        /// Creates a wrapping reader that ignores all XML namespace declarations,
        /// so that all resulting elements and attributes have no namespaces.
        /// </summary>
        /// <param name="reader">The reader to wrap.</param>
        /// <returns>A reader over the same content that reports no namespaces.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> is <see langword="null"/>.</exception>
        public static XmlReader IgnoreNamespaces(this XmlReader reader)
            => new IgnoreXmlNsReader(reader ?? throw new ArgumentNullException(nameof(reader)));

        /// <summary>
        /// Creates a wrapping reader that skips elements (and their child nodes) with
        /// the given local names (without namespace, if any).
        /// </summary>
        /// <param name="reader">The reader to wrap.</param>
        /// <param name="localNames">Local names of the elements to skip.</param>
        /// <returns>A reader over the same content, minus the skipped elements.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="reader"/> or
        /// <paramref name="localNames"/> is <see langword="null"/>.</exception>
        public static XmlReader SkipElements(this XmlReader reader, params string[] localNames)
            => new SkipElementsReader(
                reader ?? throw new ArgumentNullException(nameof(reader)),
                localNames ?? throw new ArgumentNullException(nameof(localNames)));
    }
}

namespace System.Xml.Linq
{
    /// <summary>
    /// Extension methods for <see cref="XElement"/>.
    /// </summary>
    [EditorBrowsable(EditorBrowsableState.Never)]
    public static class XElementExtensions
    {
        /// <summary>
        /// Returns a clone of the node, with XML namespaces removed.
        /// </summary>
        /// <param name="element">The element to clone.</param>
        /// <returns>A new element loaded from <paramref name="element"/> through a
        /// namespace-ignoring reader.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="element"/> is <see langword="null"/>.</exception>
        public static XElement RemoveNamespaces(this XElement element)
        {
            if (element == null)
                throw new ArgumentNullException(nameof(element));

            // Dispose the reader once the clone is loaded rather than leaving it open.
            using var reader = element.CreateReader().IgnoreNamespaces();
            return XElement.Load(reader);
        }

        /// <summary>
        /// Returns a clone of the root node, with XML namespaces removed.
        /// </summary>
        /// <param name="document">The document whose root element is cloned.</param>
        /// <returns>The namespace-free clone of the root element, or
        /// <see langword="null"/> if the document has no root element.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="document"/> is <see langword="null"/>.</exception>
        public static XElement? RemoveNamespaces(this XDocument document)
        {
            if (document == null)
                throw new ArgumentNullException(nameof(document));

            return document.Root?.RemoveNamespaces();
        }
    }
}
2 changes: 2 additions & 0 deletions src/Web/readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
<!-- include ..\..\readme.md#content -->
<!-- include ..\..\readme.md#sponsors -->

0 comments on commit 871278b

Please sign in to comment.