Skip to content

Commit

Permalink
fix(parser): parse HTML named character references
Browse files Browse the repository at this point in the history
- Use @fchasen’s fork of xmldom to parse the HTML named character
  references defined in HTML, even when the document is XHTML.
  Note, however, that this is a willful violation of the HTML standard,
  since the entities are only declared when the document has one of the
  allowed public identifiers
  (see https://html.spec.whatwg.org/#parsing-xhtml-documents).
- Set an error handler on xmldom’s `DOMParser` to catch parsing errors
  (like undeclared entities) and log them with winston.
- Add tests.

Fixes #182
  • Loading branch information
rdeltour committed May 25, 2018
1 parent 7ad1373 commit 1e83bf7
Show file tree
Hide file tree
Showing 10 changed files with 92 additions and 11 deletions.
2 changes: 1 addition & 1 deletion packages/epub-utils/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"fs-extra": "^6.0.1",
"tmp": "^0.0.33",
"winston": "^2.4.0",
"xmldom": "^0.1.27",
"xmldom-alpha": "https://github.com/fchasen/xmldom.git#a38f7ddb536ab74e9fb549477ba9f9b7ea2d0beb",
"xpath": "^0.0.24"
},
"publishConfig": {
Expand Down
19 changes: 13 additions & 6 deletions packages/epub-utils/src/epub-parse.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,20 @@

'use strict';

const DOMParser = require('xmldom').DOMParser;
const XMLSerializer = require('xmldom').XMLSerializer;
const DOMParser = require('xmldom-alpha').DOMParser;
const XMLSerializer = require('xmldom-alpha').XMLSerializer;
const fs = require('fs');
const path = require('path');
const xpath = require('xpath');
const winston = require('winston');

// Error handler handed to each DOMParser instance created in this module.
// It routes the parser's diagnostics to winston instead of leaving them unhandled.
const errorHandler = {
  // Parser warnings are logged at winston's `warn` level.
  warning(message) {
    return winston.warn(message);
  },
  // Parser errors are also logged at the `warn` level.
  error(message) {
    return winston.warn(message);
  },
  // Fatal parser errors are logged at winston's `error` level.
  fatalError(message) {
    return winston.error(message);
  },
};

function SpineItem() {
this.filepath = "";
this.relpath = "";
Expand All @@ -33,7 +40,7 @@ function EpubParser() {

function parseNavDoc(fullpath, epubDir) {
const content = fs.readFileSync(fullpath).toString();
const doc = new DOMParser().parseFromString(content);
const doc = new DOMParser({errorHandler}).parseFromString(content, 'application/xhtml+xml');

// Remove all links
const aElems = doc.getElementsByTagNameNS('http://www.w3.org/1999/xhtml', 'a');
Expand Down Expand Up @@ -127,7 +134,7 @@ EpubParser.prototype.parse = function(epubDir) {

EpubParser.prototype.parseData = function(packageDocPath, epubDir) {
const content = fs.readFileSync(packageDocPath).toString();
const doc = new DOMParser().parseFromString(content);
const doc = new DOMParser({errorHandler}).parseFromString(content);
const select = xpath.useNamespaces(
{ opf: 'http://www.idpf.org/2007/opf',
dc: 'http://purl.org/dc/elements/1.1/'});
Expand Down Expand Up @@ -168,7 +175,7 @@ EpubParser.prototype.parseData = function(packageDocPath, epubDir) {

EpubParser.prototype.parseContentDocTitle = function(filepath) {
const content = fs.readFileSync(filepath).toString();
const doc = new DOMParser().parseFromString(content);
const doc = new DOMParser({errorHandler}).parseFromString(content, 'application/xhtml+xml');
const select = xpath.useNamespaces({html: "http://www.w3.org/1999/xhtml", epub: "http://www.idpf.org/2007/ops"});
const title = select('//html:title/text()', doc);
if (title.length > 0) {
Expand All @@ -182,7 +189,7 @@ EpubParser.prototype.parseContentDocTitle = function(filepath) {
EpubParser.prototype.calculatePackageDocPath = function(epubDir) {
const containerFilePath = `${epubDir}/META-INF/container.xml`;
const content = fs.readFileSync(containerFilePath).toString();
const doc = new DOMParser().parseFromString(content);
const doc = new DOMParser({errorHandler}).parseFromString(content);
const select = xpath.useNamespaces({ ocf: 'urn:oasis:names:tc:opendocument:xmlns:container' });
const rootfiles = select('//ocf:rootfile[@media-type="application/oebps-package+xml"]/@full-path', doc);
// just grab the first one as we're not handling the case of multiple renditions
Expand Down
12 changes: 11 additions & 1 deletion tests/__tests__/cli.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,17 @@ describe('Running the CLI', () => {
const log = stripAnsi(stdout);
expect(/^warn:\s+The SVG Content Documents in this EPUB will be ignored\./m.test(log)).toBe(true);
});
});
});

describe('does not raise a warning', () => {
  // Regression check for issue #182: running the CLI on the `issue-182`
  // fixture must not log an xmldom "entity not found" warning.
  test('when a named character reference is used in XHTML', () => {
    // Only stdout is inspected, so it is the only field taken from the result.
    const { stdout } = ace(['issue-182'], {
      cwd: path.resolve(__dirname, '../data'),
    });
    const log = stripAnsi(stdout);
    expect(/^warn:\s+\[xmldom error\] entity not found/m.test(log)).toBe(false);
  });
});

/*test('with return-2-on-validation-error set to true should exit with return code 2', () => {
// TODO this test won't work until we can specify the CLI option to enable returning 2 on violation(s)
Expand Down
12 changes: 12 additions & 0 deletions tests/__tests__/regression.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,15 @@ test('issue #170: heading with `doc-subtitle` role were reported empty', async (
const report = await ace('../data/issue-170');
expect(report['earl:result']['earl:outcome']).toEqual('pass');
});

// Regression test for issue #182: the report must contain an assertion whose
// test subject is `content_001.xhtml` with the title's named character
// reference resolved to an en dash.
test('issue #182: named character references are parsed', async () => {
  const expectedAssertion = {
    'earl:testSubject': {
      url: 'content_001.xhtml',
      'dct:title': 'Minimal – EPUB',
    },
  };

  const { assertions } = await ace('../data/issue-182');

  expect(assertions).toEqual(
    expect.arrayContaining([expect.objectContaining(expectedAssertion)])
  );
});
10 changes: 10 additions & 0 deletions tests/data/issue-182/EPUB/content_001.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "">
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en">
<head>
<title>Minimal &ndash; EPUB</title>
</head>
<body>
<h1>Loomings</h1>
<p>Call me Ishmael.</p>
</body>
</html>
12 changes: 12 additions & 0 deletions tests/data/issue-182/EPUB/nav.xhtml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops" xml:lang="en">
<head>
<title>Minimal Nav</title>
</head>
<body>
<nav epub:type="toc">
<ol>
<li><a href="content_001.xhtml">content 001</a></li>
</ol>
</nav>
</body>
</html>
23 changes: 23 additions & 0 deletions tests/data/issue-182/EPUB/package.opf
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0" encoding="UTF-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" xml:lang="en" unique-identifier="uid">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:title id="title">Minimal EPUB 3.0</dc:title>
<dc:language>en</dc:language>
<dc:identifier id="uid">NOID</dc:identifier>
<meta property="dcterms:modified">2017-01-01T00:00:01Z</meta>
<meta property="schema:accessibilityFeature">structuralNavigation</meta>
<meta property="schema:accessibilitySummary">everything OK!</meta>
<meta property="schema:accessibilityHazard">noFlashingHazard</meta>
<meta property="schema:accessibilityHazard">noSoundHazard</meta>
<meta property="schema:accessibilityHazard">noMotionSimulationHazard</meta>
<meta property="schema:accessMode">textual</meta>
<meta property="schema:accessModeSufficient">textual</meta>
</metadata>
<manifest>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
<item id="content_001" href="content_001.xhtml" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="content_001" />
</spine>
</package>
6 changes: 6 additions & 0 deletions tests/data/issue-182/META-INF/container.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8" ?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="EPUB/package.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
1 change: 1 addition & 0 deletions tests/data/issue-182/mimetype
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
application/epub+zip
6 changes: 3 additions & 3 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -6080,9 +6080,9 @@ xmlbuilder@~9.0.1:
version "9.0.7"
resolved "https://registry.yarnpkg.com/xmlbuilder/-/xmlbuilder-9.0.7.tgz#132ee63d2ec5565c557e20f4c22df9aca686b10d"

xmldom@^0.1.27:
version "0.1.27"
resolved "https://registry.yarnpkg.com/xmldom/-/xmldom-0.1.27.tgz#d501f97b3bdb403af8ef9ecc20573187aadac0e9"
"xmldom-alpha@https://github.com/fchasen/xmldom.git#a38f7ddb536ab74e9fb549477ba9f9b7ea2d0beb":
version "0.1.28"
resolved "https://github.com/fchasen/xmldom.git#a38f7ddb536ab74e9fb549477ba9f9b7ea2d0beb"

xpath@^0.0.24:
version "0.0.24"
Expand Down

0 comments on commit 1e83bf7

Please sign in to comment.