Skip to content

Commit

Permalink
feat: extract more info from the HTML content
Browse files Browse the repository at this point in the history
Report the following elements in the `data` section of the report:
- `audio`
- `video`
- `object`
- `script`
- `iframe`
- `embed`
- `canvas`
- `map`

Add a new `properties` section to the report to list boolean statements
on the document:
- `hasMathML`
- `hasPageBreaks`

Add unit and integration tests.

Closes #26.
  • Loading branch information
rdeltour committed Oct 4, 2017
1 parent 3aa5298 commit 6134f6d
Show file tree
Hide file tree
Showing 36 changed files with 924 additions and 40 deletions.
1 change: 1 addition & 0 deletions .eslintrc.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ module.exports = {
},
"rules": {
"no-console": 0,
"no-param-reassign": ["error", { "props": false }],
"no-underscore-dangle": ["error", { "allowAfterThis": true }],
},
"plugins": [
Expand Down
23 changes: 16 additions & 7 deletions src/checker/checker-nightmare.js
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,22 @@ function checkSingle(spineItem, epub, nightmare) {
(results.assertions == null)
? 'No'
: results.assertions.assertions.length} issues found`);
if (results.data != null && results.data.images != null) {
results.data.images.forEach((img) => {
const imageFullPath = path.resolve(path.dirname(spineItem.filepath), img.path);
const imageRelPath = path.relative(epub.basedir, imageFullPath);
img.filepath = imageFullPath;
img.path = imageRelPath;
img.location = `${spineItem.relpath}#epubcfi(${img.cfi})`;
// Resolve path and locators for extracted data
if (results.data != null) {
Object.getOwnPropertyNames(results.data).forEach((key) => {
if (!Array.isArray(results.data[key])) return;
results.data[key].forEach((item) => {
if (item.src !== undefined) {
const fullpath = path.resolve(path.dirname(spineItem.filepath), item.src);
const relpath = path.relative(epub.basedir, fullpath);
item.path = fullpath;
item.src = relpath;
if (item.cfi !== undefined) {
item.location = `${spineItem.relpath}#epubcfi(${item.cfi})`;
delete item.cfi;
}
}
});
});
}
return results;
Expand Down
19 changes: 8 additions & 11 deletions src/checker/checker.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
const checker = require('./checker-nightmare.js');
const winston = require('winston');

function finalize(results, report) {
// Copy assertions
results
.filter(res => res.assertions != null)
.forEach(res => report.addContentDocAssertion(res.assertions));
function consolidate(results, report) {
// Integrate checker results to the report
results.forEach((res) => {
report.addContentDocAssertion(res.assertions);
report.addProperties(res.properties);
report.addData(res.data);
});
// Get a flat array of all the headings in the documents
const headings = []
.concat(...results.map(docResult => docResult.outlines.headings))
Expand All @@ -16,16 +18,11 @@ function finalize(results, report) {
// Aggregated array of the HTML outlines
const htmlOutlines = results.map(docResult => docResult.outlines.html);
report.addHTMLOutlines(htmlOutlines);
// Get a flat array of the extracted images
const images = []
.concat(...results.map(docResult => docResult.data.images))
.filter(e => e !== undefined);
report.addImages(images);
return report;
}

/**
 * Runs the content-document checker on an EPUB and merges its results into
 * the given report.
 *
 * @param {Object} epub - EPUB to check; handed to `checker.check` unchanged.
 * @param {Object} report - report passed to `consolidate` along with the
 *   checker results.
 * @returns {Promise} settles with whatever `consolidate` returns.
 */
module.exports.check = function check(epub, report) {
  winston.info('Checking documents...');
  // Chain a single consolidation step. An extra `finalize` step in front of
  // it would hand its return value to `consolidate` in place of the results
  // array.
  return checker.check(epub)
    .then(results => consolidate(results, report));
};
11 changes: 7 additions & 4 deletions src/report/report-builders.js
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,13 @@ class ReportBuilder {
withAssertion(this._json, assertions);
return this;
}
withData(data) {
Object.getOwnPropertyNames(data).forEach((key) => {
this._json.data[key] = (Array.isArray(this._json.data[key]))
? this._json.data[key].push(data[key])
: data[key];
});
}
withEPUBOutline(outline) {
this._json.outlines.toc = outline;
return this;
Expand All @@ -120,10 +127,6 @@ class ReportBuilder {
this._json.outlines.html = outline;
return this;
}
withImages(images) {
this._json.data.images = images;
return this;
}
withProperties(properties) {
Object.getOwnPropertyNames(properties).forEach((key) => {
this._json.properties[key] = (key in properties)
Expand Down
13 changes: 6 additions & 7 deletions src/report/report.js
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ module.exports = class Report {
this._builder.withAssertion(assertion);
return this;
}
addData(data) {
this._builder.withData(data);
}
addOutline(outline) {
this._builder.withHOutline(outline);
return this;
Expand All @@ -69,10 +72,6 @@ module.exports = class Report {
this._builder.withEPUBOutline(navDoc.tocHTML);
return this;
}
addImages(images) {
this._builder.withImages(images);
return this;
}
addProperties(properties) {
this._builder.withProperties(properties);
return this;
Expand All @@ -87,9 +86,9 @@ module.exports = class Report {
.then(() => {
if (this.json.data.images != null) {
this.json.data.images.forEach((img) => {
const fromPath = img.filepath;
const toPath = path.join(outdir, 'data', img.path);
delete img.filepath;
const fromPath = img.path;
const toPath = path.join(outdir, 'data', img.src);
delete img.path;
return fs.copy(fromPath, toPath, {
overwrite: false,
});
Expand Down
181 changes: 179 additions & 2 deletions src/scripts/ace-extraction.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,89 @@ window.daisy = window.daisy || {};
ace = daisy.ace = daisy.ace || {};

/**
 * Fills `report` with the outlines, extracted element data and boolean
 * properties of the current document.
 *
 * The `outlines`, `data` and `properties` sections are created when missing.
 * A `data` field is only written when its extractor returned at least one
 * item.
 *
 * @param {Object} report - report object, mutated in place.
 */
ace.createReport = function(report) {
  // Store a list under `report.data[field]` only when it is non-empty.
  const reportData = function(field, array) {
    if (array.length > 0) report.data[field] = array;
  };
  report.outlines = report.outlines || {};
  report.outlines.html = ace.getHTMLOutline();
  report.outlines.headings = ace.getHeadings();
  report.data = report.data || {};
  reportData('images', ace.getImages());
  reportData('audios', ace.getAudios());
  reportData('canvases', ace.getCanvases());
  reportData('embeds', ace.getEmbeds());
  reportData('iframes', ace.getIframes());
  reportData('maps', ace.getMaps());
  reportData('objects', ace.getObjects());
  reportData('scripts', ace.getScripts());
  reportData('videos', ace.getVideos());
  report.properties = report.properties || {};
  report.properties.hasMathML = ace.hasMathML();
  report.properties.hasPageBreaks = ace.hasPageBreaks();
};

/**
 * Describes every `audio` element of the document.
 *
 * Each entry holds `controls` (whether the attribute is present), `cfi` (as
 * returned by `window.daisy.epub.createCFI`), `html` (the element's outer
 * HTML), `tracks` (one object per `track` child, keeping only the `label`,
 * `kind`, `src` and `srclang` attributes that are present), `id` when the
 * attribute is present, and `src`: the attribute value when present,
 * otherwise an array of `{ src, type }` built from the `source` descendants.
 *
 * @returns {Object[]} one entry per `audio` element, in document order.
 */
ace.getAudios = function() {
  const TRACK_ATTRIBUTES = ['label', 'kind', 'src', 'srclang'];

  // Keep only the track attributes that are actually set on the element.
  const describeTrack = (trackEl) => {
    const info = {};
    TRACK_ATTRIBUTES.forEach((attr) => {
      if (trackEl.hasAttribute(attr)) info[attr] = trackEl.getAttribute(attr);
    });
    return info;
  };

  const describeSource = sourceEl => ({
    src: sourceEl.getAttribute('src'),
    type: sourceEl.getAttribute('type'),
  });

  return Array.from(document.querySelectorAll('audio'), (audioEl) => {
    const entry = {
      controls: audioEl.hasAttribute('controls'),
      cfi: window.daisy.epub.createCFI(audioEl),
      html: audioEl.outerHTML,
      tracks: [],
    };
    if (audioEl.hasAttribute('id')) entry.id = audioEl.getAttribute('id');
    entry.src = audioEl.hasAttribute('src')
      ? audioEl.getAttribute('src')
      : Array.from(audioEl.querySelectorAll('source'), el => describeSource(el));
    Array.from(audioEl.querySelectorAll('track'))
      .forEach(trackEl => entry.tracks.push(describeTrack(trackEl)));
    return entry;
  });
};

/**
 * Describes every `canvas` element of the document.
 *
 * @returns {Object[]} one `{ cfi, id? }` entry per element; `cfi` comes from
 *   `window.daisy.epub.createCFI` and `id` is set only when the attribute is
 *   present.
 */
ace.getCanvases = function() {
  return Array.from(document.querySelectorAll('canvas'), (canvasEl) => {
    const entry = { cfi: window.daisy.epub.createCFI(canvasEl) };
    if (canvasEl.hasAttribute('id')) entry.id = canvasEl.getAttribute('id');
    return entry;
  });
};

/**
 * Describes every `embed` element of the document.
 *
 * @returns {Object[]} one entry per element with `cfi` (from
 *   `window.daisy.epub.createCFI`) plus `id`, `src` and `type` for each of
 *   those attributes that is present.
 */
ace.getEmbeds = function() {
  const OPTIONAL_ATTRIBUTES = ['id', 'src', 'type'];
  const result = [];
  for (const embedEl of document.querySelectorAll('embed')) {
    const entry = { cfi: window.daisy.epub.createCFI(embedEl) };
    OPTIONAL_ATTRIBUTES
      .filter(attr => embedEl.hasAttribute(attr))
      .forEach((attr) => { entry[attr] = embedEl.getAttribute(attr); });
    result.push(entry);
  }
  return result;
};

/**
 * Builds the outline of the document body with `HTML5Outline`.
 *
 * @returns {*} the value returned by the outline's `asHTML()`.
 */
ace.getHTMLOutline = function() {
  const outline = HTML5Outline(document.body);
  return outline.asHTML();
};
Expand All @@ -34,6 +110,20 @@ ace.getHeadings = function() {
return headings;
}

/**
 * Describes every `iframe` element of the document.
 *
 * @returns {Object[]} one entry per element with `cfi` (from
 *   `window.daisy.epub.createCFI`) plus `id` and `src` when those attributes
 *   are present.
 */
ace.getIframes = function() {
  const describe = (frameEl) => {
    const entry = { cfi: window.daisy.epub.createCFI(frameEl) };
    ['id', 'src'].forEach((attr) => {
      if (frameEl.hasAttribute(attr)) entry[attr] = frameEl.getAttribute(attr);
    });
    return entry;
  };
  return Array.from(document.querySelectorAll('iframe'), frameEl => describe(frameEl));
};

ace.getImages = function() {
var findFigure = function(el) {
while ((el = el.parentElement) && !(el.localName === 'figure'));
Expand All @@ -44,7 +134,7 @@ ace.getImages = function() {
let images = [];
imgElems.forEach(function(img) {
let imageObj = {
path: img.getAttribute('src'),
src: img.getAttribute('src'),
alt: img.getAttribute('alt'),
role: img.getAttribute('role'),
cfi: window.daisy.epub.createCFI(img),
Expand All @@ -64,3 +154,90 @@ ace.getImages = function() {
});
return images;
}

/**
 * Describes every `map` element of the document.
 *
 * @returns {Object[]} one entry per element with `cfi` (from
 *   `window.daisy.epub.createCFI`), `name` (the attribute value, `null` when
 *   absent) and `id` when that attribute is present.
 */
ace.getMaps = function() {
  const collected = [];
  for (const mapEl of document.querySelectorAll('map')) {
    const entry = {
      cfi: window.daisy.epub.createCFI(mapEl),
      name: mapEl.getAttribute('name'),
    };
    if (mapEl.hasAttribute('id')) {
      entry.id = mapEl.getAttribute('id');
    }
    collected.push(entry);
  }
  return collected;
};

/**
 * Describes every `object` element of the document.
 *
 * @returns {Object[]} one entry per element with `cfi` (from
 *   `window.daisy.epub.createCFI`) plus `id`, `data` and `type` for each of
 *   those attributes that is present.
 */
ace.getObjects = function() {
  const OPTIONAL_ATTRIBUTES = ['id', 'data', 'type'];
  return Array.from(document.querySelectorAll('object'), (objectEl) => {
    const entry = { cfi: window.daisy.epub.createCFI(objectEl) };
    OPTIONAL_ATTRIBUTES.forEach((attr) => {
      if (objectEl.hasAttribute(attr)) entry[attr] = objectEl.getAttribute(attr);
    });
    return entry;
  });
};

/**
 * Describes every `script` element of the document.
 *
 * @returns {Object[]} one entry per element with `cfi` (from
 *   `window.daisy.epub.createCFI`), `type` (the attribute value, or
 *   `'text/javascript'` when the attribute is absent) plus `id` and `src`
 *   when those attributes are present.
 */
ace.getScripts = function() {
  const collected = [];
  for (const scriptEl of document.querySelectorAll('script')) {
    const entry = { cfi: window.daisy.epub.createCFI(scriptEl) };
    if (scriptEl.hasAttribute('type')) {
      entry.type = scriptEl.getAttribute('type');
    } else {
      entry.type = 'text/javascript';
    }
    if (scriptEl.hasAttribute('id')) entry.id = scriptEl.getAttribute('id');
    if (scriptEl.hasAttribute('src')) entry.src = scriptEl.getAttribute('src');
    collected.push(entry);
  }
  return collected;
};

/**
 * Describes every `video` element of the document.
 *
 * Each entry holds `controls` (whether the attribute is present), `cfi` (as
 * returned by `window.daisy.epub.createCFI`), `html` (the element's outer
 * HTML), `tracks` (one object per `track` child, keeping only the `label`,
 * `kind`, `src` and `srclang` attributes that are present), `id` when the
 * attribute is present, and `src`: the attribute value when present,
 * otherwise an array of `{ src, type }` built from the `source` descendants.
 *
 * @returns {Object[]} one entry per `video` element, in document order.
 */
ace.getVideos = function() {
  const collected = [];
  for (const videoEl of document.querySelectorAll('video')) {
    const entry = {
      controls: videoEl.hasAttribute('controls'),
      cfi: window.daisy.epub.createCFI(videoEl),
      html: videoEl.outerHTML,
      tracks: [],
    };
    if (videoEl.hasAttribute('id')) {
      entry.id = videoEl.getAttribute('id');
    }
    if (videoEl.hasAttribute('src')) {
      entry.src = videoEl.getAttribute('src');
    } else {
      // No `src` on the element itself: list its <source> alternatives.
      const alternatives = [];
      for (const sourceEl of videoEl.querySelectorAll('source')) {
        alternatives.push({
          src: sourceEl.getAttribute('src'),
          type: sourceEl.getAttribute('type'),
        });
      }
      entry.src = alternatives;
    }
    for (const trackEl of videoEl.querySelectorAll('track')) {
      const info = {};
      for (const attr of ['label', 'kind', 'src', 'srclang']) {
        if (trackEl.hasAttribute(attr)) info[attr] = trackEl.getAttribute(attr);
      }
      entry.tracks.push(info);
    }
    collected.push(entry);
  }
  return collected;
};

/**
 * Tells whether the document contains at least one `math` element.
 *
 * @returns {boolean} `true` when a `math` element is found.
 */
ace.hasMathML = function() {
  const firstMath = document.querySelector('math');
  return firstMath !== null;
};

/**
 * Tells whether the document contains at least one element whose `epub:type`
 * attribute lists `pagebreak` or whose `role` attribute lists
 * `doc-pagebreak`.
 *
 * @returns {boolean} `true` when such an element is found.
 */
ace.hasPageBreaks = function() {
  const selector = '[epub\\:type~="pagebreak"], [role~="doc-pagebreak"]';
  return document.querySelector(selector) !== null;
};
Loading

0 comments on commit 6134f6d

Please sign in to comment.