diff --git a/packages/epub-utils/src/epub.js b/packages/epub-utils/src/epub.js
index 0613ec1e..21592326 100644
--- a/packages/epub-utils/src/epub.js
+++ b/packages/epub-utils/src/epub.js
@@ -1,7 +1,7 @@
 'use strict';
 
 const epubParse = require('./epub-parse.js');
-const unzip = require('extract-zip');
+const extractZip = require('extract-zip');
 const tmp = require('tmp');
 const fs = require('fs-extra');
 const path = require('path');
@@ -9,6 +9,65 @@ const winston = require('winston');
 
 tmp.setGracefulCleanup();
 
+/**
+ * Extracts a ZIP archive into a fresh temporary directory.
+ *
+ * @param {string} zipPath - path of the archive to extract
+ * @returns {Promise<string>} path of the temporary directory holding the
+ *   extracted content; rejects with the error reported by extract-zip
+ */
+async function unzip(zipPath) {
+  const tmpdir = tmp.dirSync({ unsafeCleanup: true }).name;
+  return new Promise((resolve, reject) => {
+    extractZip(zipPath, { dir: tmpdir }, (err) => {
+      if (err) {
+        reject(err);
+      } else {
+        resolve(tmpdir);
+      }
+    });
+  });
+}
+
+/**
+ * Tries to repair an archive that failed to unzip and unzips the repaired copy.
+ *
+ * Only 'invalid comment length' errors are handled, by unzipping a truncated
+ * temporary copy of the archive; the file at `epub.path` is left untouched.
+ *
+ * @param {EPUB} epub - the EPUB whose archive (`epub.path`) failed to unzip
+ * @param {Error} error - the error raised by the first unzip attempt
+ * @returns {Promise<string>} path of the directory holding the content
+ *   extracted from the repaired copy
+ * @throws the original `error` when the archive cannot be repaired
+ */
+async function retryUnzip(epub, error) {
+  if (error.message === undefined) throw error;
+  winston.info('Trying to repair the archive and unzip again...');
+  try {
+    // Detect 'invalid comment length' errors
+    const invalidCommentLengthMatch = error.message.match(/invalid comment length\. expected: (\d+)\. found: (\d+)/);
+    if (invalidCommentLengthMatch) {
+      // The copy is only accessed by path, so let tmp close its descriptor
+      const tmpEPUB = tmp.fileSync({ discardDescriptor: true }).name;
+      const size = fs.statSync(epub.path).size;
+      // NOTE(review): drops as many trailing bytes as the "expected" length;
+      // presumably only right when the "found" length is 0 -- TODO confirm
+      const truncatedSize = size - Number(invalidCommentLengthMatch[1]);
+      fs.copySync(epub.path, tmpEPUB);
+      fs.truncateSync(tmpEPUB, truncatedSize);
+      return await unzip(tmpEPUB);
+    } else {
+      winston.error('The ZIP archive couldn’t be repaired.');
+    }
+  } catch (retryError) {
+    winston.error('Unzipping failed again');
+    winston.debug(retryError);
+  }
+  // Whatever went wrong during the repair, report the original failure
+  throw error;
+}
+
 class EPUB {
   constructor(epub, cwd = process.cwd()) {
     this.path = path.resolve(cwd, epub);
@@ -22,30 +81,37 @@ class EPUB {
     return fs.statSync(this.path).isDirectory();
   }
 
-  extract() {
-    return new Promise((resolve, reject) => {
-      if (this.basedir !== undefined) {
-        resolve(this);
-      } else if (this.expanded) {
-        winston.verbose('EPUB is already unpacked');
-        this.basedir = this.path;
-        resolve(this);
-      } else {
-        winston.verbose('Extracting EPUB');
-        const tmpdir = tmp.dirSync({ unsafeCleanup: true }); // remove even when not empty
-        unzip(this.path, { dir: tmpdir.name }, (err) => {
-          if (err) {
-            winston.error('Failed to unzip EPUB (the ZIP archive may be corrupt).');
-            reject(err);
-          } else {
-            this.basedir = tmpdir.name;
-            resolve(this);
-          }
-        });
-      }
-    });
-  }
+  /**
+   * Makes the EPUB content available as a directory and stores its path in
+   * `this.basedir`: the EPUB path itself when it is already expanded,
+   * otherwise a temporary directory the archive is unzipped into.
+   *
+   * @returns {Promise<EPUB>} this EPUB; rejects when the archive can neither
+   *   be unzipped nor repaired
+   */
+  async extract() {
+    if (this.basedir !== undefined) {
+      return this;
+    } else if (this.expanded) {
+      winston.verbose('EPUB is already unpacked');
+      this.basedir = this.path;
+      return this;
+    } else {
+      winston.verbose('Extracting EPUB');
+      let unzippedDir;
+      try {
+        unzippedDir = await unzip(this.path);
+      } catch (error) {
+        winston.error('Failed to unzip EPUB (the ZIP archive may be corrupt).');
+        winston.debug(error);
+        // A failed repair rejects, so the error propagates to the caller
+        unzippedDir = await retryUnzip(this, error);
+      }
+      this.basedir = unzippedDir;
+      return this;
+    }
+  }
 
   parse() {
     return new Promise((resolve, reject) => {
       if (this.parsed) return resolve(this);
diff --git a/tests/__tests__/unzip.test.js b/tests/__tests__/unzip.test.js
new file mode 100644
index 00000000..311620ab
--- /dev/null
+++ b/tests/__tests__/unzip.test.js
@@ -0,0 +1,58 @@
+'use strict';
+
+const fs = require('fs');
+const path = require('path');
+const tmp = require('tmp');
+
+const runAce = require('../runAceJS');
+
+tmp.setGracefulCleanup();
+
+let outdir;
+let tmpdir;
+let reportPath;
+
+beforeEach(() => {
+  outdir = tmp.dirSync({ prefix: 'ace_out_', unsafeCleanup: true });
+  tmpdir = tmp.dirSync({ prefix: 'ace_tmp_', unsafeCleanup: true });
+  reportPath = path.join(outdir.name, 'report.json');
+});
+
+afterEach(() => {
+  outdir.removeCallback();
+  tmpdir.removeCallback();
+});
+
+/**
+ * Runs `runAce` on the given EPUB and returns the parsed JSON report.
+ *
+ * Failures (of the run, of the assertion or of reading the report) are only
+ * logged: the returned promise then resolves to `undefined`, which the
+ * 'beyond repair' test relies on.
+ */
+function ace(epub, options = {}) {
+  return runAce(epub, Object.assign({
+    outdir: outdir.name,
+    tmp: tmpdir.name,
+  }, options))
+    .then(() => {
+      expect(fs.existsSync(reportPath)).toBeTruthy();
+      return JSON.parse(fs.readFileSync(reportPath, 'utf8'));
+    })
+    .catch(err => console.log(err));
+}
+
+test('well-formed EPUB archive is processed', async () => {
+  const report = await ace(path.join(__dirname, '../data/base-epub-30.epub'));
+  expect(report['earl:result']).toBeDefined();
+});
+
+test('an EPUB archive with an extra comment length is repaired', async () => {
+  const report = await ace(path.join(__dirname, '../data/zip-invalid-comment-length.epub'));
+  expect(report['earl:result']).toBeDefined();
+});
+
+test('an EPUB archive beyond repair is rejected', async () => {
+  const report = await ace(path.join(__dirname, '../data/zip-invalid.epub'));
+  expect(report).toBeUndefined();
+});
diff --git a/tests/data/zip-invalid-comment-length.epub b/tests/data/zip-invalid-comment-length.epub
new file mode 100644
index 00000000..ff6ffc80
Binary files /dev/null and b/tests/data/zip-invalid-comment-length.epub differ
diff --git a/tests/data/zip-invalid.epub b/tests/data/zip-invalid.epub
new file mode 100644
index 00000000..ce013625
--- /dev/null
+++ b/tests/data/zip-invalid.epub
@@ -0,0 +1 @@
+hello