From 2ec6f5bc843c6146e51068f077f211663ccbc867 Mon Sep 17 00:00:00 2001 From: Nathan Kennedy Date: Thu, 8 Apr 2021 14:48:02 +1200 Subject: [PATCH] Sitemap rewrite first commit (experimental) --- cli.js | 7 +- index.js | 6 ++ lib/processors/sitemap.js | 76 ++++++++++++++ lib/runner.js | 208 ++++++++++++++++++++++---------------- test/runner.test.js | 110 +++++++++++--------- test/xml.test.js | 114 +++++++++++++++++++++ 6 files changed, 384 insertions(+), 137 deletions(-) create mode 100644 lib/processors/sitemap.js create mode 100644 test/xml.test.js diff --git a/cli.js b/cli.js index 2633fcd..1501ea4 100755 --- a/cli.js +++ b/cli.js @@ -27,8 +27,9 @@ const commands = { 'clean': createCommand(runner.clean, ['dest']), 'clone-assets': createCommand(runner.clone_assets, ['baseurl', 'dest']), 'reseed': createCommand(runner.build, ['baseurl', 'dest']), - 'rewrite_css': createCommand(runner.rewrite_css, ['baseurl', 'dest']), + 'rewrite-css': createCommand(runner.rewrite_css, ['baseurl', 'dest']), 'rewrite-html': createCommand(runner.rewrite_html, ['baseurl', 'dest']), + 'rewrite-sitemap': createCommand(runner.rewrite_sitemap, ['baseurl', 'dest']), 'serve': createCommand(runner.buildAndServe, ['baseurl', 'dest']), 'watch': createCommand(runner.watch, ['baseurl', 'dest']) /* eslint-enable quote-props */ @@ -93,6 +94,7 @@ module.exports = { const source = flags.source || defaultSrc; const destination = flags.dest; const baseurl = flags.baseurl || ''; + const sitemap = flags.sitemap || 'sitemap.xml'; const port = this.checkPortNumber(flags.port) || defaultPort; const split = flags.split || 1; const partition = flags.partition || 1; @@ -104,7 +106,8 @@ module.exports = { paths: { src: source, dest: destination, - baseurl: baseurl + baseurl: baseurl, + sitemap: sitemap }, serve: { port: port, diff --git a/index.js b/index.js index c5c6fd8..475e6f5 100755 --- a/index.js +++ b/index.js @@ -11,6 +11,7 @@ Commands: clone-assets Clones non CSS and HTML files from src to dest rewrite-css Clones CSS files from src to dest and rewrites urls to include baseurl rewrite-html Clones HTML files from src to dest and rewrites attributes to include baseurl + rewrite-sitemap Finds all sitemap files based on index sitemap, and rewrites links to include baseurl serve Runs 'build' then a local webserver on the dest folder watch Watches the src folder and triggers builds @@ -23,6 +24,7 @@ Options: -b | --baseurl The base-URL to prepend to the files once copied -p | --port The portnumber to serve the cloned site on -e | --extrasrc A list of extra src attributes to be rewritten + -m | --sitemap A path to a valid sitemap file -o | --overwrite When cleaning --dest, don't prompt for confirmation --split The number of partitions to divide files into --partition The partition number to process @@ -57,6 +59,10 @@ const inputs = meow( alias: 'e', isMultiple: true }, + sitemap: { + type: 'string', + alias: 'm' + }, overwrite: { type: 'boolean', alias: 'o' diff --git a/lib/processors/sitemap.js b/lib/processors/sitemap.js new file mode 100644 index 0000000..e5e71c3 --- /dev/null +++ b/lib/processors/sitemap.js @@ -0,0 +1,76 @@ +const fs = require('fs-extra'); +const cheerio = require('cheerio'); +const log = require('fancy-log'); +const { URL } = require('url'); + +let extraPaths = []; + +function rewritePath(baseurl, link) { + try { + const url = new URL(link); + const newPath = `${baseurl}/${url.pathname}`.replace('//', '/'); + return `${url.origin}/${newPath}`; + } catch (urlError) { + return link; + } +} + +function rewriteXML(xml, baseurl) { + const $ = cheerio.load(xml, { + xmlMode: true + }); + + const rootNode = $.root().children()[0]; + const isIndex = rootNode && rootNode.name === 'sitemapindex'; + + $('loc').each(function () { + const $el = $(this); + const originalValue = $el.text(); + const updated = rewritePath(baseurl, originalValue); + $el.text(updated); + + if (isIndex && $el.parent()[0].name === 'sitemap') { + extraPaths.push(originalValue); + } + }); + + $('xhtml\\:link').each(function () { + const $el = $(this); + const originalValue = $el.attr('href'); + const updated = rewritePath(baseurl, originalValue); + $el.attr('href', updated); + }); + return $.xml(); +} + +module.exports = { + rewrite: rewriteXML, + + /** Handles rewriting urls in sitemap(s) + * + * @param {string} file the absolute path to the sitemap file. + * @param {string} destination the absolute path to the destination directory. + * @param {string} baseurl the baseurl to prepend to the source files. + */ + plugin: function (file, destination, baseurl) { + extraPaths = []; + if (!file) { + log.error('Error rewriting XML: Invalid file specified.'); + return 1; + } + if (!destination || !baseurl) { + log.error('Error rewriting XML: No destination specified.'); + return 1; + } + const contents = fs.readFileSync(file); + const xml = contents.toString('utf-8'); + + if (!xml) { + return 0; + } + const rewritten = rewriteXML(xml, baseurl); + + fs.writeFileSync(file, rewritten); + return extraPaths.length ? extraPaths : 0; + } +}; diff --git a/lib/runner.js b/lib/runner.js index 679275f..845baea 100644 --- a/lib/runner.js +++ b/lib/runner.js @@ -3,38 +3,52 @@ const fs = require('fs-extra'); const glob = require('glob'); const Path = require('path'); const readlineSync = require('readline-sync'); -const { promisify } = require('util'); const del = require('del'); const browserSync = require('browser-sync').create(); const chokidar = require('chokidar'); const log = require('fancy-log'); +const { URL } = require('url'); const cssRewrite = require('./processors/css').plugin; const htmlRewrite = require('./processors/html').plugin; +const sitemapRewrite = require('./processors/sitemap').plugin; const regex = { - css: /\.s?css$/, - html: /\.html?$/, + css: /\.s?css$/i, + html: /\.html?$/i, any: /(:?)/ }; -const getFiles = async (path) => { - const stat = promisify(fs.stat); - const stats = await stat(path); +const getFiles = async (dir, searchOptions) => { + const globPattern = searchOptions.globPattern || '/**/*.*'; + const { partition, ignorePatterns } = searchOptions; - if (stats.isSymbolicLink()) { - return []; + const fullGlobPattern = `${dir}/${globPattern}`.replace('//', '/'); + const globOptions = { ...ignorePatterns && { ignore: ignorePatterns } }; + + const files = glob.sync(fullGlobPattern, globOptions); + + if (!files.length) { + log.error('Could not find files 👀'); + return; } - if (stats && stats.isDirectory()) { - const files = glob.sync(path.concat('/**/*.*')); + const { split: _split, partition: _partitionNumber } = { + split: 1, partition: 1, ...partition + }; - const promises = files.map((file) => Path.resolve(file)); - const result = await Promise.all(promises); + const promises = files.map((file) => Path.resolve(file)); + const result = await Promise.all(promises); - return result; - } - return [path]; + const split = Math.max(_split || 1, 1); + const partitionNumber = Math.min(split, Math.max(_partitionNumber || 1, 1)); + const fileCount = result.length; + const partitionSize = fileCount / split; + const partitionStart = (partitionNumber - 1) * partitionSize; + const partitionEnd = partitionStart + partitionSize; + const filePartition = result.slice(partitionStart, partitionEnd); + + return filePartition; }; module.exports = { @@ -43,78 +57,47 @@ module.exports = { * Recursively and asynchronously moves through a directory and * returns the list of files in that directory. * @param {string} dir The current file directory. - * @param {string} [type='any'] The type of files to look for. - * - * Options: "any": returns all files in dir. - * - * "css": returns only .css and .scss files. - * - * "html": returns only .html and .htm files. - * - * "assets": returns all files that "css" and "html" does not return. * @param {object} [partition] Partition information for the fetch. * @param {number} partition.split The number of partitions to split into. * @param {number} partition.partition The partition number to process. * * @return {object} The files grouped by `css`, `html`, and `other`. */ - _fetchFiles: async function (dir, type = 'any', partition) { - try { - const { split: _split, partition: _partitionNumber } = { - split: 1, partition: 1, ...partition - }; - const filesByType = { - css: [], - html: [], - other: [] - }; - - const files = await getFiles(dir); - - const split = Math.max(_split || 1, 1); - const partitionNumber = Math.min(split, Math.max(_partitionNumber || 1, 1)); - const fileCount = files.length; - const partitionSize = fileCount / split; - const partitionStart = (partitionNumber - 1) * partitionSize; - const partitionEnd = partitionStart + partitionSize; - const filePartition = files.slice(partitionStart, partitionEnd); - - const doReturn = { - css: type === 'any' || type === 'css', - html: type === 'any' || type === 'html', - assets: type === 'any' || type === 'assets' - }; - - filePartition.forEach((file) => { - const ext = Path.extname(file); - - if (regex.css.test(ext) && doReturn.css) { - filesByType.css.push(file); - } else if (regex.html.test(ext) && doReturn.html) { - filesByType.html.push(file); - } else if (!regex.css.test(ext) && !regex.html.test(ext) && doReturn.assets) { - filesByType.other.push(file); - } - }); + _fetchAllFiles: async function (dir, partition) { + const filesByType = { + css: [], + html: [], + xml: [], + other: [] + }; - return filesByType; - } catch (err) { - log.error('Could not find files 👀'); - log.error(err); - return undefined; + const files = await getFiles(dir, { partition: partition }); + if (!files || !files.length) { + return; } + + files.forEach((file) => { + const ext = Path.extname(file); + + if (regex.css.test(ext)) { + filesByType.css.push(file); + } else if (regex.html.test(ext)) { + filesByType.html.push(file); + } else if (!regex.css.test(ext) && !regex.html.test(ext)) { + filesByType.other.push(file); + } + }); + + return filesByType; }, /** * Asynchronously copy the files in fileList from source to destination. - * CURRENTLY SLOWEST PART OF PROCESS. + * Currently slowest part of process. * * @param {string[]} fileList The list of files contained in the source. */ _copyFiles: async function (fileList, options) { - const mkdir = promisify(fs.mkdirs); - const copy = promisify(fs.copy); - if (!fileList) { log.error('no files to copy'); return undefined; @@ -124,7 +107,7 @@ module.exports = { const destination = options.paths.fullPathToDest; try { - await mkdir(destination, { recursive: true }); // create the directory cwd/destination + fs.mkdirsSync(destination, { recursive: true }); // create the directory cwd/destination } catch (err) { log.error('Could not create destination directory'); log.error(err); @@ -138,10 +121,10 @@ module.exports = { const newpath = Path.join(destination, stub); try { - await copy(file, newpath, { overwrite: true }); + fs.copySync(file, newpath, { overwrite: true }); copied.push(newpath); } catch (err) { - log.err(err); + log.error(err); } } return copied; @@ -186,7 +169,7 @@ module.exports = { return deletedError; // errored in clean } - const sourceFiles = await this._fetchFiles(options.paths.fullPathToSource, 'any', { + const sourceFiles = await this._fetchAllFiles(options.paths.fullPathToSource, { split: split, partition: partition }); if (!sourceFiles) { @@ -236,11 +219,11 @@ module.exports = { if (!files) { const { flags } = options; const { split, partition } = flags; - const fetchedFiles = await this._fetchFiles(options.paths.fullPathToSource, 'assets', { - split: split, partition: partition + files = await getFiles(options.paths.fullPathToSource, { + partition: { split: split, partition: partition }, + ignorePatterns: ['/**/*.htm?', '/**/*.css'] }); - if (!fetchedFiles) return 1; - files = fetchedFiles.other; + if (!files || !files.length) return 1; } const otherFiles = await this._copyFiles(files, options); @@ -282,11 +265,11 @@ module.exports = { const { split, partition } = flags; if (!files) { - const fetchedFiles = await this._fetchFiles(options.paths.fullPathToSource, 'css', { - split: split, partition: partition + files = await getFiles(options.paths.fullPathToSource, { + partition: { split: split, partition: partition }, + globPattern: '/**/*.css' }); - if (!fetchedFiles) return 1; - files = fetchedFiles.css; + if (!files || !files.length) return 1; } log('copying...'); @@ -320,11 +303,11 @@ module.exports = { const { split, partition } = flags; if (!files) { - const fetchedFiles = await this._fetchFiles(options.paths.fullPathToSource, 'html', { - split: split, partition: partition + files = await getFiles(options.paths.fullPathToSource, { + partition: { split: split, partition: partition }, + globPattern: '/**/*.htm?' }); - if (!fetchedFiles) return 1; - files = fetchedFiles.html; + if (!files || !files.length) return 1; } const copiedFiles = await this._copyFiles(files, options); if (!copiedFiles) { @@ -345,6 +328,57 @@ module.exports = { return 0; }, + /** + * Rewrites urls in sitemap file(s) to include baseurl. + * If files is truthy, then takes the files in files, copies them into dest/baseurl + * using copyFiles, and rewrites the contents so that urls and hrefs referencing local + * content have baseurl prepended to them. + * If files is null, then fetch-files is called first to obtain only the html files in src. + * + * @param {Object} options The options object. + * @param {[String]} files The list of files to rewrite (default = null). + * @returns {[String]} The copied files (TODO). + */ + rewrite_sitemap: async function (options, files = null) { + if (!files) { + files = await getFiles(options.paths.fullPathToSource, + { globPattern: options.paths.sitemap }); + if (!files || !files.length) return 1; + } + const copiedFiles = await this._copyFiles(files, options); + if (!copiedFiles) { + return 1; + } + + if (options.fromIndex) { + copiedFiles.forEach((file) => { + const exit = sitemapRewrite(file, options.paths.fullPathToDest, options.paths.baseurl); + if (exit > 0) return exit; // if error + }); + } else { + log('rewriting sitemap...'); + const file = copiedFiles[0]; + const exit = sitemapRewrite(file, options.paths.fullPathToDest, options.paths.baseurl); + if (Array.isArray(exit) && !options.fromIndex) { + options.fromIndex = true; + + const linkedFiles = exit.reduce((pathList, path) => { + try { + const url = new URL(path); + pathList.push(url.pathname); + return pathList; + } catch (urlError) { + return pathList; + } + }, []); + + return this.rewrite_sitemap(options, linkedFiles); + } + } + + return 0; + }, + /** * Serves the files on a local webserver, so that they may be viewed on a browser. * diff --git a/test/runner.test.js b/test/runner.test.js index e8f3492..b6e31a2 100644 --- a/test/runner.test.js +++ b/test/runner.test.js @@ -32,13 +32,14 @@ testOp.paths.fullPathToSource = testOp.paths.src; testOp.paths.fullPathToDest = path.resolve(testOp.paths.dest, 'baseurl'); Object.freeze(testOp); -describe('_fetchFiles', function () { +describe('_fetchAllFiles', function () { before(function () { mock({ testDir: { 'image.jpg': 'imgdata', 'style.css': 'css', 'index.html': 'html', + 'sitemap.xml': 'sitemap', emptyDir: { emptierDir: {} }, assets: { 'image2.jpg': 'imgdata' @@ -55,41 +56,12 @@ describe('_fetchFiles', function () { context('type = any', function () { it('should retrieve all files', async function () { - const results = await runner._fetchFiles('testDir', 'any'); + const results = await runner._fetchAllFiles('testDir'); expect(results.css.length).to.equal(2); - expect(results.other.length).to.equal(2); + expect(results.other.length).to.equal(3); expect(results.html.length).to.equal(2); }); }); - context('type = css', function () { - it('should retrieve all css files', async function () { - const results = await runner._fetchFiles('testDir', 'css'); - expect(results.css.length).to.equal(2); - expect(results.html.length).to.equal(0); - expect(results.other.length).to.equal(0); - expect(results.css.every((file) => path.extname(file) === '.css')).to.equal(true); - }); - }); - - context('type = html', function () { - it('should retrieve all html files', async function () { - const results = await runner._fetchFiles('testDir', 'html'); - expect(results.html.length).to.equal(2); - expect(results.css.length).to.equal(0); - expect(results.other.length).to.equal(0); - expect(results.html.every((file) => path.extname(file) === '.html')).to.equal(true); - }); - }); - - context('type = assets', function () { - it('should retrieve all files', async function () { - const results = await runner._fetchFiles('testDir', 'assets'); - expect(results.other.length).to.equal(2); - expect(results.css.length).to.equal(0); - expect(results.html.length).to.equal(0); - expect(results.other.every((file) => path.extname(file) === '.jpg')).to.equal(true); - }); - }); context('partitions', function () { const getPartitionFiles = (partition) => ( @@ -97,30 +69,30 @@ describe('_fetchFiles', function () { ); before(async function () { - this.defaultPartition = await runner._fetchFiles('testDir', 'any'); - this.partition1 = await runner._fetchFiles('testDir', 'any', { split: 2, partition: 1 }); - this.partition2 = await runner._fetchFiles('testDir', 'any', { split: 2, partition: 2 }); + this.defaultPartition = await runner._fetchAllFiles('testDir'); + this.partition1 = await runner._fetchAllFiles('testDir', { split: 2, partition: 1 }); + this.partition2 = await runner._fetchAllFiles('testDir', { split: 2, partition: 2 }); }); it('prevents invalid `split` or `partition` value', async function () { - const partition = await runner._fetchFiles('testDir', 'any', { split: 0, partition: 0 }); + const partition = await runner._fetchAllFiles('testDir', { split: 0, partition: 0 }); const files = getPartitionFiles(partition); - expect(files.length).to.equal(6); + expect(files.length).to.equal(7); }); it('prevents undefined `split` or `partition` value', async function () { - const partition = await runner._fetchFiles('testDir', 'any', { split: undefined, partition: undefined }); + const partition = await runner._fetchAllFiles('testDir', { split: undefined, partition: undefined }); const files = getPartitionFiles(partition); - expect(files.length).to.equal(6); + expect(files.length).to.equal(7); }); it('ensured `partition` is not greater than `split`', async function () { - const partition = await runner._fetchFiles('testDir', 'any', { split: 1, partition: 2 }); + const partition = await runner._fetchAllFiles('testDir', { split: 1, partition: 2 }); const files = getPartitionFiles(partition); - expect(files.length).to.equal(6); + expect(files.length).to.equal(7); }); it('should match default behaviour', async function () { @@ -136,7 +108,7 @@ describe('_fetchFiles', function () { const partition2Files = getPartitionFiles(this.partition2); expect(partition1Files.length).to.equal(3); - expect(partition2Files.length).to.equal(3); + expect(partition2Files.length).to.equal(4); }); it('should not have duplicate files', async function () { @@ -150,9 +122,8 @@ describe('_fetchFiles', function () { context('dir doesnt exist', function () { it('should throw an error', async function () { - const results = await runner._fetchFiles('test/fakeDir'); - expect(results).to.equal(undefined); - // expect(await function() {runner._fetchFiles('test/fakeDir')}).to.throw(); + const results = await runner._fetchAllFiles('test/fakeDir'); + expect(results).to.eq(undefined); }); }); @@ -228,7 +199,7 @@ describe('build', function () { before(function () { cleanStub = sinon.stub(runner, 'clean'); - fetchStub = sinon.stub(runner, '_fetchFiles'); + fetchStub = sinon.stub(runner, '_fetchAllFiles'); cloneAssetsStub = sinon.stub(runner, 'clone_assets'); rewriteCssStub = sinon.stub(runner, 'rewrite_css'); rewriteHtmlStub = sinon.stub(runner, 'rewrite_html'); @@ -493,12 +464,12 @@ describe('rewrite_html', function () { let copyFilesStub; before(function () { - fetchFileStub = sinon.stub(runner, '_fetchFiles'); + fetchFileStub = sinon.stub(runner, '_fetchAllFiles'); copyFilesStub = sinon.stub(runner, '_copyFiles'); }); context('No files to copy', function () { - context('_fetchFiles fails', function () { + context('_fetchAllFiles fails', function () { before(function () { fetchFileStub.returns(); }); @@ -563,6 +534,49 @@ describe('rewrite_html', function () { }); }); +describe('rewrite_sitemap()', function () { + context('', function () { + before(function () { + mock({ + 'test/src': { + 'sitemapindex.xml': ` + + + + http://example.org/sitemaps/pagelist.xml + 2016-11-11T00:00:00+13:00 + + + + http://example.org/sitemaps/morepages.xml + 2016-11-11T00:00:00+13:00 + + sitemap> + dudlink/map.xml + 2016-11-11T00:00:00+13:00 + + `, + sitemaps: { + 'pagelist.xml': 'xml', + 'morepages.xml': 'xml' + } + } + }); + }); + + it('should return the cloned files', async function () { + const options = cloneObject(testOp); + options.paths.sitemap = 'sitemapindex.xml'; + const results = await runner.rewrite_sitemap(options); + expect(results).to.equal(0); + }); + + after(function () { + mock.restore(); + }); + }); +}); + describe('serve', function () { }); diff --git a/test/xml.test.js b/test/xml.test.js new file mode 100644 index 0000000..9198e92 --- /dev/null +++ b/test/xml.test.js @@ -0,0 +1,114 @@ +/* eslint-disable prefer-arrow-callback */ +const { expect } = require('chai'); +const mock = require('mock-fs'); +const xmlRewrite = require('../lib/processors/sitemap'); + +describe('rewrite xml', function () { + context('elements with some src attribute', function () { + it('should rewrite the url in each loc node', function () { + const xmlString = ` + + + + http://example.org/advice/ + 2016-11-11T00:00:00+13:00 + + + + + + http://example.org/about/our-people/sam + 2016-11-11T00:00:00+13:00 + + `; + const expectedXmlString = ` + + + + http://example.org/testBaseurl/advice/ + 2016-11-11T00:00:00+13:00 + + + + + + http://example.org/testBaseurl/about/our-people/sam + 2016-11-11T00:00:00+13:00 + + `; + const rewrittenElement = xmlRewrite.rewrite(xmlString, 'testBaseurl'); + expect(rewrittenElement).to.equal(expectedXmlString); + }); + }); +}); + +describe('plugin', function () { + before(function () { + mock({ + 'sitemapindex.xml': ` + + + + http://example.org/sitemaps/pagelist.xml + 2016-11-11T00:00:00+13:00 + + + + http://example.org/sitemaps/morepages.xml + 2016-11-11T00:00:00+13:00 + + `, + emptySitemap: { + 'sitemap.xml': '' + }, + testDir: { + 'sitemap.xml': '' + } + }); + }); + + context('elements with some src attribute', function () { + it('should rewrite the url in each loc node', function () { + const fileList = xmlRewrite.plugin('sitemapindex.xml', 'testDir/destTest', 'testbase'); + expect(fileList).to.deep.equal(['http://example.org/sitemaps/pagelist.xml', 'http://example.org/sitemaps/morepages.xml']); + }); + }); + + context('valid sitemap file', function () { + it('Should return 0', function () { + const file = 'testDir/sitemap.xml'; + const destTest = 'testDir/destTest'; + expect(xmlRewrite.plugin(file, destTest, 'testbase')).to.equal(0); + }); + }); + + context('empty sitemap file', function () { + it('Should return 0', function () { + const file = 'emptySitemap/sitemap.xml'; + const destTest = 'emptySitemap/destTest'; + expect(xmlRewrite.plugin(file, destTest, 'testbase')).to.equal(0); + }); + }); + + context('No file specified', function () { + it('should return 1', function () { + expect(xmlRewrite.plugin('', 'dest', 'testbase')).to.equal(1); + }); + }); + + context('No destination specified', function () { + it('Should return 1', function () { + expect(xmlRewrite.plugin('filename', '', 'baseurl')).to.equal(1); + }); + }); + + context('No baseurl', function () { + it('should return 1', function () { + expect(xmlRewrite.plugin('filename', 'dest', null)).to.equal(1); + }); + }); + + after(function () { + mock.restore(); + }); +});