From ac950f16b7dfcdb604c4452dfa2427201cc58059 Mon Sep 17 00:00:00 2001 From: TJ Simons Date: Mon, 19 Mar 2018 15:15:55 -0500 Subject: [PATCH 01/10] Chunk the requests to download media objects from WP. The blog I work on has over 9,000 media objects and currently, it tries to download them all. This PR chunks them in groups of 100, but that setting can be increased. --- packages/gatsby-source-wordpress/README.md | 57 +++++------ .../src/gatsby-node.js | 2 + .../gatsby-source-wordpress/src/normalize.js | 32 ++++++- yarn.lock | 95 ++++++++++++++++++- 4 files changed, 156 insertions(+), 30 deletions(-) diff --git a/packages/gatsby-source-wordpress/README.md b/packages/gatsby-source-wordpress/README.md index fb74b438842a6..b822a547badb1 100644 --- a/packages/gatsby-source-wordpress/README.md +++ b/packages/gatsby-source-wordpress/README.md @@ -42,16 +42,16 @@ We welcome PRs adding support for data from other plugins. // In your gatsby-config.js plugins: [ /* - * Gatsby's data processing layer begins with “source” - * plugins. Here the site sources its data from Wordpress. - */ + * Gatsby's data processing layer begins with “source” + * plugins. Here the site sources its data from Wordpress. + */ { resolve: "gatsby-source-wordpress", options: { /* - * The base URL of the Wordpress site without the trailingslash and the protocol. This is required. - * Example : 'gatsbyjsexamplewordpress.wordpress.com' or 'www.example-site.com' - */ + * The base URL of the Wordpress site without the trailingslash and the protocol. This is required. + * Example : 'gatsbyjsexamplewordpress.wordpress.com' or 'www.example-site.com' + */ baseUrl: "gatsbyjsexamplewordpress.wordpress.com", // The protocol. This can be http or https. protocol: "http", @@ -88,9 +88,12 @@ plugins: [ sourceUrl: "https://source-url.com", replacementUrl: "https://replacement-url.com", }, + // How many media objects should be downloaded in parallel. Higher = faster + // OSX defaults to 256 max open connections, you cannot surpass that unless you increase the restriction + chunkSize: 100, }, }, -]; +] ``` ## WordPress Plugins @@ -470,10 +473,10 @@ To learn more about image processing check ## Site's `gatsby-node.js` example ```javascript -const _ = require(`lodash`); -const Promise = require(`bluebird`); -const path = require(`path`); -const slash = require(`slash`); +const _ = require(`lodash`) +const Promise = require(`bluebird`) +const path = require(`path`) +const slash = require(`slash`) // Implement the Gatsby API “createPages”. This is // called after the Gatsby bootstrap is finished so you have @@ -482,7 +485,7 @@ const slash = require(`slash`); // Will create pages for WordPress pages (route : /{slug}) // Will create pages for WordPress posts (route : /post/{slug}) exports.createPages = ({ graphql, boundActionCreators }) => { - const { createPage } = boundActionCreators; + const { createPage } = boundActionCreators return new Promise((resolve, reject) => { // The “graphql” function allows us to run arbitrary // queries against the local WordPress graphql schema. Think of @@ -508,12 +511,12 @@ exports.createPages = ({ graphql, boundActionCreators }) => { ) .then(result => { if (result.errors) { - console.log(result.errors); - reject(result.errors); + console.log(result.errors) + reject(result.errors) } // Create Page pages. - const pageTemplate = path.resolve("./src/templates/page.js"); + const pageTemplate = path.resolve("./src/templates/page.js") // We want to create a detailed page for each // page node. We'll just use the WordPress Slug for the slug. // The Page ID is prefixed with 'PAGE_' @@ -531,8 +534,8 @@ exports.createPages = ({ graphql, boundActionCreators }) => { context: { id: edge.node.id, }, - }); - }); + }) + }) }) // ==== END PAGES ==== @@ -556,10 +559,10 @@ exports.createPages = ({ graphql, boundActionCreators }) => { ` ).then(result => { if (result.errors) { - console.log(result.errors); - reject(result.errors); + console.log(result.errors) + reject(result.errors) } - const postTemplate = path.resolve("./src/templates/post.js"); + const postTemplate = path.resolve("./src/templates/post.js") // We want to create a detailed page for each // post node. We'll just use the WordPress Slug for the slug. // The Post ID is prefixed with 'POST_' @@ -570,12 +573,12 @@ exports.createPages = ({ graphql, boundActionCreators }) => { context: { id: edge.node.id, }, - }); - }); - resolve(); - }); - }); + }) + }) + resolve() + }) + }) // ==== END POSTS ==== - }); -}; + }) +} ``` diff --git a/packages/gatsby-source-wordpress/src/gatsby-node.js b/packages/gatsby-source-wordpress/src/gatsby-node.js index 37ad22f50370f..24a6667051f16 100644 --- a/packages/gatsby-source-wordpress/src/gatsby-node.js +++ b/packages/gatsby-source-wordpress/src/gatsby-node.js @@ -28,6 +28,7 @@ exports.sourceNodes = async ( verboseOutput, perPage = 100, searchAndReplaceContentUrls = {}, + chunkSize = 100, } ) => { const { createNode } = boundActionCreators @@ -92,6 +93,7 @@ exports.sourceNodes = async ( cache, createNode, _auth, + chunkSize, }) // Search and replace Content Urls diff --git a/packages/gatsby-source-wordpress/src/normalize.js b/packages/gatsby-source-wordpress/src/normalize.js index 59cdd1bc32b04..b041311f409da 100644 --- a/packages/gatsby-source-wordpress/src/normalize.js +++ b/packages/gatsby-source-wordpress/src/normalize.js @@ -379,7 +379,7 @@ exports.mapEntitiesToMedia = entities => { } // Downloads media files and removes "sizes" data as useless in Gatsby context. -exports.downloadMediaFiles = async ({ +const downloadMediaFilesChunk = async ({ entities, store, cache, @@ -412,6 +412,36 @@ exports.downloadMediaFiles = async ({ }) ) +// chunk the entities from the wordpress /media/ endpoint and process them as noted above +exports.downloadMediaFiles = async ({ + entities, + store, + cache, + createNode, + _auth, + chunkSize, +}) => { + const chunks = _.chunk(entities, chunkSize) + let processed = [] + + while (chunks.length) { + const chunk = chunks.shift() + + const entitiesChunk = await downloadMediaFilesChunk({ + entities: chunk, + store, + cache, + createNode, + _auth, + chunkSize, + }) + + processed = processed.concat(entitiesChunk) + } + + return processed +} + const prepareACFChildNodes = ( obj, entityId, diff --git a/yarn.lock b/yarn.lock index db242834436d2..84b57e0e30a62 100644 --- a/yarn.lock +++ b/yarn.lock @@ -10,6 +10,12 @@ esutils "^2.0.2" js-tokens "^3.0.0" +"@babel/code-frame@7.0.0-beta.42": + version "7.0.0-beta.42" + resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.0.0-beta.42.tgz#a9c83233fa7cd06b39dc77adbb908616ff4f1962" + dependencies: + "@babel/highlight" "7.0.0-beta.42" + "@babel/code-frame@^7.0.0-beta.35": version "7.0.0-beta.39" resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.0.0-beta.39.tgz#91c90bb65207fc5a55128cb54956ded39e850457" @@ -18,6 +24,16 @@ esutils "^2.0.2" js-tokens "^3.0.0" +"@babel/generator@7.0.0-beta.42": + version "7.0.0-beta.42" + resolved "https://registry.yarnpkg.com/@babel/generator/-/generator-7.0.0-beta.42.tgz#777bb50f39c94a7e57f73202d833141f8159af33" + dependencies: + "@babel/types" "7.0.0-beta.42" + jsesc "^2.5.1" + lodash "^4.2.0" + source-map "^0.5.0" + trim-right "^1.0.1" + "@babel/helper-function-name@7.0.0-beta.36": version "7.0.0-beta.36" resolved "https://registry.yarnpkg.com/@babel/helper-function-name/-/helper-function-name-7.0.0-beta.36.tgz#366e3bc35147721b69009f803907c4d53212e88d" @@ -26,12 +42,47 @@ "@babel/template" "7.0.0-beta.36" "@babel/types" "7.0.0-beta.36" +"@babel/helper-function-name@7.0.0-beta.42": + version "7.0.0-beta.42" + resolved "https://registry.yarnpkg.com/@babel/helper-function-name/-/helper-function-name-7.0.0-beta.42.tgz#b38b8f4f85168d1812c543dd700b5d549b0c4658" + dependencies: + "@babel/helper-get-function-arity" "7.0.0-beta.42" + "@babel/template" "7.0.0-beta.42" + "@babel/types" "7.0.0-beta.42" + "@babel/helper-get-function-arity@7.0.0-beta.36": version "7.0.0-beta.36" resolved "https://registry.yarnpkg.com/@babel/helper-get-function-arity/-/helper-get-function-arity-7.0.0-beta.36.tgz#f5383bac9a96b274828b10d98900e84ee43e32b8" dependencies: "@babel/types" "7.0.0-beta.36" +"@babel/helper-get-function-arity@7.0.0-beta.42": + version "7.0.0-beta.42" + resolved "https://registry.yarnpkg.com/@babel/helper-get-function-arity/-/helper-get-function-arity-7.0.0-beta.42.tgz#ad072e32f912c033053fc80478169aeadc22191e" + dependencies: + "@babel/types" "7.0.0-beta.42" + +"@babel/helper-split-export-declaration@7.0.0-beta.42": + version "7.0.0-beta.42" + resolved "https://registry.yarnpkg.com/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.0.0-beta.42.tgz#0d0d5254220a9cc4e7e226240306b939dc210ee7" + dependencies: + "@babel/types" "7.0.0-beta.42" + +"@babel/highlight@7.0.0-beta.42": + version "7.0.0-beta.42" + resolved "https://registry.yarnpkg.com/@babel/highlight/-/highlight-7.0.0-beta.42.tgz#a502a1c0d6f99b2b0e81d468a1b0c0e81e3f3623" + dependencies: + chalk "^2.0.0" + esutils "^2.0.2" + js-tokens "^3.0.0" + +"@babel/runtime@^7.0.0-beta.40": + version "7.0.0-beta.42" + resolved "https://registry.yarnpkg.com/@babel/runtime/-/runtime-7.0.0-beta.42.tgz#352e40c92e0460d3e82f49bd7e79f6cda76f919f" + dependencies: + core-js "^2.5.3" + regenerator-runtime "^0.11.1" + "@babel/template@7.0.0-beta.36": version "7.0.0-beta.36" resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.0.0-beta.36.tgz#02e903de5d68bd7899bce3c5b5447e59529abb00" @@ -41,6 +92,15 @@ babylon "7.0.0-beta.36" lodash "^4.2.0" +"@babel/template@7.0.0-beta.42": + version "7.0.0-beta.42" + resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.0.0-beta.42.tgz#7186d4e70d44cdec975049ba0a73bdaf5cdee052" + dependencies: + "@babel/code-frame" "7.0.0-beta.42" + "@babel/types" "7.0.0-beta.42" + babylon "7.0.0-beta.42" + lodash "^4.2.0" + "@babel/traverse@7.0.0-beta.36": version "7.0.0-beta.36" resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.0.0-beta.36.tgz#1dc6f8750e89b6b979de5fe44aa993b1a2192261" @@ -54,6 +114,21 @@ invariant "^2.2.0" lodash "^4.2.0" +"@babel/traverse@^7.0.0-beta.40": + version "7.0.0-beta.42" + resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.0.0-beta.42.tgz#f4bf4d1e33d41baf45205e2d0463591d57326285" + dependencies: + "@babel/code-frame" "7.0.0-beta.42" + "@babel/generator" "7.0.0-beta.42" + "@babel/helper-function-name" "7.0.0-beta.42" + "@babel/helper-split-export-declaration" "7.0.0-beta.42" + "@babel/types" "7.0.0-beta.42" + babylon "7.0.0-beta.42" + debug "^3.1.0" + globals "^11.1.0" + invariant "^2.2.0" + lodash "^4.2.0" + "@babel/types@7.0.0-beta.36": version "7.0.0-beta.36" resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.0.0-beta.36.tgz#64f2004353de42adb72f9ebb4665fc35b5499d23" @@ -62,6 +137,14 @@ lodash "^4.2.0" to-fast-properties "^2.0.0" +"@babel/types@7.0.0-beta.42": + version "7.0.0-beta.42" + resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.0.0-beta.42.tgz#1e2118767684880f6963801b272fd2b3348efacc" + dependencies: + esutils "^2.0.2" + lodash "^4.2.0" + to-fast-properties "^2.0.0" + "@types/configstore@^2.1.1": version "2.1.1" resolved "https://registry.yarnpkg.com/@types/configstore/-/configstore-2.1.1.tgz#cd1e8553633ad3185c3f2f239ecff5d2643e92b6" @@ -1592,6 +1675,10 @@ babylon@7.0.0-beta.36: version "7.0.0-beta.36" resolved "https://registry.yarnpkg.com/babylon/-/babylon-7.0.0-beta.36.tgz#3a3683ba6a9a1e02b0aa507c8e63435e39305b9e" +babylon@7.0.0-beta.42, babylon@^7.0.0-beta.40: + version "7.0.0-beta.42" + resolved "https://registry.yarnpkg.com/babylon/-/babylon-7.0.0-beta.42.tgz#67cfabcd4f3ec82999d29031ccdea89d0ba99657" + babylon@^6.17.2, babylon@^6.17.3, babylon@^6.18.0: version "6.18.0" resolved "https://registry.yarnpkg.com/babylon/-/babylon-6.18.0.tgz#af2f3b88fa6f5c1e4c634d1a0f8eac4f55b395e3" @@ -3042,7 +3129,7 @@ core-js@^1.0.0, core-js@^1.2.6: version "1.2.7" resolved "https://registry.yarnpkg.com/core-js/-/core-js-1.2.7.tgz#652294c14651db28fa93bd2d5ff2983a4f08c636" -core-js@^2.4.0, core-js@^2.4.1, core-js@^2.5.0, core-js@^2.5.1: +core-js@^2.4.0, core-js@^2.4.1, core-js@^2.5.0, core-js@^2.5.1, core-js@^2.5.3: version "2.5.3" resolved "https://registry.yarnpkg.com/core-js/-/core-js-2.5.3.tgz#8acc38345824f16d8365b7c9b4259168e8ed603e" @@ -7346,6 +7433,10 @@ jsesc@^1.3.0: version "1.3.0" resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-1.3.0.tgz#46c3fec8c1892b12b0833db9bc7622176dbab34b" +jsesc@^2.5.1: + version "2.5.1" + resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-2.5.1.tgz#e421a2a8e20d6b0819df28908f782526b96dd1fe" + jsesc@~0.5.0: version "0.5.0" resolved "https://registry.yarnpkg.com/jsesc/-/jsesc-0.5.0.tgz#e7dee66e35d6fc16f710fe91d5cf69f70f08911d" @@ -11348,7 +11439,7 @@ regenerator-runtime@^0.10.5: version "0.10.5" resolved "https://registry.yarnpkg.com/regenerator-runtime/-/regenerator-runtime-0.10.5.tgz#336c3efc1220adcedda2c9fab67b5a7955a33658" -regenerator-runtime@^0.11.0: +regenerator-runtime@^0.11.0, regenerator-runtime@^0.11.1: version "0.11.1" resolved "https://registry.yarnpkg.com/regenerator-runtime/-/regenerator-runtime-0.11.1.tgz#be05ad7f9bf7d22e056f9726cee5017fbf19e2e9" From 638cf43195279e6091cffc949fc72f07c25e4ce4 Mon Sep 17 00:00:00 2001 From: TJ Simons Date: Mon, 19 Mar 2018 15:22:55 -0500 Subject: [PATCH 02/10] Remove prettier forrmatting from the readme --- packages/gatsby-source-wordpress/README.md | 54 +++++++++++----------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/packages/gatsby-source-wordpress/README.md b/packages/gatsby-source-wordpress/README.md index b822a547badb1..ccb7683321568 100644 --- a/packages/gatsby-source-wordpress/README.md +++ b/packages/gatsby-source-wordpress/README.md @@ -42,16 +42,16 @@ We welcome PRs adding support for data from other plugins. // In your gatsby-config.js plugins: [ /* - * Gatsby's data processing layer begins with “source” - * plugins. Here the site sources its data from Wordpress. - */ + * Gatsby's data processing layer begins with “source” + * plugins. Here the site sources its data from Wordpress. + */ { resolve: "gatsby-source-wordpress", options: { /* - * The base URL of the Wordpress site without the trailingslash and the protocol. This is required. - * Example : 'gatsbyjsexamplewordpress.wordpress.com' or 'www.example-site.com' - */ + * The base URL of the Wordpress site without the trailingslash and the protocol. This is required. + * Example : 'gatsbyjsexamplewordpress.wordpress.com' or 'www.example-site.com' + */ baseUrl: "gatsbyjsexamplewordpress.wordpress.com", // The protocol. This can be http or https. protocol: "http", @@ -93,7 +93,7 @@ plugins: [ chunkSize: 100, }, }, -] +]; ``` ## WordPress Plugins @@ -473,10 +473,10 @@ To learn more about image processing check ## Site's `gatsby-node.js` example ```javascript -const _ = require(`lodash`) -const Promise = require(`bluebird`) -const path = require(`path`) -const slash = require(`slash`) +const _ = require(`lodash`); +const Promise = require(`bluebird`); +const path = require(`path`); +const slash = require(`slash`); // Implement the Gatsby API “createPages”. This is // called after the Gatsby bootstrap is finished so you have @@ -485,7 +485,7 @@ const slash = require(`slash`) // Will create pages for WordPress pages (route : /{slug}) // Will create pages for WordPress posts (route : /post/{slug}) exports.createPages = ({ graphql, boundActionCreators }) => { - const { createPage } = boundActionCreators + const { createPage } = boundActionCreators; return new Promise((resolve, reject) => { // The “graphql” function allows us to run arbitrary // queries against the local WordPress graphql schema. Think of @@ -511,12 +511,12 @@ exports.createPages = ({ graphql, boundActionCreators }) => { ) .then(result => { if (result.errors) { - console.log(result.errors) - reject(result.errors) + console.log(result.errors); + reject(result.errors); } // Create Page pages. - const pageTemplate = path.resolve("./src/templates/page.js") + const pageTemplate = path.resolve("./src/templates/page.js"); // We want to create a detailed page for each // page node. We'll just use the WordPress Slug for the slug. // The Page ID is prefixed with 'PAGE_' @@ -534,8 +534,8 @@ exports.createPages = ({ graphql, boundActionCreators }) => { context: { id: edge.node.id, }, - }) - }) + }); + }); }) // ==== END PAGES ==== @@ -559,10 +559,10 @@ exports.createPages = ({ graphql, boundActionCreators }) => { ` ).then(result => { if (result.errors) { - console.log(result.errors) - reject(result.errors) + console.log(result.errors); + reject(result.errors); } - const postTemplate = path.resolve("./src/templates/post.js") + const postTemplate = path.resolve("./src/templates/post.js"); // We want to create a detailed page for each // post node. We'll just use the WordPress Slug for the slug. // The Post ID is prefixed with 'POST_' @@ -573,12 +573,12 @@ exports.createPages = ({ graphql, boundActionCreators }) => { context: { id: edge.node.id, }, - }) - }) - resolve() - }) - }) + }); + }); + resolve(); + }); + }); // ==== END POSTS ==== - }) -} + }); +}; ``` From 4c30e8bdaad4ff78970726dd55c233f14606bdf7 Mon Sep 17 00:00:00 2001 From: TJ Simons Date: Fri, 23 Mar 2018 14:41:27 -0500 Subject: [PATCH 03/10] Clean up and document create-remote-file-node Add Better Queue for more control over processing --- .../src/create-remote-file-node.js | 351 +++++++++++++----- .../src/get-max-file-lock.js | 20 + 2 files changed, 281 insertions(+), 90 deletions(-) create mode 100644 packages/gatsby-source-filesystem/src/get-max-file-lock.js diff --git a/packages/gatsby-source-filesystem/src/create-remote-file-node.js b/packages/gatsby-source-filesystem/src/create-remote-file-node.js index 9908bd4f7576b..d8fc32b7cd3d5 100644 --- a/packages/gatsby-source-filesystem/src/create-remote-file-node.js +++ b/packages/gatsby-source-filesystem/src/create-remote-file-node.js @@ -3,15 +3,270 @@ const got = require(`got`) const crypto = require(`crypto`) const path = require(`path`) const { isWebUri } = require(`valid-url`) +const Queue = require(`better-queue`) const { createFileNode } = require(`./create-file-node`) +const getMaxFileLock = require(`./get-max-file-lock`) const cacheId = url => `create-remote-file-node-${url}` +/******************** + * Type Definitions * + ********************/ + +/** + * @typedef {Redux} + * @see [Redux Docs]{@link https://redux.js.org/api-reference} + */ + +/** + * @typedef {GatsbyCache} + * @see gatsby/packages/gatsby/utils/cache.js + */ + +/** + * @typedef {Auth} + * @type {Object} + * @property {String} htaccess_pass + * @property {String} htaccess_user + */ + +/** + * @typedef {CRFNPayload} + * @typedef {Object} + * @description Create Remote File Node Payload + * + * @param {String} options.url + * @param {Redux} options.store + * @param {GatsbyCache} options.cache + * @param {Function} options.createNode + * @param {Auth} [options.auth] + */ + +/********* + * utils * + *********/ + +/** + * createHash + * -- + * + * Create an md5 hash of the given str + * @param {Stringq} str + * @return {String} + */ +const createHash = (str) => crypto + .createHash(`md5`) + .update(str) + .digest(`hex`) + + +const CACHE_DIR = `.cache` +const FS_PLUGIN_DIR = `gatsby-source-filesystem` + +/** + * createFilePath + * -- + * + * @param {String} directory + * @param {String} filename + * @param {String} url + * @return {String} + */ +const createFilePath = (directory, filename, ext) => path.join( + directory, + CACHE_DIR, + FS_PLUGIN_DIR, + `${filename}${ext}` +) + + +/******************** + * Queue Management * + ********************/ + +/** + * Queue + * Use the task's url as the id + * When pushing a task with a similar id, prefer the original task + * as it's already in the processing cache + */ +const queue = new Queue(pushToQueue, { + id: `url`, + merge: (old, _, cb) => cb(old), + batchSize: 200, +}) + +// Detetmine the max file descriptors on the users machine +// Then set the batch size to be 3/4 of that becuase the user +// will most likely have files open already +getMaxFileLock().then((max) => { + queue.batchSize = max * .75 +}) + +/** + * @callback {Queue~queueCallback} + * @param {*} error + * @param {*} result + */ + +/** + * pushToQueue + * -- + * Handle tasks that are pushed in to the Queue + * + * + * @param {CRFNPayload} task + * @param {Queue~queueCallback} cb + * @return {Promise} + */ +async function pushToQueue (task, cb) { + const node = await processRemoteNode(task) + return cb(null, node) +} + + +/****************** + * Core Functions * + ******************/ + +/** + * requestRemoteNode + * -- + * Download the requested file + * + * @param {String} url + * @param {Headers} headers + * @param {String} tmpFilename + * @param {String} filename + * @return {Promise} Resolves with the [http Result Object]{@link https://nodejs.org/api/http.html#http_class_http_serverresponse} + */ +const requestRemoteNode = (url, headers, tmpFilename, filename) => new Promise((resolve, reject) => { + let responseError = false + const responseStream = got.stream(url, headers) + responseStream.pipe(fs.createWriteStream(tmpFilename)) + responseStream.on(`downloadProgress`, pro => console.log(pro)) + + // If there's a 400/500 response or other error. + responseStream.on(`error`, (error, body, response) => { + responseError = true + fs.removeSync(tmpFilename) + reject({ error, body, response }) + }) + + responseStream.on(`end`, response => { + if (responseError) return + + resolve(response) + }) +}) + + +/** + * processRemoteNode + * -- + * Request the remote file and return the fileNode + * + * @param {CRFNPayload} options + * @return {Promise} Resolves with the fileNode + */ +async function processRemoteNode ({ url, store, cache, createNode, auth = {} }) { + // Ensure our cache directory exists. + const programDir = store.getState().program.directory + await fs.ensureDir( + path.join( + programDir, + `.cache`, + `gatsby-source-filesystem` + ) + ) + + // See if there's response headers for this url + // from a previous request. + const cachedHeaders = await cache.get(cacheId(url)) + const headers = {} + + // Add htaccess authentication if passed in. This isn't particularly + // extensible. We should define a proper API that we validate. + if (auth && auth.htaccess_pass && auth.htaccess_user) { + headers.auth = `${auth.htaccess_user}:${auth.htaccess_pass}` + } + + if (cachedHeaders && cachedHeaders.etag) { + headers[`If-None-Match`] = cachedHeaders.etag + } + + // Create the temp and permanent file names for the url. + const digest = createHash(url) + const ext = path.parse(url).ext + + const tmpFilename = createFilePath(programDir, `tmp-${digest}`, ext) + const filename = createFilePath(programDir, digest, ext) + + // Fetch the file. + // Let the consumer handle thrown errors + const response = await requestRemoteNode(url, headers, tmpFilename, filename) + + // Save the response headers for future requests. + cache.set(cacheId(url), response.headers) + + // If the status code is 200, move the piped temp file to the real name. + if (response.statusCode === 200) { + await fs.move(tmpFilename, filename, { overwrite: true }) + // Else if 304, remove the empty response. + } else { + await fs.remove(tmpFilename) + } + + // Create the file node. + const fileNode = await createFileNode(filename, {}) + + // Override the default plugin as gatsby-source-filesystem needs to + // be the owner of File nodes or there'll be conflicts if any other + // File nodes are created through normal usages of + // gatsby-source-filesystem. + createNode(fileNode, { name: `gatsby-source-filesystem` }) + + return fileNode +} + + /** * Index of promises resolving to File node from remote url */ const processingCache = {} +/** + * pushTask + * -- + * pushes a task in to the Queue and the processing cache + * + * Promisfy a task in queue + * @param {CRFNPayload} task + * @return {Promise} + */ +const pushTask = (task) => new Promise((resolve, reject) => { + queue + .push(task) + .on(`finish`, resolve) + .on(`failed`, reject) +}) + + +/*************** + * Entry Point * + ***************/ + +/** + * createRemoteFileNode + * -- + * + * Download a remote file + * First checks cache to ensure duplicate requests aren't processed + * Then pushes to a queue + * + * @param {CRFNPayload} options + * @return {Promise} Returns the created node + */ module.exports = ({ url, store, cache, createNode, auth = {} }) => { // Check if we already requested node for this remote file // and return stored promise if we did. @@ -19,96 +274,12 @@ module.exports = ({ url, store, cache, createNode, auth = {} }) => { return processingCache[url] } - return (processingCache[url] = new Promise(async (resolve, reject) => { - if (!url || isWebUri(url) === undefined) { - resolve() - return - } - - // Ensure our cache directory exists. - await fs.ensureDir( - path.join( - store.getState().program.directory, - `.cache`, - `gatsby-source-filesystem` - ) - ) - // See if there's response headers for this url - // from a previous request. - const cachedHeaders = await cache.get(cacheId(url)) - const headers = {} - - // Add htaccess authentication if passed in. This isn't particularly - // extensible. We should define a proper API that we validate. - if (auth && auth.htaccess_pass && auth.htaccess_user) { - headers.auth = `${auth.htaccess_user}:${auth.htaccess_pass}` - } - - if (cachedHeaders && cachedHeaders.etag) { - headers[`If-None-Match`] = cachedHeaders.etag - } - - // Create the temp and permanent file names for the url. - const digest = crypto - .createHash(`md5`) - .update(url) - .digest(`hex`) - const tmpFilename = path.join( - store.getState().program.directory, - `.cache`, - `gatsby-source-filesystem`, - `tmp-` + digest + path.parse(url).ext - ) - const filename = path.join( - store.getState().program.directory, - `.cache`, - `gatsby-source-filesystem`, - digest + path.parse(url).ext - ) + if (!url || isWebUri(url) === undefined) { + // should we resolve here, or reject? + // Technically, it's invalid input + return Promise.resolve() + } - // Fetch the file. - let statusCode - let responseHeaders - let responseError = false - const responseStream = got.stream(url, headers) - responseStream.pipe(fs.createWriteStream(tmpFilename)) - responseStream.on(`downloadProgress`, pro => console.log(pro)) - - // If there's a 400/500 response or other error. - responseStream.on(`error`, (error, body, response) => { - responseError = true - fs.removeSync(tmpFilename) - reject(error, body, response) - }) - - // If the status code is 200, move the piped temp file to the real name. - // Else if 304, remove the empty response. - responseStream.on(`response`, response => { - statusCode = response.statusCode - responseHeaders = response.headers - }) - - responseStream.on(`end`, response => { - if (responseError) return - - // Save the response headers for future requests. - cache.set(cacheId(url), responseHeaders) - if (statusCode === 200) { - fs.moveSync(tmpFilename, filename, { overwrite: true }) - } else { - fs.removeSync(tmpFilename) - } - - // Create the file node and return. - createFileNode(filename, {}).then(fileNode => { - // Override the default plugin as gatsby-source-filesystem needs to - // be the owner of File nodes or there'll be conflicts if any other - // File nodes are created through normal usages of - // gatsby-source-filesystem. - createNode(fileNode, { name: `gatsby-source-filesystem` }) - resolve(fileNode) - }) - }) - })) + return (processingCache[url] = pushTask({ url, store, cache, createNode, auth })) } diff --git a/packages/gatsby-source-filesystem/src/get-max-file-lock.js b/packages/gatsby-source-filesystem/src/get-max-file-lock.js new file mode 100644 index 0000000000000..e7ab59fe1705b --- /dev/null +++ b/packages/gatsby-source-filesystem/src/get-max-file-lock.js @@ -0,0 +1,20 @@ +const { spawn } = require(`child_process`) + +module.exports = function () { + return new Promise((resolve, reject) => { + const cp = spawn(`ulimit`, [`-n`]) + + let output = `` + cp.stdout.setEncoding(`utf8`) + + cp.stdout.on(`data`, data => output += data) + cp.stdout.on(`exit`, code => { + // If the ulimit command fails, or doesn't exist in the environment, + // fall back to a safe number + // This will most likely happen on Windows + if (code != 0) { resolve(256) } + + resolve(parseInt(output, 10)) + }) + }) +} From b8e5fe6aed7503e8d273843a8839881efedb4d4a Mon Sep 17 00:00:00 2001 From: TJ Simons Date: Fri, 23 Mar 2018 14:47:16 -0500 Subject: [PATCH 04/10] Rollback changes to wp source files --- packages/gatsby-source-wordpress/README.md | 3 -- .../src/gatsby-node.js | 2 -- .../gatsby-source-wordpress/src/normalize.js | 32 +------------------ 3 files changed, 1 insertion(+), 36 deletions(-) diff --git a/packages/gatsby-source-wordpress/README.md b/packages/gatsby-source-wordpress/README.md index ccb7683321568..fb74b438842a6 100644 --- a/packages/gatsby-source-wordpress/README.md +++ b/packages/gatsby-source-wordpress/README.md @@ -88,9 +88,6 @@ plugins: [ sourceUrl: "https://source-url.com", replacementUrl: "https://replacement-url.com", }, - // How many media objects should be downloaded in parallel. Higher = faster - // OSX defaults to 256 max open connections, you cannot surpass that unless you increase the restriction - chunkSize: 100, }, }, ]; diff --git a/packages/gatsby-source-wordpress/src/gatsby-node.js b/packages/gatsby-source-wordpress/src/gatsby-node.js index 24a6667051f16..37ad22f50370f 100644 --- a/packages/gatsby-source-wordpress/src/gatsby-node.js +++ b/packages/gatsby-source-wordpress/src/gatsby-node.js @@ -28,7 +28,6 @@ exports.sourceNodes = async ( verboseOutput, perPage = 100, searchAndReplaceContentUrls = {}, - chunkSize = 100, } ) => { const { createNode } = boundActionCreators @@ -93,7 +92,6 @@ exports.sourceNodes = async ( cache, createNode, _auth, - chunkSize, }) // Search and replace Content Urls diff --git a/packages/gatsby-source-wordpress/src/normalize.js b/packages/gatsby-source-wordpress/src/normalize.js index b041311f409da..59cdd1bc32b04 100644 --- a/packages/gatsby-source-wordpress/src/normalize.js +++ b/packages/gatsby-source-wordpress/src/normalize.js @@ -379,7 +379,7 @@ exports.mapEntitiesToMedia = entities => { } // Downloads media files and removes "sizes" data as useless in Gatsby context. -const downloadMediaFilesChunk = async ({ +exports.downloadMediaFiles = async ({ entities, store, cache, @@ -412,36 +412,6 @@ const downloadMediaFilesChunk = async ({ }) ) -// chunk the entities from the wordpress /media/ endpoint and process them as noted above -exports.downloadMediaFiles = async ({ - entities, - store, - cache, - createNode, - _auth, - chunkSize, -}) => { - const chunks = _.chunk(entities, chunkSize) - let processed = [] - - while (chunks.length) { - const chunk = chunks.shift() - - const entitiesChunk = await downloadMediaFilesChunk({ - entities: chunk, - store, - cache, - createNode, - _auth, - chunkSize, - }) - - processed = processed.concat(entitiesChunk) - } - - return processed -} - const prepareACFChildNodes = ( obj, entityId, From fd277de32f99b8af2946f68514f09edd799b8718 Mon Sep 17 00:00:00 2001 From: TJ Simons Date: Fri, 23 Mar 2018 15:09:46 -0500 Subject: [PATCH 05/10] Add queue for requesting wp objects update readme with new config option --- packages/gatsby-source-wordpress/README.md | 4 ++ .../src/__tests__/request-in-queue.js | 49 ++++++++++++++++ packages/gatsby-source-wordpress/src/fetch.js | 29 +++++----- .../src/gatsby-node.js | 2 + .../src/request-in-queue.js | 58 +++++++++++++++++++ 5 files changed, 129 insertions(+), 13 deletions(-) create mode 100644 packages/gatsby-source-wordpress/src/__tests__/request-in-queue.js create mode 100644 packages/gatsby-source-wordpress/src/request-in-queue.js diff --git a/packages/gatsby-source-wordpress/README.md b/packages/gatsby-source-wordpress/README.md index fb74b438842a6..a873c1d1fc53e 100644 --- a/packages/gatsby-source-wordpress/README.md +++ b/packages/gatsby-source-wordpress/README.md @@ -88,6 +88,10 @@ plugins: [ sourceUrl: "https://source-url.com", replacementUrl: "https://replacement-url.com", }, + // How many requests should be sent out simultaneously. + // Lower this number if your blog is on a server with + // limitted resources. + concurrentRequests: 50 }, }, ]; diff --git a/packages/gatsby-source-wordpress/src/__tests__/request-in-queue.js b/packages/gatsby-source-wordpress/src/__tests__/request-in-queue.js new file mode 100644 index 0000000000000..4c1ffd9a8f9ce --- /dev/null +++ b/packages/gatsby-source-wordpress/src/__tests__/request-in-queue.js @@ -0,0 +1,49 @@ +jest.mock(`axios`) + +const requestInQueue = require(`../request-in-queue`) +const axios = require(`axios`) + +axios.mockImplementation(opts => { + if (opts.throw) { throw new Error(opts.throw) } + + return opts.url.slice(opts.url.lastIndexOf(`/`) + 1) +}) + +describe(`requestInQueue`, () => { + let requests + + beforeEach(() => { + requests = [ + { method: `get`, url: `https://gatsbyjs.org/1` }, + { method: `get`, url: `https://gatsbyjs.org/2` }, + { method: `get`, url: `https://gatsbyjs.org/3` }, + { method: `get`, url: `https://gatsbyjs.org/4` }, + ] + }) + + afterEach(() => { + axios.mockClear() + }) + + it(`runs all requests in queue`, async () => { + await requestInQueue(requests) + + requests.forEach((req) => { + expect(axios).toHaveBeenCalledWith(req) + }) + }) + + it(`returns the values in the same order they were requested`, async () => { + const responses = await requestInQueue(requests) + expect(responses).toEqual([`1`, `2`, `3`, `4`]) + }) + + it(`stops any requests when one throws an error`, async () => { + try { + await requestInQueue([{ throw: `error` }, ...requests]) + } catch (err) { + expect(err).toBeDefined() + } + expect(axios).toHaveBeenCalledTimes(1) + }) +}) diff --git a/packages/gatsby-source-wordpress/src/fetch.js b/packages/gatsby-source-wordpress/src/fetch.js index 39ee4a9181d7b..6724019b38518 100644 --- a/packages/gatsby-source-wordpress/src/fetch.js +++ b/packages/gatsby-source-wordpress/src/fetch.js @@ -3,6 +3,7 @@ const axios = require(`axios`) const _ = require(`lodash`) const colorized = require(`./output-color`) const httpExceptionHandler = require(`./http-exception-handler`) +const requestInQueue = require(`./request-in-queue`) /** * High-level function to coordinate fetching data from a WordPress @@ -18,6 +19,7 @@ async function fetch({ baseUrl, typePrefix, refactoredEntityTypes, + concurrentRequests, }) { // If the site is hosted on wordpress.com, the API Route differs. // Same entity types are exposed (excepted for medias and users which need auth) @@ -127,6 +129,7 @@ async function fetch({ _hostingWPCOM, _auth, _accessToken, + concurrentRequests, }) ) if (_verbose) console.log(``) @@ -185,6 +188,7 @@ async function fetchData({ _hostingWPCOM, _auth, _accessToken, + concurrentRequests, }) { const type = route.type const url = route.url @@ -200,7 +204,7 @@ async function fetchData({ if (_verbose) console.time(`Fetching the ${type} took`) let routeResponse = await getPages( - { url, _perPage, _hostingWPCOM, _auth, _accessToken }, + { url, _perPage, _hostingWPCOM, _auth, _accessToken, getPages }, 1 ) @@ -263,7 +267,7 @@ async function fetchData({ * @returns */ async function getPages( - { url, _perPage, _hostingWPCOM, _auth, _accessToken, _verbose }, + { url, _perPage, _hostingWPCOM, _auth, _accessToken, _verbose, concurrentRequests }, page = 1 ) { try { @@ -313,18 +317,18 @@ async function getPages( } // We got page 1, now we want pages 2 through totalPages - const requests = _.range(2, totalPages + 1).map(getPage => { - const options = getOptions(getPage) - return axios(options) - }) + const pageOptions = _.range(2, totalPages + 1).map(getPage => getOptions(getPage)) - return Promise.all(requests).then(pages => { - const data = pages.map(page => page.data) - data.forEach(list => { - result = result.concat(list) - }) - return result + // using batchSize instead of concurrent for less overhead from better-queue + // the lib doesn't utilize cluster/child_process, so there isn't real concurrency + const pages = await requestInQueue(pageOptions, { batchSize: concurrentRequests }) + + const pageData = pages.map(page => page.data) + pageData.forEach(list => { + result = result.concat(list) }) + + return result } catch (e) { return httpExceptionHandler(e) } @@ -349,7 +353,6 @@ function getValidRoutes({ refactoredEntityTypes, }) { let validRoutes = [] - for (let key of Object.keys(allRoutes.data.routes)) { if (_verbose) console.log(`Route discovered :`, key) let route = allRoutes.data.routes[key] diff --git a/packages/gatsby-source-wordpress/src/gatsby-node.js b/packages/gatsby-source-wordpress/src/gatsby-node.js index 37ad22f50370f..1707c004bd963 100644 --- a/packages/gatsby-source-wordpress/src/gatsby-node.js +++ b/packages/gatsby-source-wordpress/src/gatsby-node.js @@ -28,6 +28,7 @@ exports.sourceNodes = async ( verboseOutput, perPage = 100, searchAndReplaceContentUrls = {}, + concurrentRequests = 10, } ) => { const { createNode } = boundActionCreators @@ -48,6 +49,7 @@ exports.sourceNodes = async ( _perPage, typePrefix, refactoredEntityTypes, + concurrentRequests, }) // Normalize data & create nodes diff --git a/packages/gatsby-source-wordpress/src/request-in-queue.js b/packages/gatsby-source-wordpress/src/request-in-queue.js new file mode 100644 index 0000000000000..79a5c70ab330a --- /dev/null +++ b/packages/gatsby-source-wordpress/src/request-in-queue.js @@ -0,0 +1,58 @@ +const Queue = require(`better-queue`) +const Promise = require(`bluebird`) +const request = require(`axios`) + +const _defaults = { + id: `url`, +} + +/** + * [handleQueue description] + * @param {[type]} task [description] + * @param {Function} cb [description] + * @return {[type]} [description] + */ +async function handleQueue(task, cb) { + try { + const response = await request(task) + cb(null, response) + } catch (err) { + cb(err) + } +} + +/** + * @typedef {Options} + * @type {Object} + * @see For a detailed descriptions of the tasks, + * see {@link https://www.npmjs.com/package/better-queue#full-documentation|better-queue on Github} + */ + +/** + * Run a series of requests tasks in a queue for better flow control + * + * @param {Object[]} tasks An array of Axios formatted request objects + * @param {Options} opts Options that will be given to better-queue + * @return {Promise} Resolves with the accumulated values from the tasks + */ +module.exports = function requestInQueue (tasks, opts) { + return new Promise((res, rej) => { + const q = new Queue(handleQueue, { ..._defaults, opts }) + + const taskMap = new Map(tasks.map((t) => { + q.push(t) + return [t.url, null] + })) + + q.on(`task_failed`, (id, err) => { + rej(`${id} failed with err: ${err}`) + q.destroy() + }) + + q.on(`task_finish`, (id, response) => { + taskMap.set(id, response) + }) + + q.on(`drain`, () => res(Array.from(taskMap.values()))) + }) +} From 3a3b569d4ff22d2a6223a56ed7adc6d26ff47b25 Mon Sep 17 00:00:00 2001 From: TJ Simons Date: Tue, 27 Mar 2018 16:42:08 -0500 Subject: [PATCH 06/10] Revert files to master --- packages/gatsby-source-wordpress/README.md | 4 -- .../src/__tests__/request-in-queue.js | 49 ---------------- packages/gatsby-source-wordpress/src/fetch.js | 29 +++++----- .../src/gatsby-node.js | 2 - .../src/request-in-queue.js | 58 ------------------- 5 files changed, 13 insertions(+), 129 deletions(-) delete mode 100644 packages/gatsby-source-wordpress/src/__tests__/request-in-queue.js delete mode 100644 packages/gatsby-source-wordpress/src/request-in-queue.js diff --git a/packages/gatsby-source-wordpress/README.md b/packages/gatsby-source-wordpress/README.md index a873c1d1fc53e..fb74b438842a6 100644 --- a/packages/gatsby-source-wordpress/README.md +++ b/packages/gatsby-source-wordpress/README.md @@ -88,10 +88,6 @@ plugins: [ sourceUrl: "https://source-url.com", replacementUrl: "https://replacement-url.com", }, - // How many requests should be sent out simultaneously. - // Lower this number if your blog is on a server with - // limitted resources. - concurrentRequests: 50 }, }, ]; diff --git a/packages/gatsby-source-wordpress/src/__tests__/request-in-queue.js b/packages/gatsby-source-wordpress/src/__tests__/request-in-queue.js deleted file mode 100644 index 4c1ffd9a8f9ce..0000000000000 --- a/packages/gatsby-source-wordpress/src/__tests__/request-in-queue.js +++ /dev/null @@ -1,49 +0,0 @@ -jest.mock(`axios`) - -const requestInQueue = require(`../request-in-queue`) -const axios = require(`axios`) - -axios.mockImplementation(opts => { - if (opts.throw) { throw new Error(opts.throw) } - - return opts.url.slice(opts.url.lastIndexOf(`/`) + 1) -}) - -describe(`requestInQueue`, () => { - let requests - - beforeEach(() => { - requests = [ - { method: `get`, url: `https://gatsbyjs.org/1` }, - { method: `get`, url: `https://gatsbyjs.org/2` }, - { method: `get`, url: `https://gatsbyjs.org/3` }, - { method: `get`, url: `https://gatsbyjs.org/4` }, - ] - }) - - afterEach(() => { - axios.mockClear() - }) - - it(`runs all requests in queue`, async () => { - await requestInQueue(requests) - - requests.forEach((req) => { - expect(axios).toHaveBeenCalledWith(req) - }) - }) - - it(`returns the values in the same order they were requested`, async () => { - const responses = await requestInQueue(requests) - expect(responses).toEqual([`1`, `2`, `3`, `4`]) - }) - - it(`stops any requests when one throws an error`, async () => { - try { - await requestInQueue([{ throw: `error` }, ...requests]) - } catch (err) { - expect(err).toBeDefined() - } - expect(axios).toHaveBeenCalledTimes(1) - }) -}) diff --git a/packages/gatsby-source-wordpress/src/fetch.js b/packages/gatsby-source-wordpress/src/fetch.js index 6724019b38518..39ee4a9181d7b 100644 --- a/packages/gatsby-source-wordpress/src/fetch.js +++ b/packages/gatsby-source-wordpress/src/fetch.js @@ -3,7 +3,6 @@ const axios = require(`axios`) const _ = require(`lodash`) const colorized = require(`./output-color`) const httpExceptionHandler = require(`./http-exception-handler`) -const requestInQueue = require(`./request-in-queue`) /** * High-level function to coordinate fetching data from a WordPress @@ -19,7 +18,6 @@ async function fetch({ baseUrl, typePrefix, refactoredEntityTypes, - concurrentRequests, }) { // If the site is hosted on wordpress.com, the API Route differs. // Same entity types are exposed (excepted for medias and users which need auth) @@ -129,7 +127,6 @@ async function fetch({ _hostingWPCOM, _auth, _accessToken, - concurrentRequests, }) ) if (_verbose) console.log(``) @@ -188,7 +185,6 @@ async function fetchData({ _hostingWPCOM, _auth, _accessToken, - concurrentRequests, }) { const type = route.type const url = route.url @@ -204,7 +200,7 @@ async function fetchData({ if (_verbose) console.time(`Fetching the ${type} took`) let routeResponse = await getPages( - { url, _perPage, _hostingWPCOM, _auth, _accessToken, getPages }, + { url, _perPage, _hostingWPCOM, _auth, _accessToken }, 1 ) @@ -267,7 +263,7 @@ async function fetchData({ * @returns */ async function getPages( - { url, _perPage, _hostingWPCOM, _auth, _accessToken, _verbose, concurrentRequests }, + { url, _perPage, _hostingWPCOM, _auth, _accessToken, _verbose }, page = 1 ) { try { @@ -317,18 +313,18 @@ async function getPages( } // We got page 1, now we want pages 2 through totalPages - const pageOptions = _.range(2, totalPages + 1).map(getPage => getOptions(getPage)) - - // using batchSize instead of concurrent for less overhead from better-queue - // the lib doesn't utilize cluster/child_process, so there isn't real concurrency - const pages = await requestInQueue(pageOptions, { batchSize: concurrentRequests }) - - const pageData = pages.map(page => page.data) - pageData.forEach(list => { - result = result.concat(list) + const requests = _.range(2, totalPages + 1).map(getPage => { + const options = getOptions(getPage) + return axios(options) }) - return result + return Promise.all(requests).then(pages => { + const data = pages.map(page => page.data) + data.forEach(list => { + result = result.concat(list) + }) + return result + }) } catch (e) { return httpExceptionHandler(e) } @@ -353,6 +349,7 @@ function getValidRoutes({ refactoredEntityTypes, }) { let validRoutes = [] + for (let key of Object.keys(allRoutes.data.routes)) { if (_verbose) console.log(`Route discovered :`, key) let route = allRoutes.data.routes[key] diff --git a/packages/gatsby-source-wordpress/src/gatsby-node.js b/packages/gatsby-source-wordpress/src/gatsby-node.js index 1707c004bd963..37ad22f50370f 100644 --- a/packages/gatsby-source-wordpress/src/gatsby-node.js +++ b/packages/gatsby-source-wordpress/src/gatsby-node.js @@ -28,7 +28,6 @@ exports.sourceNodes = async ( verboseOutput, perPage = 100, searchAndReplaceContentUrls = {}, - concurrentRequests = 10, } ) => { const { createNode } = boundActionCreators @@ -49,7 +48,6 @@ exports.sourceNodes = async ( _perPage, typePrefix, refactoredEntityTypes, - concurrentRequests, }) // Normalize data & create nodes diff --git a/packages/gatsby-source-wordpress/src/request-in-queue.js b/packages/gatsby-source-wordpress/src/request-in-queue.js deleted file mode 100644 index 79a5c70ab330a..0000000000000 --- a/packages/gatsby-source-wordpress/src/request-in-queue.js +++ /dev/null @@ -1,58 +0,0 @@ -const Queue = require(`better-queue`) -const Promise = require(`bluebird`) -const request = require(`axios`) - -const _defaults = { - id: `url`, -} - -/** - * [handleQueue description] - * @param {[type]} task [description] - * @param {Function} cb [description] - * @return {[type]} [description] - */ -async function handleQueue(task, cb) { - try { - const response = await request(task) - cb(null, response) - } catch (err) { - cb(err) - } -} - -/** - * @typedef {Options} - * @type {Object} - * @see For a detailed descriptions of the tasks, - * see {@link https://www.npmjs.com/package/better-queue#full-documentation|better-queue on Github} - */ - -/** - * Run a series of requests tasks in a queue for better flow control - * - * @param {Object[]} tasks An array of Axios formatted request objects - * @param {Options} opts Options that will be given to better-queue - * @return {Promise} Resolves with the accumulated values from the tasks - */ -module.exports = function requestInQueue (tasks, opts) { - return new Promise((res, rej) => { - const q = new Queue(handleQueue, { ..._defaults, opts }) - - const taskMap = new Map(tasks.map((t) => { - q.push(t) - return [t.url, null] - })) - - q.on(`task_failed`, (id, err) => { - rej(`${id} failed with err: ${err}`) - q.destroy() - }) - - q.on(`task_finish`, (id, response) => { - taskMap.set(id, response) - }) - - q.on(`drain`, () => res(Array.from(taskMap.values()))) - }) -} From b163b8963f22cef853fa26515322843faa22587f Mon Sep 17 00:00:00 2001 From: TJ Simons Date: Tue, 27 Mar 2018 18:03:59 -0500 Subject: [PATCH 07/10] No longer throw an exception when an error occurs. Just resolve with null and move on --- .../src/create-remote-file-node.js | 83 ++++++++++--------- .../gatsby-source-wordpress/src/normalize.js | 1 + 2 files changed, 46 insertions(+), 38 deletions(-) diff --git a/packages/gatsby-source-filesystem/src/create-remote-file-node.js b/packages/gatsby-source-filesystem/src/create-remote-file-node.js index d8fc32b7cd3d5..fbc87fc301545 100644 --- a/packages/gatsby-source-filesystem/src/create-remote-file-node.js +++ b/packages/gatsby-source-filesystem/src/create-remote-file-node.js @@ -93,14 +93,14 @@ const createFilePath = (directory, filename, ext) => path.join( const queue = new Queue(pushToQueue, { id: `url`, merge: (old, _, cb) => cb(old), - batchSize: 200, + concurrent: 200, }) // Detetmine the max file descriptors on the users machine // Then set the batch size to be 3/4 of that becuase the user // will most likely have files open already getMaxFileLock().then((max) => { - queue.batchSize = max * .75 + queue.concurrent = max * .75 }) /** @@ -120,8 +120,12 @@ getMaxFileLock().then((max) => { * @return {Promise} */ async function pushToQueue (task, cb) { - const node = await processRemoteNode(task) - return cb(null, node) + try { + const node = await processRemoteNode(task) + return cb(null, node) + } catch (e) { + return cb(null, e) + } } @@ -141,21 +145,17 @@ async function pushToQueue (task, cb) { * @return {Promise} Resolves with the [http Result Object]{@link https://nodejs.org/api/http.html#http_class_http_serverresponse} */ const requestRemoteNode = (url, headers, tmpFilename, filename) => new Promise((resolve, reject) => { - let responseError = false - const responseStream = got.stream(url, headers) + const responseStream = got.stream(url, { ...headers, timeout: 30000 }) responseStream.pipe(fs.createWriteStream(tmpFilename)) responseStream.on(`downloadProgress`, pro => console.log(pro)) // If there's a 400/500 response or other error. responseStream.on(`error`, (error, body, response) => { - responseError = true fs.removeSync(tmpFilename) reject({ error, body, response }) }) - responseStream.on(`end`, response => { - if (responseError) return - + responseStream.on(`response`, response => { resolve(response) }) }) @@ -175,8 +175,8 @@ async function processRemoteNode ({ url, store, cache, createNode, auth = {} }) await fs.ensureDir( path.join( programDir, - `.cache`, - `gatsby-source-filesystem` + CACHE_DIR, + FS_PLUGIN_DIR ) ) @@ -203,30 +203,33 @@ async function processRemoteNode ({ url, store, cache, createNode, auth = {} }) const filename = createFilePath(programDir, digest, ext) // Fetch the file. - // Let the consumer handle thrown errors - const response = await requestRemoteNode(url, headers, tmpFilename, filename) - - // Save the response headers for future requests. - cache.set(cacheId(url), response.headers) - - // If the status code is 200, move the piped temp file to the real name. - if (response.statusCode === 200) { - await fs.move(tmpFilename, filename, { overwrite: true }) - // Else if 304, remove the empty response. - } else { - await fs.remove(tmpFilename) + try { + const response = await requestRemoteNode(url, headers, tmpFilename, filename) + // Save the response headers for future requests. + cache.set(cacheId(url), response.headers) + + // If the status code is 200, move the piped temp file to the real name. + if (response.statusCode === 200) { + await fs.move(tmpFilename, filename, { overwrite: true }) + // Else if 304, remove the empty response. + } else { + await fs.remove(tmpFilename) + } + + // Create the file node. + const fileNode = await createFileNode(filename, {}) + + // Override the default plugin as gatsby-source-filesystem needs to + // be the owner of File nodes or there'll be conflicts if any other + // File nodes are created through normal usages of + // gatsby-source-filesystem. + createNode(fileNode, { name: `gatsby-source-filesystem` }) + + return fileNode + } catch (err) { + // ignore } - - // Create the file node. - const fileNode = await createFileNode(filename, {}) - - // Override the default plugin as gatsby-source-filesystem needs to - // be the owner of File nodes or there'll be conflicts if any other - // File nodes are created through normal usages of - // gatsby-source-filesystem. - createNode(fileNode, { name: `gatsby-source-filesystem` }) - - return fileNode + return null } @@ -234,7 +237,6 @@ async function processRemoteNode ({ url, store, cache, createNode, auth = {} }) * Index of promises resolving to File node from remote url */ const processingCache = {} - /** * pushTask * -- @@ -247,8 +249,12 @@ const processingCache = {} const pushTask = (task) => new Promise((resolve, reject) => { queue .push(task) - .on(`finish`, resolve) - .on(`failed`, reject) + .on(`finish`, (task) => { + resolve(task) + }) + .on(`failed`, () => { + resolve() + }) }) @@ -275,6 +281,7 @@ module.exports = ({ url, store, cache, createNode, auth = {} }) => { } + if (!url || isWebUri(url) === undefined) { // should we resolve here, or reject? // Technically, it's invalid input diff --git a/packages/gatsby-source-wordpress/src/normalize.js b/packages/gatsby-source-wordpress/src/normalize.js index 59cdd1bc32b04..11f52a6dd323a 100644 --- a/packages/gatsby-source-wordpress/src/normalize.js +++ b/packages/gatsby-source-wordpress/src/normalize.js @@ -400,6 +400,7 @@ exports.downloadMediaFiles = async ({ }) } catch (e) { // Ignore + console.log(e) } } From 559fc0fc27cbd3168608d05bac52cabb2d46734a Mon Sep 17 00:00:00 2001 From: TJ Simons Date: Tue, 27 Mar 2018 18:29:31 -0500 Subject: [PATCH 08/10] Remove file lock lookup for now. 200 concurrent requests is a safe number and we can look to change this in the future --- .../src/create-remote-file-node.js | 7 ------- .../src/get-max-file-lock.js | 20 ------------------- 2 files changed, 27 deletions(-) delete mode 100644 packages/gatsby-source-filesystem/src/get-max-file-lock.js diff --git a/packages/gatsby-source-filesystem/src/create-remote-file-node.js b/packages/gatsby-source-filesystem/src/create-remote-file-node.js index fbc87fc301545..08417cdf84fc2 100644 --- a/packages/gatsby-source-filesystem/src/create-remote-file-node.js +++ b/packages/gatsby-source-filesystem/src/create-remote-file-node.js @@ -6,7 +6,6 @@ const { isWebUri } = require(`valid-url`) const Queue = require(`better-queue`) const { createFileNode } = require(`./create-file-node`) -const getMaxFileLock = require(`./get-max-file-lock`) const cacheId = url => `create-remote-file-node-${url}` /******************** @@ -96,12 +95,6 @@ const queue = new Queue(pushToQueue, { concurrent: 200, }) -// Detetmine the max file descriptors on the users machine -// Then set the batch size to be 3/4 of that becuase the user -// will most likely have files open already -getMaxFileLock().then((max) => { - queue.concurrent = max * .75 -}) /** * @callback {Queue~queueCallback} diff --git a/packages/gatsby-source-filesystem/src/get-max-file-lock.js b/packages/gatsby-source-filesystem/src/get-max-file-lock.js deleted file mode 100644 index e7ab59fe1705b..0000000000000 --- a/packages/gatsby-source-filesystem/src/get-max-file-lock.js +++ /dev/null @@ -1,20 +0,0 @@ -const { spawn } = require(`child_process`) - -module.exports = function () { - return new Promise((resolve, reject) => { - const cp = spawn(`ulimit`, [`-n`]) - - let output = `` - cp.stdout.setEncoding(`utf8`) - - cp.stdout.on(`data`, data => output += data) - cp.stdout.on(`exit`, code => { - // If the ulimit command fails, or doesn't exist in the environment, - // fall back to a safe number - // This will most likely happen on Windows - if (code != 0) { resolve(256) } - - resolve(parseInt(output, 10)) - }) - }) -} From be11702c5bf97172bf0f99e1727abdc342936e15 Mon Sep 17 00:00:00 2001 From: TJ Simons Date: Mon, 2 Apr 2018 15:20:04 -0500 Subject: [PATCH 09/10] Cosmoetic updates --- .../src/create-remote-file-node.js | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/packages/gatsby-source-filesystem/src/create-remote-file-node.js b/packages/gatsby-source-filesystem/src/create-remote-file-node.js index 08417cdf84fc2..f5af5da0b1526 100644 --- a/packages/gatsby-source-filesystem/src/create-remote-file-node.js +++ b/packages/gatsby-source-filesystem/src/create-remote-file-node.js @@ -30,7 +30,7 @@ const cacheId = url => `create-remote-file-node-${url}` */ /** - * @typedef {CRFNPayload} + * @typedef {CreateRemoteFileNodePayload} * @typedef {Object} * @description Create Remote File Node Payload * @@ -58,7 +58,6 @@ const createHash = (str) => crypto .update(str) .digest(`hex`) - const CACHE_DIR = `.cache` const FS_PLUGIN_DIR = `gatsby-source-filesystem` @@ -78,7 +77,6 @@ const createFilePath = (directory, filename, ext) => path.join( `${filename}${ext}` ) - /******************** * Queue Management * ********************/ @@ -95,7 +93,6 @@ const queue = new Queue(pushToQueue, { concurrent: 200, }) - /** * @callback {Queue~queueCallback} * @param {*} error @@ -108,7 +105,7 @@ const queue = new Queue(pushToQueue, { * Handle tasks that are pushed in to the Queue * * - * @param {CRFNPayload} task + * @param {CreateRemoteFileNodePayload} task * @param {Queue~queueCallback} cb * @return {Promise} */ @@ -121,7 +118,6 @@ async function pushToQueue (task, cb) { } } - /****************** * Core Functions * ******************/ @@ -153,13 +149,12 @@ const requestRemoteNode = (url, headers, tmpFilename, filename) => new Promise(( }) }) - /** * processRemoteNode * -- * Request the remote file and return the fileNode * - * @param {CRFNPayload} options + * @param {CreateRemoteFileNodePayload} options * @return {Promise} Resolves with the fileNode */ async function processRemoteNode ({ url, store, cache, createNode, auth = {} }) { @@ -225,7 +220,6 @@ async function processRemoteNode ({ url, store, cache, createNode, auth = {} }) return null } - /** * Index of promises resolving to File node from remote url */ @@ -236,7 +230,7 @@ const processingCache = {} * pushes a task in to the Queue and the processing cache * * Promisfy a task in queue - * @param {CRFNPayload} task + * @param {CreateRemoteFileNodePayload} task * @return {Promise} */ const pushTask = (task) => new Promise((resolve, reject) => { @@ -250,7 +244,6 @@ const pushTask = (task) => new Promise((resolve, reject) => { }) }) - /*************** * Entry Point * ***************/ @@ -263,7 +256,7 @@ const pushTask = (task) => new Promise((resolve, reject) => { * First checks cache to ensure duplicate requests aren't processed * Then pushes to a queue * - * @param {CRFNPayload} options + * @param {CreateRemoteFileNodePayload} options * @return {Promise} Returns the created node */ module.exports = ({ url, store, cache, createNode, auth = {} }) => { @@ -274,7 +267,6 @@ module.exports = ({ url, store, cache, createNode, auth = {} }) => { } - if (!url || isWebUri(url) === undefined) { // should we resolve here, or reject? // Technically, it's invalid input From cc7fa550bb48afd87325e70e35a160cd61b92244 Mon Sep 17 00:00:00 2001 From: TJ Simons Date: Mon, 2 Apr 2018 15:21:25 -0500 Subject: [PATCH 10/10] Remove console.log --- packages/gatsby-source-wordpress/src/normalize.js | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/gatsby-source-wordpress/src/normalize.js b/packages/gatsby-source-wordpress/src/normalize.js index 371ea7d01b0e5..35f42a453c60c 100644 --- a/packages/gatsby-source-wordpress/src/normalize.js +++ b/packages/gatsby-source-wordpress/src/normalize.js @@ -409,7 +409,6 @@ exports.downloadMediaFiles = async ({ }) } catch (e) { // Ignore - console.log(e) } }