From d976126a23c13cd15dc5f69a16134823c60d645b Mon Sep 17 00:00:00 2001 From: Tom Arrow Date: Fri, 8 Apr 2016 19:11:05 +0100 Subject: [PATCH] Factor out downloading --- lib/arxiv.js | 113 +++++++----------------------------------------- lib/download.js | 94 ++++++++++++++++++++++++++++++++++++++++ lib/eupmc.js | 69 +++-------------------------- 3 files changed, 116 insertions(+), 160 deletions(-) create mode 100644 lib/download.js diff --git a/lib/arxiv.js b/lib/arxiv.js index 20b273f..0af2499 100644 --- a/lib/arxiv.js +++ b/lib/arxiv.js @@ -5,7 +5,8 @@ var rest = require('restler') , got = require('got') , mkdirp = require('mkdirp') , _ = require('lodash') -, ProgressBar = require('progress'); +, ProgressBar = require('progress') +, urlDl = require('./download.js'); var ArXiv = function(opts) { @@ -76,7 +77,7 @@ ArXiv.prototype.completeCallback = function(data) { log.info('Limiting to ' + arxiv.hitlimit + ' hits'); } else { arxiv.hitlimit = arxiv.hitcount; } - + // create progress bar var progmsg = 'Retrieving results [:bar] :percent' + ' (eta :etas)'; @@ -200,9 +201,10 @@ ArXiv.prototype.getFulltextPDFUrl = function(result) { }); if (pdfurls.length == 0) { + //log.info('pdf missing') return null; } else { - return pdfurls[0]['$'].href; + return [ pdfurls[0]['$'].href, result.id[0].split('abs/')[1] + '/' ]; } } @@ -218,10 +220,16 @@ ArXiv.prototype.getSuppFilesUrl = function(result) { var id = arxiv.getIdentifier(result); - return id.split('abs').join('e-print'); + return [id.split('abs').join('e-print'), id.split('abs/')[1]]; } +ArXiv.prototype.urlQueueBuilder = function(urls, type, rename) { + return urls.map(function urlQueueBuilder(url_id) { + return {url: url_id[0], id: url_id[1], type: type, rename: rename } + }) +}; + ArXiv.prototype.downloadFulltextPDFs = function(arxiv) { urls = arxiv.allresults @@ -230,25 +238,9 @@ ArXiv.prototype.downloadFulltextPDFs = function(arxiv) { log.info('Downloading fulltext PDF files'); - var failed = []; - var retries = 0; - - var done = _.after(urls.length, function() { - if (failed.length > 0 && retries == 0) { - log.warn(failed.length + ' downloads timed out. Retrying.'); - failed = []; - arxiv.downloadUrls(urls, 'PDF', 'fulltext.pdf', - failed, done, arxiv); - } else if (failed.length > 0) { - log.warn(failed.length + ' downloads timed on retry. Skipping.'); - } else { - log.info('All PDF downloads succeeded!'); - } - arxiv.nextDlTask(); - }); + var urlQueue = arxiv.urlQueueBuilder(urls, 'PDF', 'fulltext.pdf'); + urlDl.downloadURLs(urlQueue, arxiv.nextDlTask.bind(arxiv)); - arxiv.downloadUrls(urls, 'PDF', 'fulltext.pdf', - failed, done, arxiv); } ArXiv.prototype.downloadSuppFiles = function(arxiv) { @@ -259,81 +251,8 @@ ArXiv.prototype.downloadSuppFiles = function(arxiv) { log.info('Downloading supplementary files'); - var failed = []; - var retries = 0; - var missing = 0; - - var fourohfour = function() { - missing ++; - } - - var done = _.after(urls.length, function() { - if (failed.length > 0 && retries == 0) { - log.warn(failed.length + ' downloads timed out. Retrying.'); - failed = []; - arxiv.downloadUrls(urls, - 'supplementary files', - 'supplementaryFiles.tar.gz', - failed, done, arxiv, fourohfour); - } else if (failed.length > 0) { - log.warn(failed.length + ' downloads timed on retry. Skipping.'); - } else if (missing > 0) { - var succeeded = urls.length - missing; - var suffix = missing > 1 ? 's' : '' - log.info(succeeded + ' downloads succeeded. ' + missing + - ' paper' + suffix + ' had no supplementary files.'); - } else { - log.info('All supplementary file downloads succeeded!'); - } - arxiv.nextDlTask(); - }); - - arxiv.downloadUrls(urls, - 'supplementary files', - 'supplementaryFiles.tar.gz', - failed, done, arxiv, fourohfour); -} - -ArXiv.prototype.downloadUrls = function(urls, type, rename, failed, - cb, thisArg, fourohfour) { - - var arxiv = thisArg; - - // setup progress bar - var progmsg = 'Downloading files [:bar] :percent' + - ' (:current/:total) [:elapseds elapsed, eta :eta]'; - var progopts = { - total: urls.length, - width: 30, - complete: chalk.green('=') - }; - var dlprogress = new ProgressBar(progmsg, progopts); - - urls.forEach(function(url, i) { - var base = arxiv.getIdentifier(arxiv.allresults[i]).split('abs/')[1] + '/'; - log.debug('Creating directory: ' + base); - mkdirp.sync(base); - log.debug('Downloading ' + type + ': ' + url); - var options = { - timeout: 40000, - encoding: null - } - var get = got(url, options, function(err, data, res) { - dlprogress.tick(); - if (err) { - if (!res) { - failed.push(url); - } else if ((res.statusCode == 404) && !(fourohfour === null)) { - fourohfour(); - } else { - failed.push(url); - } - cb(); - } else { - fs.writeFile(base + rename, data, cb); - } - }); - }); + var urlQueue = arxiv.urlQueueBuilder(urls, 'supplementary files', 'supplementaryFiles.tar.gz'); + urlDl.downloadURLs(urlQueue, arxiv.nextDlTask.bind(arxiv)); } module.exports = ArXiv; diff --git a/lib/download.js b/lib/download.js new file mode 100644 index 0000000..d793736 --- /dev/null +++ b/lib/download.js @@ -0,0 +1,94 @@ +var util = require('util') +, fs = require('fs') +, chalk = require('chalk') +, got = require('got') +, mkdirp = require('mkdirp') +, _ = require('lodash') +, ProgressBar = require('progress'); + +exports.downloadURLs = function(fullurlQueue, nextDlTaskcb) { + var failed = []; + var retries = 0; + var missing = 0; + + urlQueue = fullurlQueue; //urlQueue needs to be global unless + //we put these other functions inside + //this one. + + //Setup ProgressBar + var progmsg = 'Downloading files [:bar] :percent' + + ' (:current/:total) [:elapseds elapsed, eta :eta]'; + var progopts = { + total: fullurlQueue.length, + width: 30, + complete: chalk.green('=') + }; + var dlprogress = new ProgressBar(progmsg, progopts); + + for(i=0; i<10; i++) { + nextUrlTask(urlQueue); //spawn 10 workers + } + +function nextUrlTask() { + if (urlQueue instanceof Array && urlQueue.length > 0) { + var urlObj = urlQueue.splice(0,1)[0]; + downloadURL(urlObj); + } +} + +function downloadURL(urlObj) { + var url = urlObj.url; + var id = urlObj.id; + var type = urlObj.type; + var rename = urlObj.rename; + var base = id + '/'; + log.debug('Creating directory: ' + base); + mkdirp.sync(base); + log.debug('Downloading ' + type + ': ' + url); + var options = { + timeout: 15000, + encoding: null, + retries: 3 + } + + var get = got(url, options, function(err, data, res) { + dlprogress.tick(); + if (err) { + if (err.code === 'ETIMEDOUT' || err.code === 'ESOCKETTIMEDOUT') { + log.warn('Download timed out for URL ' + url); + } + if (!res) { + failed.push(url); + } else if ((res.statusCode == 404) && !(fourohfour === null)) { + fourohfour(); + } else { + failed.push(url); + } + done(); + } else { + fs.writeFile(base + rename, data, done); + } + nextUrlTask(urlQueue); + }); + } + + var donefunc = function() { + if (failed.length > 0) { + log.warn(failed.length + ' downloads timed out on retry.'); + } else if (missing > 0) { + var succeeded = urls.length - missing; + var suffix = missing > 1 ? 's' : '' + log.info(succeeded + ' downloads succeeded. ' + missing + + ' paper' + suffix + ' had URLs that could not be reached (404 error).'); + } else { + log.info('All downloads succeeded!'); + } + nextDlTaskcb(); + } + + var done = _.after(urls.length, donefunc); + + var fourohfour = function() { + missing ++; + } +} diff --git a/lib/eupmc.js b/lib/eupmc.js index dd3b364..3ec79f9 100644 --- a/lib/eupmc.js +++ b/lib/eupmc.js @@ -5,7 +5,8 @@ var rest = require('restler') , got = require('got') , mkdirp = require('mkdirp') , _ = require('lodash') -, ProgressBar = require('progress'); +, ProgressBar = require('progress') +, urlDl = require('./download.js'); var EuPmc = function(opts) { @@ -232,35 +233,8 @@ EuPmc.prototype.downloadFulltextXMLs = function(eupmc) { log.info('Downloading fulltext XML files'); - var failed = []; - var retries = 0; - var missing = 0; - - var fourohfour = function() { - missing ++; - } - - var done = _.after(urls.length, function() { - if (failed.length > 0 && retries == 0) { - log.warn(failed.length + ' downloads timed out. Retrying.'); - failed = []; - eupmc.downloadUrls(urls, 'XML', 'fulltext.html', - failed, done, eupmc, fourohfour); - } else if (failed.length > 0) { - log.warn(failed.length + ' downloads timed out on retry. Skipping.'); - } else if (missing > 0) { - var succeeded = urls.length - missing; - var suffix = missing > 1 ? 's' : '' - log.info(succeeded + ' downloads succeeded. ' + missing + - ' paper' + suffix + ' had XML URLs that could not be reached (404 error).'); - } else { - log.info('All XML downloads succeeded!'); - } - eupmc.nextDlTask(); - }); - - eupmc.downloadUrls(urls, 'XML', 'fulltext.xml', - failed, done, eupmc, fourohfour); + var urlQueue = eupmc.urlQueueBuilder(urls, 'XML', 'fulltext.xml'); + urlDl.downloadURLs(urlQueue, eupmc.nextDlTask.bind(eupmc)); } EuPmc.prototype.downloadFulltextPDFs = function(eupmc) { @@ -271,39 +245,8 @@ EuPmc.prototype.downloadFulltextPDFs = function(eupmc) { log.info('Downloading fulltext PDF files'); - var failed = []; - var retries = 0; - var missing = 0; - - var fourohfour = function() { - missing ++; - } - - var donefunc = function() { - if (failed.length > 0 && retries == 0) { - retries += 1; - log.warn(failed.length + ' downloads timed out. Retrying.'); - failed = []; - var done = _.after(failed.length, donefunc); - eupmc.downloadUrls(urls, 'PDF', 'fulltext.pdf', - failed, done, eupmc, fourohfour); - } else if (failed.length > 0) { - log.warn(failed.length + ' downloads timed out on retry. Skipping.'); - } else if (missing > 0) { - var succeeded = urls.length - missing; - var suffix = missing > 1 ? 's' : '' - log.info(succeeded + ' downloads succeeded. ' + missing + - ' paper' + suffix + ' had PDF URLs that could not be reached (404 error).'); - } else { - log.info('All PDF downloads succeeded!'); - } - eupmc.nextDlTask(); - } - - var done = _.after(urls.length, donefunc); - - eupmc.downloadUrls(urls, 'PDF', 'fulltext.pdf', - failed, done, eupmc, fourohfour); + var urlQueue = eupmc.urlQueueBuilder(urls, 'PDF', 'fulltext.pdf'); + urlDl.downloadURLs(urlQueue, eupmc.nextDlTask.bind(eupmc)); } EuPmc.prototype.downloadSuppFiles = function(eupmc) {