Skip to content

Commit

Permalink
Merge pull request #87 from tarrow/updateurldownloader
Browse files Browse the repository at this point in the history
Downloads refactor - use a common downloader across all APIs
  • Loading branch information
Richard Smith-Unna committed May 1, 2016
2 parents 949a6a2 + d976126 commit d6052ba
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 160 deletions.
113 changes: 16 additions & 97 deletions lib/arxiv.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ var rest = require('restler')
, got = require('got')
, mkdirp = require('mkdirp')
, _ = require('lodash')
, ProgressBar = require('progress');
, ProgressBar = require('progress')
, urlDl = require('./download.js');

var ArXiv = function(opts) {

Expand Down Expand Up @@ -76,7 +77,7 @@ ArXiv.prototype.completeCallback = function(data) {
log.info('Limiting to ' + arxiv.hitlimit + ' hits');
}
else { arxiv.hitlimit = arxiv.hitcount; }

// create progress bar
var progmsg = 'Retrieving results [:bar] :percent' +
' (eta :etas)';
Expand Down Expand Up @@ -200,9 +201,10 @@ ArXiv.prototype.getFulltextPDFUrl = function(result) {
});

if (pdfurls.length == 0) {
//log.info('pdf missing')
return null;
} else {
return pdfurls[0]['$'].href;
return [ pdfurls[0]['$'].href, result.id[0].split('abs/')[1] + '/' ];
}

}
Expand All @@ -218,10 +220,16 @@ ArXiv.prototype.getSuppFilesUrl = function(result) {
var id = arxiv.getIdentifier(result);


return id.split('abs').join('e-print');
return [id.split('abs').join('e-print'), id.split('abs/')[1]];

}

// Convert an array of [url, id] pairs into download-task objects for the
// shared downloader (lib/download.js). `type` is a human-readable label
// used in log output; `rename` is the filename each download is saved as
// inside its `id` directory. Returns a new array; `urls` is not modified.
ArXiv.prototype.urlQueueBuilder = function(urls, type, rename) {
  return urls.map(function(pair) {
    return { url: pair[0], id: pair[1], type: type, rename: rename };
  });
};

ArXiv.prototype.downloadFulltextPDFs = function(arxiv) {

urls = arxiv.allresults
Expand All @@ -230,25 +238,9 @@ ArXiv.prototype.downloadFulltextPDFs = function(arxiv) {

log.info('Downloading fulltext PDF files');

var failed = [];
var retries = 0;

var done = _.after(urls.length, function() {
if (failed.length > 0 && retries == 0) {
log.warn(failed.length + ' downloads timed out. Retrying.');
failed = [];
arxiv.downloadUrls(urls, 'PDF', 'fulltext.pdf',
failed, done, arxiv);
} else if (failed.length > 0) {
log.warn(failed.length + ' downloads timed on retry. Skipping.');
} else {
log.info('All PDF downloads succeeded!');
}
arxiv.nextDlTask();
});
var urlQueue = arxiv.urlQueueBuilder(urls, 'PDF', 'fulltext.pdf');
urlDl.downloadURLs(urlQueue, arxiv.nextDlTask.bind(arxiv));

arxiv.downloadUrls(urls, 'PDF', 'fulltext.pdf',
failed, done, arxiv);
}

ArXiv.prototype.downloadSuppFiles = function(arxiv) {
Expand All @@ -259,81 +251,8 @@ ArXiv.prototype.downloadSuppFiles = function(arxiv) {

log.info('Downloading supplementary files');

var failed = [];
var retries = 0;
var missing = 0;

var fourohfour = function() {
missing ++;
}

var done = _.after(urls.length, function() {
if (failed.length > 0 && retries == 0) {
log.warn(failed.length + ' downloads timed out. Retrying.');
failed = [];
arxiv.downloadUrls(urls,
'supplementary files',
'supplementaryFiles.tar.gz',
failed, done, arxiv, fourohfour);
} else if (failed.length > 0) {
log.warn(failed.length + ' downloads timed on retry. Skipping.');
} else if (missing > 0) {
var succeeded = urls.length - missing;
var suffix = missing > 1 ? 's' : ''
log.info(succeeded + ' downloads succeeded. ' + missing +
' paper' + suffix + ' had no supplementary files.');
} else {
log.info('All supplementary file downloads succeeded!');
}
arxiv.nextDlTask();
});

arxiv.downloadUrls(urls,
'supplementary files',
'supplementaryFiles.tar.gz',
failed, done, arxiv, fourohfour);
}

ArXiv.prototype.downloadUrls = function(urls, type, rename, failed,
cb, thisArg, fourohfour) {

var arxiv = thisArg;

// setup progress bar
var progmsg = 'Downloading files [:bar] :percent' +
' (:current/:total) [:elapseds elapsed, eta :eta]';
var progopts = {
total: urls.length,
width: 30,
complete: chalk.green('=')
};
var dlprogress = new ProgressBar(progmsg, progopts);

urls.forEach(function(url, i) {
var base = arxiv.getIdentifier(arxiv.allresults[i]).split('abs/')[1] + '/';
log.debug('Creating directory: ' + base);
mkdirp.sync(base);
log.debug('Downloading ' + type + ': ' + url);
var options = {
timeout: 40000,
encoding: null
}
var get = got(url, options, function(err, data, res) {
dlprogress.tick();
if (err) {
if (!res) {
failed.push(url);
} else if ((res.statusCode == 404) && !(fourohfour === null)) {
fourohfour();
} else {
failed.push(url);
}
cb();
} else {
fs.writeFile(base + rename, data, cb);
}
});
});
var urlQueue = arxiv.urlQueueBuilder(urls, 'supplementary files', 'supplementaryFiles.tar.gz');
urlDl.downloadURLs(urlQueue, arxiv.nextDlTask.bind(arxiv));
}

module.exports = ArXiv;
94 changes: 94 additions & 0 deletions lib/download.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
var util = require('util')
, fs = require('fs')
, chalk = require('chalk')
, got = require('got')
, mkdirp = require('mkdirp')
, _ = require('lodash')
, ProgressBar = require('progress');

exports.downloadURLs = function(fullurlQueue, nextDlTaskcb) {
var failed = [];
var retries = 0;
var missing = 0;

urlQueue = fullurlQueue; //urlQueue needs to be global unless
//we put these other functions inside
//this one.

//Setup ProgressBar
var progmsg = 'Downloading files [:bar] :percent' +
' (:current/:total) [:elapseds elapsed, eta :eta]';
var progopts = {
total: fullurlQueue.length,
width: 30,
complete: chalk.green('=')
};
var dlprogress = new ProgressBar(progmsg, progopts);

for(i=0; i<10; i++) {
nextUrlTask(urlQueue); //spawn 10 workers
}

function nextUrlTask() {
if (urlQueue instanceof Array && urlQueue.length > 0) {
var urlObj = urlQueue.splice(0,1)[0];
downloadURL(urlObj);
}
}

function downloadURL(urlObj) {
var url = urlObj.url;
var id = urlObj.id;
var type = urlObj.type;
var rename = urlObj.rename;
var base = id + '/';
log.debug('Creating directory: ' + base);
mkdirp.sync(base);
log.debug('Downloading ' + type + ': ' + url);
var options = {
timeout: 15000,
encoding: null,
retries: 3
}

var get = got(url, options, function(err, data, res) {
dlprogress.tick();
if (err) {
if (err.code === 'ETIMEDOUT' || err.code === 'ESOCKETTIMEDOUT') {
log.warn('Download timed out for URL ' + url);
}
if (!res) {
failed.push(url);
} else if ((res.statusCode == 404) && !(fourohfour === null)) {
fourohfour();
} else {
failed.push(url);
}
done();
} else {
fs.writeFile(base + rename, data, done);
}
nextUrlTask(urlQueue);
});
}

var donefunc = function() {
if (failed.length > 0) {
log.warn(failed.length + ' downloads timed out on retry.');
} else if (missing > 0) {
var succeeded = urls.length - missing;
var suffix = missing > 1 ? 's' : ''
log.info(succeeded + ' downloads succeeded. ' + missing +
' paper' + suffix + ' had URLs that could not be reached (404 error).');
} else {
log.info('All downloads succeeded!');
}
nextDlTaskcb();
}

var done = _.after(urls.length, donefunc);

var fourohfour = function() {
missing ++;
}
}
69 changes: 6 additions & 63 deletions lib/eupmc.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ var rest = require('restler')
, got = require('got')
, mkdirp = require('mkdirp')
, _ = require('lodash')
, ProgressBar = require('progress');
, ProgressBar = require('progress')
, urlDl = require('./download.js');

var EuPmc = function(opts) {

Expand Down Expand Up @@ -232,35 +233,8 @@ EuPmc.prototype.downloadFulltextXMLs = function(eupmc) {

log.info('Downloading fulltext XML files');

var failed = [];
var retries = 0;
var missing = 0;

var fourohfour = function() {
missing ++;
}

var done = _.after(urls.length, function() {
if (failed.length > 0 && retries == 0) {
log.warn(failed.length + ' downloads timed out. Retrying.');
failed = [];
eupmc.downloadUrls(urls, 'XML', 'fulltext.html',
failed, done, eupmc, fourohfour);
} else if (failed.length > 0) {
log.warn(failed.length + ' downloads timed out on retry. Skipping.');
} else if (missing > 0) {
var succeeded = urls.length - missing;
var suffix = missing > 1 ? 's' : ''
log.info(succeeded + ' downloads succeeded. ' + missing +
' paper' + suffix + ' had XML URLs that could not be reached (404 error).');
} else {
log.info('All XML downloads succeeded!');
}
eupmc.nextDlTask();
});

eupmc.downloadUrls(urls, 'XML', 'fulltext.xml',
failed, done, eupmc, fourohfour);
var urlQueue = eupmc.urlQueueBuilder(urls, 'XML', 'fulltext.xml');
urlDl.downloadURLs(urlQueue, eupmc.nextDlTask.bind(eupmc));
}

EuPmc.prototype.downloadFulltextPDFs = function(eupmc) {
Expand All @@ -271,39 +245,8 @@ EuPmc.prototype.downloadFulltextPDFs = function(eupmc) {

log.info('Downloading fulltext PDF files');

var failed = [];
var retries = 0;
var missing = 0;

var fourohfour = function() {
missing ++;
}

var donefunc = function() {
if (failed.length > 0 && retries == 0) {
retries += 1;
log.warn(failed.length + ' downloads timed out. Retrying.');
failed = [];
var done = _.after(failed.length, donefunc);
eupmc.downloadUrls(urls, 'PDF', 'fulltext.pdf',
failed, done, eupmc, fourohfour);
} else if (failed.length > 0) {
log.warn(failed.length + ' downloads timed out on retry. Skipping.');
} else if (missing > 0) {
var succeeded = urls.length - missing;
var suffix = missing > 1 ? 's' : ''
log.info(succeeded + ' downloads succeeded. ' + missing +
' paper' + suffix + ' had PDF URLs that could not be reached (404 error).');
} else {
log.info('All PDF downloads succeeded!');
}
eupmc.nextDlTask();
}

var done = _.after(urls.length, donefunc);

eupmc.downloadUrls(urls, 'PDF', 'fulltext.pdf',
failed, done, eupmc, fourohfour);
var urlQueue = eupmc.urlQueueBuilder(urls, 'PDF', 'fulltext.pdf');
urlDl.downloadURLs(urlQueue, eupmc.nextDlTask.bind(eupmc));
}

EuPmc.prototype.downloadSuppFiles = function(eupmc) {
Expand Down

0 comments on commit d6052ba

Please sign in to comment.