Remove cache feature and use URL normalizer to detect duplicate URLs
mike442144 committed Jul 29, 2015
1 parent c90c812 commit 7602576
Showing 5 changed files with 38 additions and 48 deletions.
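The heart of the change: duplicate detection no longer relies on a per-URI entry in `self.cache`, which only matched byte-identical URIs. Instead the crawler keeps a `seenreq` instance and asks it whether a request has already been seen, so URLs that normalize to the same request count as duplicates. Below is a minimal sketch of that check, mirroring the calls the diff introduces (`new seenreq()` and a synchronous `seen.exists()` on the request options); the example URLs are hypothetical and the exact normalization rules depend on the seenreq version in use (0.0.5 here).

```js
// Sketch only: mirrors the calls this commit adds to lib/crawler.js.
var seenreq = require('seenreq');
var seen = new seenreq();

// Hypothetical URLs that differ only in ways a normalizer typically ignores.
var first  = { uri: 'http://example.com/page?a=1&b=2' };
var second = { uri: 'http://example.com/page?b=2&a=1' };

console.log(seen.exists(first));  // false on first sight (seenreq records the normalized request)
console.log(seen.exists(second)); // true if seenreq normalizes query order, so the crawler skips it
```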
2 changes: 2 additions & 0 deletions README.md
@@ -202,6 +202,8 @@ After [installing Docker](http://docs.docker.com/), you can run:
Rough todolist
--------------

* Using bottleneck to deal with rate limits
* Introducing zombie to deal with pages with complex AJAX
* Refactoring the code to be more maintainable, it's spaghetti code in there!
* Have a look at the Cache feature and refactor it
* Same for the Pool
62 changes: 18 additions & 44 deletions lib/crawler.js
@@ -8,10 +8,10 @@ var request = require('request');
var _ = require('lodash');
var jschardet = require('jschardet');
var cheerio = require('cheerio');
var zlib = require('zlib');
var fs = require('fs');
var Pool = require('generic-pool').Pool;
var contentType = require('content-type');
var seenreq = require('seenreq');
var logger=null;
// Fallback on iconv-lite if we didn't succeed compiling iconv
// https://github.com/sylvinus/node-crawler/pull/29
@@ -27,8 +27,7 @@ if (!iconv) {
function useCache (options) {
return (
options.uri &&
(options.cache || options.skipDuplicates) &&
(options.method === 'GET' || options.method === 'HEAD'));
(options.cache || options.skipDuplicates));
}

function checkJQueryNaming (options) {
@@ -109,6 +108,7 @@ Crawler.prototype.init = function init (options) {
self.queueItemSize = 0;

self.cache = {};
self.seen = new seenreq();
logger = self.options.logger || console;

self.on('pool:release', function(options) {
@@ -205,7 +205,7 @@ Crawler.prototype.queue = function queue (options) {
_.isString(options[0]) ? { uri: options[0] } : options[0]
);
// Did you get multiple requests? Queue the URLs.
} else if (!_.isNull(options) && !_.isUndefined(options)) {
} else if (!_.isNull(options) && !_.isUndefined(options) && options.length>0) {
self.queue(
_.isString(options[0]) ? { uri: options[0] } : options[0]
);
@@ -229,7 +229,7 @@ Crawler.prototype._pushToQueue = function _pushToQueue (options) {
});

// If duplicate skipping is enabled, avoid queueing entirely for URLs we already crawled
if (options.skipDuplicates && self.cache[options.uri]) {
if (options.skipDuplicates && self.seen.exists(options)) {
return self.emit('pool:release', options);
}

@@ -244,10 +244,10 @@ Crawler.prototype._pushToQueue = function _pushToQueue (options) {
options.callback(error);
return;
}

//Static HTML was given, skip request
if (options.html) {
self._onContent(null, options, {body:options.html}, false);
self._onContent(null, options, {body:options.html});
} else if (typeof options.uri === 'function') {
options.uri( function(uri) {
options.uri = uri;
@@ -264,38 +264,18 @@ Crawler.prototype._makeCrawlerRequest = function _makeCrawlerRequest (options) {

if (typeof options.rateLimits === 'number' && options.rateLimits !== 0) {
setTimeout(function() {
self._executeCrawlerRequest(options);
self._buildHttpRequest(options);//self._executeCrawlerRequest(options);
}, options.rateLimits);
} else {
self._executeCrawlerRequest(options);
}
};

Crawler.prototype._executeCrawlerRequest = function _executeCrawlerRequest (options) {
var self = this;
var cacheData = self.cache[options.uri];

//If a query has already been made to self URL, don't callback again
if (useCache(options) && cacheData) {

// Make sure we actually have cached data, and not just a note
// that the page was already crawled
if (_.isArray(cacheData)) {
self._onContent(null, options, cacheData[0], true);
} else {
self.emit('pool:release', options);
}

} else {
self._buildHttpRequest(options);
self._buildHttpRequest(options);//self._executeCrawlerRequest(options);
}
};

Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
var self = this;

if (options.debug) {
logger.info(options.method+' '+options.uri+' ...');
logger.info(options.method+' '+options.uri+"?"+require("querystring").stringify(options.qs));
}

// Cloning keeps the opts parameter clean:
@@ -311,6 +291,7 @@ Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
if (!ropts.headers['Accept-Charset'] && !ropts.headers['accept-charset']) {
ropts.headers['Accept-Charset'] = 'utf-8;q=0.7,*;q=0.3';
}

if (!ropts.encoding) {
ropts.encoding=null;
}
@@ -352,7 +333,7 @@ Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
}

response.uri = response.request.href;
self._onContent(error,options,response,false);
self._onContent(error,options,response);
});
};

@@ -390,7 +371,10 @@ Crawler.prototype._onContent = function _onContent (error, options, response, fr
logger.info('Got '+(options.uri||'html')+' ('+response.body.length+' bytes)...');
}

if (options.forceUTF8) {
if(options.encoding === null){
response.body = response.body
}
else if (options.forceUTF8) {
//TODO check http header or meta equiv?
var iconvObj;

@@ -463,24 +447,14 @@ Crawler.prototype._onContent = function _onContent (error, options, response, fr
response.body = response.body.toString();
}

if (useCache(options) && !fromCache) {
if (options.cache) {
self.cache[options.uri] = [response];

//If we don't cache but still want to skip duplicates we have to maintain a list of fetched URLs.
} else if (options.skipDuplicates) {
self.cache[options.uri] = true;
}
}

if (!options.callback) {
return self.emit('pool:release', options);
}

response.options = options;

// This could definitely be improved by *also* matching content-type headers
var isHTML = response.body.match(/^\s*</);
var isHTML = _.isString(response.body) && response.body.match(/^\s*</);

if (isHTML && options.jQuery && options.method !== 'HEAD') {
self._inject(response, options, function(errors, $) {
@@ -500,4 +474,4 @@ Crawler.prototype._onInject = function _onInject (errors, options, response, $) {
};

module.exports = Crawler;
module.exports.VERSION = '0.4.9';
module.exports.VERSION = '0.4.14';
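For users of the crawler, the visible effect is that `skipDuplicates` now works without the cache machinery: already-seen URLs are dropped in `_pushToQueue` before any request is built. A minimal usage sketch under that assumption; the target URLs are hypothetical, and whether the second, trailing-slash variant is actually skipped depends on seenreq's normalization rules.

```js
var Crawler = require('node-webcrawler');

var c = new Crawler({
    maxConnections: 10,
    skipDuplicates: true,           // duplicates are now detected via seenreq, not a cache map
    callback: function(error, result) {
        if (error) {
            return console.error(error);
        }
        console.log(result.uri, result.body.length);
    }
});

// Hypothetical URLs: the second entry is queued but should be skipped
// if seenreq normalizes it to the same request as the first.
c.queue(['http://example.com/page', 'http://example.com/page/']);
```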
5 changes: 3 additions & 2 deletions package.json
@@ -1,6 +1,6 @@
{
"name": "node-webcrawler",
"version": "0.4.9",
"version": "0.4.14",
"description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously",
"main": "./lib/crawler.js",
"directories": {
@@ -21,7 +21,8 @@
"iconv-lite": "0.4.8",
"jschardet": "1.1.1",
"lodash": "3.8.0",
"request": "2.55.0"
"request": "2.55.0",
"seenreq":"0.0.5"
},
"optionalDependencies": {
"iconv": "*"
13 changes: 13 additions & 0 deletions tests/encoding.test.js
@@ -22,6 +22,19 @@ describe('Encoding', function() {
}
}]);
});
it('should return buffer if encoding = null', function(done) {
this.timeout(5000);
c.queue([{
uri: 'http://czyborra.com/charsets/iso8859.html',
encoding:null,
callback: function(error, result) //noinspection BadExpressionStatementJS,BadExpressionStatementJS
{
expect(error).to.be.null;
expect(result.body instanceof Buffer).to.be.true;
done();
}
}]);
});
});
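The new test pins down the other behavioral change in `_onContent`: when `encoding` is explicitly set to `null`, the response body is passed through as a raw Buffer rather than decoded into a string. A sketch of how that might be used for binary downloads; the URL and output file name are hypothetical. Note that with a Buffer body the new `_.isString(response.body)` guard also keeps cheerio from being injected.

```js
var fs = require('fs');
var Crawler = require('node-webcrawler');

var c = new Crawler({
    encoding: null,                 // leave result.body as a raw Buffer (no charset decoding)
    jQuery: false,                  // no point parsing binary content as HTML
    callback: function(error, result) {
        if (error) {
            return console.error(error);
        }
        // Hypothetical binary download: write the raw bytes straight to disk.
        fs.writeFile('logo.png', result.body, function(err) {
            if (err) { console.error(err); }
        });
    }
});

c.queue(['http://example.com/logo.png']);
```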


4 changes: 2 additions & 2 deletions tests/uriOption.test.js
@@ -60,6 +60,6 @@ describe('Uri Options', function() {
}
});
spy = sinon.spy(c, '_pushToQueue');
c.queue([undefined, 'http://'+httpbinHost]);
c.queue([undefined,null,[], 'http://'+httpbinHost]);
});
});
});
