Remove cache feature and use URL normalizer to detect duplicate URLs
mike442144 committed Jul 29, 2015
1 parent c90c812 commit 7602576
Showing 5 changed files with 38 additions and 48 deletions.
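The heart of the change: duplicate detection no longer relies on a per-URI entry in `self.cache`, which only matched byte-identical URIs. Instead the crawler keeps a `seenreq` instance and asks it whether a request has already been seen, so URLs that normalize to the same request count as duplicates. Below is a minimal sketch of that check, mirroring the calls the diff introduces (`new seenreq()` and a synchronous `seen.exists()` on the request options); the example URLs are hypothetical and the exact normalization rules depend on the seenreq version in use (0.0.5 here).

```js
// Sketch only: mirrors the calls this commit adds to lib/crawler.js.
var seenreq = require('seenreq');
var seen = new seenreq();

// Hypothetical URLs that differ only in ways a normalizer typically ignores.
var first  = { uri: 'http://example.com/page?a=1&b=2' };
var second = { uri: 'http://example.com/page?b=2&a=1' };

console.log(seen.exists(first));  // false on first sight (seenreq records the normalized request)
console.log(seen.exists(second)); // true if seenreq normalizes query order, so the crawler skips it
```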
2 changes: 2 additions & 0 deletions README.md
@@ -202,6 +202,8 @@ After [installing Docker](http://docs.docker.com/), you can run:
Rough todolist
--------------

* Using bottleneck to deal with rate limits
* Introducing zombie to deal with pages with complex AJAX
* Refactoring the code to be more maintainable, it's spaghetti code in there!
* Have a look at the Cache feature and refactor it
* Same for the Pool
62 changes: 18 additions & 44 deletions lib/crawler.js
@@ -8,10 +8,10 @@ var request = require('request');
var _ = require('lodash');
var jschardet = require('jschardet');
var cheerio = require('cheerio');
var zlib = require('zlib');
var fs = require('fs');
var Pool = require('generic-pool').Pool;
var contentType = require('content-type');
var seenreq = require('seenreq');
var logger=null;
// Fallback on iconv-lite if we didn't succeed compiling iconv
// https://github.com/sylvinus/node-crawler/pull/29
@@ -27,8 +27,7 @@ if (!iconv) {
function useCache (options) {
return (
options.uri &&
(options.cache || options.skipDuplicates) &&
(options.method === 'GET' || options.method === 'HEAD'));
(options.cache || options.skipDuplicates));
}

function checkJQueryNaming (options) {
@@ -109,6 +108,7 @@ Crawler.prototype.init = function init (options) {
self.queueItemSize = 0;

self.cache = {};
self.seen = new seenreq();
logger = self.options.logger || console;

self.on('pool:release', function(options) {
@@ -205,7 +205,7 @@ Crawler.prototype.queue = function queue (options) {
_.isString(options[0]) ? { uri: options[0] } : options[0]
);
// Did you get multiple requests? Queue the URLs.
} else if (!_.isNull(options) && !_.isUndefined(options)) {
} else if (!_.isNull(options) && !_.isUndefined(options) && options.length>0) {
self.queue(
_.isString(options[0]) ? { uri: options[0] } : options[0]
);
@@ -229,7 +229,7 @@ Crawler.prototype._pushToQueue = function _pushToQueue (options) {
});

// If duplicate skipping is enabled, avoid queueing entirely for URLs we already crawled
if (options.skipDuplicates && self.cache[options.uri]) {
if (options.skipDuplicates && self.seen.exists(options)) {
return self.emit('pool:release', options);
}

@@ -244,10 +244,10 @@ Crawler.prototype._pushToQueue = function _pushToQueue (options) {
options.callback(error);
return;
}

//Static HTML was given, skip request
if (options.html) {
self._onContent(null, options, {body:options.html}, false);
self._onContent(null, options, {body:options.html});
} else if (typeof options.uri === 'function') {
options.uri( function(uri) {
options.uri = uri;
@@ -264,38 +264,18 @@ Crawler.prototype._makeCrawlerRequest = function _makeCrawlerRequest (options) {

if (typeof options.rateLimits === 'number' && options.rateLimits !== 0) {
setTimeout(function() {
self._executeCrawlerRequest(options);
self._buildHttpRequest(options);//self._executeCrawlerRequest(options);
}, options.rateLimits);
} else {
self._executeCrawlerRequest(options);
}
};

Crawler.prototype._executeCrawlerRequest = function _executeCrawlerRequest (options) {
var self = this;
var cacheData = self.cache[options.uri];

//If a query has already been made to self URL, don't callback again
if (useCache(options) && cacheData) {

// Make sure we actually have cached data, and not just a note
// that the page was already crawled
if (_.isArray(cacheData)) {
self._onContent(null, options, cacheData[0], true);
} else {
self.emit('pool:release', options);
}

} else {
self._buildHttpRequest(options);
self._buildHttpRequest(options);//self._executeCrawlerRequest(options);
}
};

Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
var self = this;

if (options.debug) {
logger.info(options.method+' '+options.uri+' ...');
logger.info(options.method+' '+options.uri+"?"+require("querystring").stringify(options.qs));
}

// Cloning keeps the opts parameter clean:
@@ -311,6 +291,7 @@ Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
if (!ropts.headers['Accept-Charset'] && !ropts.headers['accept-charset']) {
ropts.headers['Accept-Charset'] = 'utf-8;q=0.7,*;q=0.3';
}

if (!ropts.encoding) {
ropts.encoding=null;
}
@@ -352,7 +333,7 @@ Crawler.prototype._buildHttpRequest = function _buildHTTPRequest (options) {
}

response.uri = response.request.href;
self._onContent(error,options,response,false);
self._onContent(error,options,response);
});
};

@@ -390,7 +371,10 @@ Crawler.prototype._onContent = function _onContent (error, options, response, fr
logger.info('Got '+(options.uri||'html')+' ('+response.body.length+' bytes)...');
}

if (options.forceUTF8) {
if(options.encoding === null){
response.body = response.body
}
else if (options.forceUTF8) {
//TODO check http header or meta equiv?
var iconvObj;

@@ -463,24 +447,14 @@ Crawler.prototype._onContent = function _onContent (error, options, response, fr
response.body = response.body.toString();
}

if (useCache(options) && !fromCache) {
if (options.cache) {
self.cache[options.uri] = [response];

//If we don't cache but still want to skip duplicates we have to maintain a list of fetched URLs.
} else if (options.skipDuplicates) {
self.cache[options.uri] = true;
}
}

if (!options.callback) {
return self.emit('pool:release', options);
}

response.options = options;

// This could definitely be improved by *also* matching content-type headers
var isHTML = response.body.match(/^\s*</);
var isHTML = _.isString(response.body) && response.body.match(/^\s*</);

if (isHTML && options.jQuery && options.method !== 'HEAD') {
self._inject(response, options, function(errors, $) {
@@ -500,4 +474,4 @@ Crawler.prototype._onInject = function _onInject (errors, options, response, $) {
};

module.exports = Crawler;
module.exports.VERSION = '0.4.9';
module.exports.VERSION = '0.4.14';
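For users of the crawler, the visible effect is that `skipDuplicates` now works without the cache machinery: already-seen URLs are dropped in `_pushToQueue` before any request is built. A minimal usage sketch under that assumption; the target URLs are hypothetical, and whether the second, trailing-slash variant is actually skipped depends on seenreq's normalization rules.

```js
var Crawler = require('node-webcrawler');

var c = new Crawler({
    maxConnections: 10,
    skipDuplicates: true,           // duplicates are now detected via seenreq, not a cache map
    callback: function(error, result) {
        if (error) {
            return console.error(error);
        }
        console.log(result.uri, result.body.length);
    }
});

// Hypothetical URLs: the second entry is queued but should be skipped
// if seenreq normalizes it to the same request as the first.
c.queue(['http://example.com/page', 'http://example.com/page/']);
```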
5 changes: 3 additions & 2 deletions package.json
@@ -1,6 +1,6 @@
{
"name": "node-webcrawler",
"version": "0.4.9",
"version": "0.4.14",
"description": "Crawler is a web spider written with Nodejs. It gives you the full power of jQuery on the server to parse a big number of pages as they are downloaded, asynchronously",
"main": "./lib/crawler.js",
"directories": {
@@ -21,7 +21,8 @@
"iconv-lite": "0.4.8",
"jschardet": "1.1.1",
"lodash": "3.8.0",
"request": "2.55.0"
"request": "2.55.0",
"seenreq":"0.0.5"
},
"optionalDependencies": {
"iconv": "*"
13 changes: 13 additions & 0 deletions tests/encoding.test.js
@@ -22,6 +22,19 @@ describe('Encoding', function() {
}
}]);
});
it('should return buffer if encoding = null', function(done) {
this.timeout(5000);
c.queue([{
uri: 'http://czyborra.com/charsets/iso8859.html',
encoding:null,
callback: function(error, result) //noinspection BadExpressionStatementJS,BadExpressionStatementJS
{
expect(error).to.be.null;
expect(result.body instanceof Buffer).to.be.true;
done();
}
}]);
});
});
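The new test pins down the other behavioral change in `_onContent`: when `encoding` is explicitly set to `null`, the response body is passed through as a raw Buffer rather than decoded into a string. A sketch of how that might be used for binary downloads; the URL and output file name are hypothetical. Note that with a Buffer body the new `_.isString(response.body)` guard also keeps cheerio from being injected.

```js
var fs = require('fs');
var Crawler = require('node-webcrawler');

var c = new Crawler({
    encoding: null,                 // leave result.body as a raw Buffer (no charset decoding)
    jQuery: false,                  // no point parsing binary content as HTML
    callback: function(error, result) {
        if (error) {
            return console.error(error);
        }
        // Hypothetical binary download: write the raw bytes straight to disk.
        fs.writeFile('logo.png', result.body, function(err) {
            if (err) { console.error(err); }
        });
    }
});

c.queue(['http://example.com/logo.png']);
```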


4 changes: 2 additions & 2 deletions tests/uriOption.test.js
@@ -60,6 +60,6 @@ describe('Uri Options', function() {
}
});
spy = sinon.spy(c, '_pushToQueue');
c.queue([undefined, 'http://'+httpbinHost]);
c.queue([undefined,null,[], 'http://'+httpbinHost]);
});
});
});
