-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Michael Klein
committed
Sep 8, 2013
1 parent
22de642
commit 3ff9463
Showing
12 changed files
with
440 additions
and
344 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
var Tools = require('../lib/tools'); | ||
var Db = require('./db'); | ||
var config = require('../config.js'); | ||
var $ = require("jquery"); | ||
|
||
/**
 * Analyse scans blocks of text and records word statistics through the Db
 * layer: every distinct word is queued as a page to scrape, related to every
 * other word of the same block, and added to the global word counter.
 *
 * The Db handle lives on the instance (this.db) instead of being written to an
 * undeclared, process-wide variable.
 */
var Analyse = function() {
    this.db = new Db();
};

/**
 * Tokenizes a text block and stores its word statistics in one batched write.
 *
 * @param {String} snippet  text block; split into words by Tools.tokenize()
 * @param {*}      counter  label of the snippet, only used in the debug output
 */
Analyse.prototype.scanTextBlock = function(snippet, counter) {
    var db = this.db;
    var debug = config.creds.debug;

    if (debug) {
        console.log('scanTextBlock');
    }

    // split the text block to words
    var words = Tools.tokenize(snippet);

    if (debug) {
        console.log('Count of words in snippet (' + counter + '): ' + words.length);
    }

    // collect all following writes and send them with a single flush()
    db.enableMulti();

    // Prototype-less dictionary: a word such as "constructor" must not collide
    // with a property inherited from Object.prototype (that used to yield NaN).
    var counts = Object.create(null);

    for (var i = 0; i < words.length; i++) {
        // work with lowercase only
        var word = words[i].toLowerCase();

        if (typeof counts[word] === 'undefined') {
            counts[word] = 1;
            // Queue every word to spread the scrape. Once per distinct word is
            // enough to call addPageToQueue().
            db.addPageToQueue(word);
        } else {
            counts[word]++;
        }
    }

    var distinct = Object.keys(counts);

    distinct.forEach(function(owner) {
        // relate the word to every other word of this block; the weight is how
        // often the other word was seen in the block
        distinct.forEach(function(other) {
            if (other !== owner) {
                db.addRelation(owner, other, counts[other]);
            }
        });

        // add to the general 'ALL' collection, to identify the most used words of all
        db.addToGlobalCounter(counts[owner], owner);
    });

    // flush changes to database
    db.flush(function(err) {
        if (err) {
            // do not swallow a failed write silently
            console.log('scanTextBlock - flushing to the database failed: ' + err);
        }
    });
};
|
||
module.exports = Analyse; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
var Tools = require('../lib/tools'); | ||
var config = require('../config.js'); | ||
var redis = require("redis"); | ||
|
||
/* Used variables names | ||
* ____sites____ = wikipedia page titles which were scraped already | ||
* ____sites2do____ = wikipedia page titles which are queued to scrape | ||
* ____all____ = a collection of every word ever seen, each with an incrementing counter | ||
*/ | ||
|
||
// Redis keys used by the methods below.
var KEY_SITES_DONE = '____sites____';   // page titles which were scraped already
var KEY_SITES_TODO = '____sites2do____'; // page titles which are queued to scrape
var KEY_ALL_WORDS = '____all____';       // every word ever seen; its counter is kept at '____all____:<word>'

// Returns the object that receives write commands: the open batch while one is
// enabled, the plain client otherwise.
var writerFor = function(db) {
    return db.isMulti ? db.multi : db.client;
};

/**
 * Db wraps a redis client. Client and batch state are kept on the instance, so
 * several Db objects do not overwrite each other's connection or batch.
 */
var Db = function() {
    // create redis client
    this.client = redis.createClient();
    this.multi = null;
    this.isMulti = false;
};

/**
 * Opens a batch (client.multi()). Until flush() or disableMulti() is called,
 * addPageToQueue(), addRelation() and addToGlobalCounter() write into it.
 */
Db.prototype.enableMulti = function() {
    this.multi = this.client.multi();
    this.isMulti = true;
};

/**
 * Drops the current batch without executing it.
 */
Db.prototype.disableMulti = function() {
    this.multi = null;
    this.isMulti = false;
};

/**
 * Adds a page title to the set of pages which are queued to scrape.
 *
 * @param {String} s  page title
 */
Db.prototype.addPageToQueue = function(s) {
    if (config.creds.debug) {
        console.log('addPageToQueue "' + s + '"');
    }
    // the batched and the direct path must write to the same key
    writerFor(this).sadd(KEY_SITES_TODO, s);
};

/**
 * Removes a page title from the queue (always sent directly, never batched).
 *
 * @param {String} s  page title
 */
Db.prototype.removePageFromQueue = function(s) {
    this.client.srem(KEY_SITES_TODO, s);
};

/**
 * Fetches a random page title from the queue.
 *
 * @param {Function} callback  called as callback(result); result is whatever
 *                             redis delivered (undefined when the command failed)
 */
Db.prototype.getRandomItemFromQueue = function(callback) {
    this.client.srandmember(KEY_SITES_TODO, function(err, result) {
        if (err) {
            console.log('Error - could not read from the queue: ' + err);
        }
        return callback(result);
    });
};

/**
 * Marks a page title as scraped.
 *
 * @param {String}   s         page title
 * @param {Function} callback  handed to the redis client unchanged
 */
Db.prototype.addPageAsDone = function(s, callback) {
    this.client.sadd(KEY_SITES_DONE, s, callback);
};

/**
 * Removes the "scraped" mark of a page title.
 *
 * @param {String}   s         page title
 * @param {Function} callback  handed to the redis client unchanged
 */
Db.prototype.removePageAsDone = function(s, callback) {
    this.client.srem(KEY_SITES_DONE, s, callback);
};

/**
 * Stores that `relation` belongs to `owner` and raises the weight of the pair.
 *
 * @param {String} owner     key of the set which collects the relations
 * @param {String} relation  related word
 * @param {Number} [i=1]     amount added to the counter '<owner>:<relation>'
 */
Db.prototype.addRelation = function(owner, relation, i) {
    var amount = (typeof i === 'undefined') ? 1 : i;
    var writer = writerFor(this);

    writer.sadd(owner, relation);
    writer.incrby(owner + ':' + relation, amount);
};

/**
 * Adds a word to the general 'ALL' collection, to identify the most used words
 * of all. Note the parameter order: the amount comes first.
 *
 * @param {Number} owner  amount added to the counter of the word
 * @param {String} s      the word
 */
Db.prototype.addToGlobalCounter = function(owner, s) {
    var writer = writerFor(this);

    writer.sadd(KEY_ALL_WORDS, s);                 // add keyword
    writer.incrby(KEY_ALL_WORDS + ':' + s, owner); // track its density
};

/**
 * Delivers the most often related keywords of `owner`, without the keywords
 * which are among the most used ones overall.
 *
 * @param {String}   owner     keyword to look up
 * @param {Function} callback  called as callback(items, res)
 * @param {*}        res       passed through to the callback untouched
 */
Db.prototype.getTopRelations = function(owner, callback, res) {
    var client = this.client;

    // most used keywords of all
    client.sort(KEY_ALL_WORDS, 'by', KEY_ALL_WORDS + ':*', 'LIMIT', 0, 500, 'DESC', 'get', '#', function(err1, items1) {
        if (err1) {
            console.log('Error - could not read the most used keywords: ' + err1);
        }
        // get most often related keywords for the given keyword
        client.sort(owner, 'by', owner + ':*', 'LIMIT', 0, 120, 'DESC', 'get', '#', function(err2, items2) {
            if (err2) {
                console.log('Error - could not read the relations of "' + owner + '": ' + err2);
            }
            // remove the noise by removing the most often used keywords;
            // a failed query counts as an empty list
            callback(Tools.inAButNotInB(items2 || [], items1 || []), res);
        });
    });
};

/**
 * Executes the open batch and leaves batch mode.
 *
 * @param {Function} [callback]  handed to exec() of the batch
 */
Db.prototype.flush = function(callback) {
    if (false === this.isMulti || this.multi === null) {
        console.log('Error - you must use enableMulti() before flush().');
        return;
    }
    var pending = this.multi;

    // leave batch mode first, so the instance is usable again inside the callback
    this.disableMulti();
    pending.exec(callback || null);
};
|
||
module.exports = Db; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
var restify = require('restify'); | ||
var express = require('express'); | ||
var Db = require('./db'); | ||
var config = require('../config'); | ||
|
||
/**
 * Http creates the restify server, registers the routes and starts listening
 * right away. Server and database handle are kept on the instance instead of
 * in undeclared, process-wide variables.
 */
var Http = function() {
    this.server = restify.createServer();
    this.server.use(restify.bodyParser());
    this.db = new Db();

    // initialise routes
    this.initRoutes();
    this.init();
};

/**
 * Registers the routes of the server.
 */
Http.prototype.initRoutes = function() {
    if (config.creds.debug) {
        console.log('initRoutes');
    }
    // Set up our routes. The handler is bound because the server invokes it
    // without this instance as receiver.
    this.server.get('/relations/:name', this.responseRelations.bind(this));
};

/**
 * Starts listening on config.creds.server_port.
 */
Http.prototype.init = function() {
    var server = this.server;

    if (config.creds.debug) {
        console.log('start server');
    }
    // start the server
    server.listen(config.creds.server_port, function() {
        console.log('%s listening at %s', server.name, server.url);
    });
};

/**
 * Route handler: looks up the top relations of the word in req.params.name and
 * sends them as the response.
 *
 * @param {Object}   req   request; req.params.name holds the requested word
 * @param {Object}   res   response, handed on to doResponse()
 * @param {Function} next  called once the response was sent
 */
Http.prototype.responseRelations = function(req, res, next) {
    var self = this;

    if (config.creds.debug) {
        console.log('responseRelations for "' + req.params.name + '"');
    }
    console.log('Will deliver top relations for requested word "' + req.params.name + '".');

    this.db.getTopRelations(req.params.name, function(data, response) {
        self.doResponse(data, response);
        // tell the server that this handler is finished
        if (typeof next === 'function') {
            next();
        }
    }, res);
};

/**
 * Sends the data to the client.
 *
 * @param {*}      data  payload handed to res.send()
 * @param {Object} res   response to answer on
 */
Http.prototype.doResponse = function(data, res) {
    if (config.creds.debug) {
        console.log('doResponse');
    }
    res.send(data);
};
|
||
module.exports = Http; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
/**
 * Model holds a single value. The value is stored on the instance, so two
 * models never share (and overwrite) each other's value.
 *
 * @param {*} [s]  initial value; any falsy argument is stored as null
 */
var Model = function(s) {
    this.val = s || null;
};

/**
 * @returns {*} the value currently held by this model
 */
Model.prototype.getValue = function() {
    return this.val;
};

/**
 * Replaces the value held by this model; the argument is stored as given.
 *
 * @param {*} s  new value
 */
Model.prototype.setValue = function(s) {
    this.val = s;
};
|
||
module.exports = Model; |
Oops, something went wrong.