Skip to content

Commit

Permalink
rewriten
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Klein committed Sep 8, 2013
1 parent 22de642 commit 3ff9463
Show file tree
Hide file tree
Showing 12 changed files with 440 additions and 344 deletions.
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,14 @@ The following examples where given after the system was collecting for about one

* install npm and node if you have not already (http://howtonode.org/introduction-to-npm or http://nodejs.org/)
* install / start your redis server (http://redis.io/topics/quickstart) on a disk with several free GB

* clone this repo "git clone https://github.com/monbro/opensemanticapi.git"
* change config if needed in "/config.js"
* NOTE: depending on what you want to run, set 'http_server' to false (scraper / cronjob) or true (HTTP API server)
* open the repository folder in your console
* enter "npm install", it will install all dependencies automatically
* start the node server with "node app.js"
* now it should print what it is collecting
* now it should print what it is collecting or what http route is requested
* the longer it collects data the better the results should be
* now you can access the relations through your browser, e.g. http://localhost:8080/relations/database, or directly by accessing your redis server

Expand Down
73 changes: 22 additions & 51 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,73 +9,44 @@
* @version 0.1
*/

/* Used variables names
* ____sites____ = wikipedia page titles which were scraped already
* ____sites2do____ = wikipedia page titles which are queued to scrape
* ____all____ = a collection of all ever seen words with a increment number
*/

/**
* third-party modules
* load config
*/
var restify = require('restify');
var express = require('express');
var config = require('./config');
var tools = require('./lib/tools');
var redis = require("redis");
// var mongoose = require('mongoose');
var _ = require("underscore");
var $ = require("jquery");

/**
* Basic Objects
*/

// create redis client
var client = redis.createClient();

/**
* our classes
*/
var text = require("./classes/textanalyze")(client);
var wiki = require("./classes/wiki")(client);
var apiprovider = require("./classes/apiprovider")(client);

/**
* Objects
*/

// our basic app object
var App = function() {};

// our basic app
var app = new App();

// create restify server to server http api
var server = restify.createServer();
server.use(restify.bodyParser());
var Model = require('./app/model');

// create restify json client for api requests
var wikipedia = restify.createJsonClient({
url: 'http://'+config.creds.lang+'.wikipedia.org',
version: '*'
});
/**
 * Factory for Model instances.
 *
 * @param {*} s value handed to the Model constructor
 * @returns {Model} a freshly constructed Model
 */
App.prototype.getModel = function(s) {
  var model = new Model(s);
  return model;
};

/**
* Run
*/
// var test = app.getModel('test');
// console.log(test.getValue());

// start api requests with given keyword
// wikiSearch('database'); // database can be replaced with a random name to start with

/**
* Routes
*/
// Scraper / cronjob mode: this branch runs only when 'http_server' is falsy in the config.
if(!config.creds.http_server) {
var Scraper = require("./app/scraping");
var scraper = new Scraper();

// Set up our routes
// NOTE(review): `server` and `apiprovider` come from setup code that this commit appears to remove — confirm this line still belongs here.
server.get('/relations/:name', apiprovider.getRelations);
// Start Cronjob
// 'database' is the keyword the scrape starts with.
scraper.wikiSearch('database');
}

/**
* Server
*/

// start the server
server.listen(config.creds.server_port, function() {
console.log('%s listening at %s', server.name, server.url);
});
// Start the RESTful HTTP API server; this branch runs only when 'http_server' is truthy in the config.
if(config.creds.http_server) {
var Http = require("./app/http");
// Constructing Http is what sets up the routes and starts the server; the instance itself is not used afterwards here.
var http = new Http();
}
84 changes: 84 additions & 0 deletions app/analyse.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
var Tools = require('../lib/tools');
var Db = require('./db');
var config = require('../config.js');
var $ = require("jquery");

/**
 * Text analyser: counts the words of a text block and stores the resulting
 * word relations and counters through a Db instance.
 *
 * The helpers are kept on the instance (no implicit globals), so the methods
 * must be called on the instance.
 */
var Analyse = function() {
  this.tools = Tools;
  this.db = new Db();
};

/**
 * Scan one text block.
 *
 * Every word is lowercased, queued for scraping and counted. Afterwards each
 * distinct word gets a relation to every other distinct word of the block
 * (weighted by how often that other word occurred) and is added to the global
 * word counter. All writes go through one batch that is flushed at the end.
 *
 * @param {string} snippet text block to analyse
 * @param {*} counter only used in the debug output to identify the snippet
 */
Analyse.prototype.scanTextBlock = function(snippet, counter) {
  var db = this.db;

  if (config.creds.debug) {
    console.log('scanTextBlock');
  }

  // split the text block into words
  var words = this.tools.tokenize(snippet) || [];

  if (config.creds.debug) {
    console.log('Count of words in snippet ('+counter+'): '+words.length);
  }

  // Prototype-less map: words such as "constructor" or "__proto__" must not
  // collide with inherited Object.prototype members (that produced NaN counts).
  var counts = Object.create(null);

  db.enableMulti();

  for (var i = 0; i < words.length; i++) {
    var word = words[i].toLowerCase();

    // count all seen words
    counts[word] = (counts[word] || 0) + 1;

    // add every word to the queue to spread the scrape
    db.addPageToQueue(word);
  }

  var distinct = Object.keys(counts);

  distinct.forEach(function(owner) {
    distinct.forEach(function(other) {
      if (owner !== other) {
        // the weight is how often the other word was seen in this text block
        db.addRelation(owner, other, counts[other]);
      }
    });

    // add to the general 'ALL' collection, to identify the most used words of all
    db.addToGlobalCounter(counts[owner], owner);
  });

  // flush changes to database
  db.flush(function(err) {
    if (err) {
      console.log('Error - could not flush the analysed text block: ' + err);
    }
  });
};

module.exports = Analyse;
110 changes: 110 additions & 0 deletions app/db.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
var Tools = require('../lib/tools');
var config = require('../config.js');
var redis = require("redis");

/* Used variables names
* ____sites____ = wikipedia page titles which were scraped already
* ____sites2do____ = wikipedia page titles which are queued to scrape
* ____all____ = a collection of all words ever seen, each with an incrementing counter
*/

// Redis key names used by Db.
var KEY_DONE = '____sites____';     // page titles which were scraped already
var KEY_QUEUE = '____sites2do____'; // page titles which are queued to scrape
var KEY_ALL = '____all____';        // all words ever seen; counters live under '____all____:<word>'

/**
 * Redis-backed store for the scrape queue, the word relations and the global
 * word counters.
 *
 * The redis client and the batch state are kept on the instance, so several
 * Db objects do not interfere with each other. All methods rely on `this`
 * and must be called on the instance.
 */
var Db = function() {
  // create redis client
  this.client = redis.createClient();
  this.multi = null;
  this.isMulti = false;
};

// Returns the open batch while batching is enabled, otherwise the plain client.
function dbWriter(db) {
  return (db.isMulti && db.multi) ? db.multi : db.client;
}

/**
 * Start batching: write methods queue their commands until flush() is called.
 */
Db.prototype.enableMulti = function() {
  this.multi = this.client.multi();
  this.isMulti = true;
};

/**
 * Stop batching and drop the current batch without executing it.
 */
Db.prototype.disableMulti = function() {
  this.multi = null;
  this.isMulti = false;
};

/**
 * Queue a page title for scraping (batched when batching is enabled).
 *
 * @param {string} s page title
 */
Db.prototype.addPageToQueue = function(s) {
  if (config.creds.debug) {
    console.log('addPageToQueue "'+s+'"');
  }
  // Both paths write to the queue set; the unbatched path used to write to '<'.
  dbWriter(this).sadd(KEY_QUEUE, s);
};

/**
 * Remove a page title from the scrape queue.
 *
 * @param {string} s page title
 */
Db.prototype.removePageFromQueue = function(s) {
  this.client.srem(KEY_QUEUE, s);
};

/**
 * Fetch one random page title from the scrape queue.
 *
 * @param {function(*)} callback receives the redis reply; on a redis error
 *        the error is logged and the (empty) reply is still passed on
 * @returns {*} whatever the callback returns (only meaningful for a
 *          synchronous client)
 */
Db.prototype.getRandomItemFromQueue = function(callback) {
  this.client.srandmember(KEY_QUEUE, function(err, result) {
    if (err) {
      console.log('Error - could not read from the queue: ' + err);
    }
    return callback(result);
  });
};

/**
 * Mark a page title as scraped.
 *
 * @param {string} s page title
 * @param {function} [callback] passed straight to the redis client
 */
Db.prototype.addPageAsDone = function(s, callback) {
  this.client.sadd(KEY_DONE, s, callback);
};

/**
 * Remove a page title from the set of scraped pages.
 *
 * @param {string} s page title
 * @param {function} [callback] passed straight to the redis client
 */
Db.prototype.removePageAsDone = function(s, callback) {
  this.client.srem(KEY_DONE, s, callback);
};

/**
 * Relate two words: adds `relation` to the set named `owner` and increases
 * the counter '<owner>:<relation>' (batched when batching is enabled).
 *
 * @param {string} owner word that owns the relation
 * @param {string} relation related word
 * @param {number} [i=1] amount to add to the relation counter
 */
Db.prototype.addRelation = function(owner, relation, i) {
  var writer = dbWriter(this);
  if (typeof i == 'undefined') {
    i = 1;
  }
  writer.sadd(owner, relation);
  writer.incrby(owner+':'+relation, i);
};

/**
 * Add a word to the general 'ALL' collection and increase its counter, to
 * identify the most used words of all (batched when batching is enabled).
 *
 * Note the argument order: despite its name, `owner` is the amount to add.
 *
 * @param {number} owner amount to add to the word's counter
 * @param {string} s the word
 */
Db.prototype.addToGlobalCounter = function(owner, s) {
  var writer = dbWriter(this);
  writer.sadd(KEY_ALL, s); // add keyword
  // track its density; the unbatched path used to increment '<:<word>' instead
  writer.incrby(KEY_ALL+':'+s, owner);
};

/**
 * Load the words related most often to `owner`, without the words that are
 * most frequent overall.
 *
 * Redis errors are logged and treated as an empty result, so the callback is
 * always invoked.
 *
 * @param {string} owner word to look up
 * @param {function(Array, *)} callback receives the filtered words and `res`
 * @param {*} res passed through to the callback untouched
 */
Db.prototype.getTopRelations = function(owner, callback, res) {
  var client = this.client;
  // the 500 most used keywords of all
  client.sort(KEY_ALL, "by", KEY_ALL+":*", 'LIMIT', 0, 500, 'DESC', "get", "#", function (err1, items1) {
    if (err1) {
      console.log('Error - could not load the most used words: ' + err1);
    }
    // get most often related keywords for the given keyword
    client.sort(owner, "by", owner+":*", 'LIMIT', 0, 120, 'DESC', "get", "#", function (err2, items2) {
      if (err2) {
        console.log('Error - could not load the relations for "'+owner+'": ' + err2);
      }
      // remove the noise by removing the most often used keywords
      callback(Tools.inAButNotInB(items2 || [], items1 || []), res);
    });
  });
};

/**
 * Execute the current batch and leave batch mode.
 *
 * @param {function} [callback] passed to the batch's exec(); not called when
 *        no batch is open
 */
Db.prototype.flush = function(callback) {
  if (false === this.isMulti || this.multi === null) {
    console.log('Error - you must use enableMulti() before flush().');
    return;
  }
  var pending = this.multi;
  // Leave batch mode first so a callback may start a new batch right away.
  this.disableMulti();
  if (typeof callback === 'function') {
    pending.exec(callback);
  } else {
    pending.exec();
  }
};

module.exports = Db;
50 changes: 50 additions & 0 deletions app/http.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
var restify = require('restify');
var express = require('express');
var Db = require('./db');
var config = require('../config');

/**
 * HTTP API server. Constructing it creates the restify server, registers the
 * routes and starts listening on the configured port.
 *
 * The server and the Db are kept on the instance (no implicit globals).
 */
var Http = function() {
  this.server = restify.createServer();
  this.server.use(restify.bodyParser());
  this.db = new Db();

  // initialise routes
  this.initRoutes();
  this.init();
};

/**
 * Register the routes on the server.
 */
Http.prototype.initRoutes = function() {
  if (config.creds.debug) {
    console.log('initRoutes');
  }
  // Set up our routes; bound, because the server calls the handler without `this`
  this.server.get('/relations/:name', this.responseRelations.bind(this));
};

/**
 * Start listening on config.creds.server_port.
 */
Http.prototype.init = function() {
  var server = this.server;
  if (config.creds.debug) {
    console.log('start server');
  }
  // start the server
  server.listen(config.creds.server_port, function() {
    console.log('%s listening at %s', server.name, server.url);
  });
};

/**
 * Route handler for '/relations/:name': looks up the top relations of the
 * requested word and sends them as the response.
 *
 * @param {Object} req request; the word is read from req.params.name
 * @param {Object} res response the result is sent on
 * @param {function} [next] called once the response was sent, so the
 *        server's handler chain can finish
 */
Http.prototype.responseRelations = function(req, res, next) {
  var self = this;
  if (config.creds.debug) {
    console.log('responseRelations for "'+req.params.name+'"');
  }
  console.log('Will deliver top relations for requested word "'+req.params.name+'".');
  self.db.getTopRelations(req.params.name, function(data) {
    self.doResponse(data, res);
    if (typeof next === 'function') {
      next();
    }
  }, res);
};

/**
 * Send the data on the given response.
 *
 * @param {*} data payload to send
 * @param {Object} res response object
 */
Http.prototype.doResponse = function(data, res) {
  if (config.creds.debug) {
    console.log('doResponse');
  }
  res.send(data);
};

module.exports = Http;
13 changes: 13 additions & 0 deletions app/model.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/**
 * Minimal value holder.
 *
 * The value is stored on the instance, so every Model keeps its own value
 * (it used to live in one implicit global shared by all instances).
 *
 * @param {*} [s] initial value; a missing or otherwise falsy value is stored
 *        as null
 */
var Model = function(s) {
  this.val = s || null;
};

/**
 * @returns {*} the value currently held by this instance
 */
Model.prototype.getValue = function() {
  return this.val;
};

/**
 * Replace the held value.
 *
 * @param {*} s new value, stored as given
 */
Model.prototype.setValue = function(s) {
  this.val = s;
};

module.exports = Model;
Loading

0 comments on commit 3ff9463

Please sign in to comment.