From 3ff9463486be71ebc8e7fa91008bb164bb59457d Mon Sep 17 00:00:00 2001 From: Michael Klein Date: Sun, 8 Sep 2013 23:32:49 +0100 Subject: [PATCH] rewriten --- README.md | 4 +- app.js | 73 ++++++-------------- app/analyse.js | 84 +++++++++++++++++++++++ app/db.js | 110 ++++++++++++++++++++++++++++++ app/http.js | 50 ++++++++++++++ app/model.js | 13 ++++ app/scraping.js | 149 +++++++++++++++++++++++++++++++++++++++++ classes/apiprovider.js | 88 ------------------------ classes/textanalyze.js | 70 ------------------- classes/wiki.js | 132 ------------------------------------ config.js | 5 +- lib/tools.js | 6 ++ 12 files changed, 440 insertions(+), 344 deletions(-) create mode 100644 app/analyse.js create mode 100644 app/db.js create mode 100644 app/http.js create mode 100644 app/model.js create mode 100644 app/scraping.js delete mode 100644 classes/apiprovider.js delete mode 100644 classes/textanalyze.js delete mode 100644 classes/wiki.js diff --git a/README.md b/README.md index 375a27e..15651bd 100644 --- a/README.md +++ b/README.md @@ -35,12 +35,14 @@ The following examples where given after the system was collecting for about one * install npm and node if you have not already (http://howtonode.org/introduction-to-npm or http://nodejs.org/) * install / start your redis server (http://redis.io/topics/quickstart) on a disk with several free GB + * clone this repo "git clone https://github.com/monbro/opensemanticapi.git" * change config if needed in "/config.js" +* NOTE: depending on what you want (scraper / cronjob to run or http api server, set 'http_server' to true or false) * open the repository folder in your console * enter "npm install", it will install all dependencies automatically * start the node server with "node app.js" -* now it should print what it is collecting +* now it should print what it is collecting or what http route is requested * the longer it collects data the better the results should be * now you can access the relations through your browser like http://localhost:8080/relations/database or direct by accessing your redis server diff --git a/app.js b/app.js index 2aeb5e1..3f23af1 100644 --- a/app.js +++ b/app.js @@ -9,73 +9,44 @@ * @version 0.1 */ -/* Used variables names - * ____sites____ = wikipedia page titles which were scraped already - * ____sites2do____ = wikipedia page titles which are queued to scrape - * ____all____ = a collection of all ever seen words with a increment number - */ - /** - * thrid part modules + * laod config */ -var restify = require('restify'); -var express = require('express'); var config = require('./config'); -var tools = require('./lib/tools'); -var redis = require("redis"); -// var mongoose = require('mongoose'); -var _ = require("underscore"); -var $ = require("jquery"); /** * Basic Objects */ -// create redis client -var client = redis.createClient(); - -/** - * our classes - */ -var text = require("./classes/textanalyze")(client); -var wiki = require("./classes/wiki")(client); -var apiprovider = require("./classes/apiprovider")(client); - -/** - * Objects - */ - +// our basic app object +var App = function() {}; +// our basic app +var app = new App(); -// create restify server to server http api -var server = restify.createServer(); -server.use(restify.bodyParser()); +var Model = require('./app/model'); -// create restify json client for api requests -var wikipedia = restify.createJsonClient({ - url: 'http://'+config.creds.lang+'.wikipedia.org', - version: '*' -}); +App.prototype.getModel = function(s) { + return new Model(s); +}; -/** - * Run - */ +// var test = app.getModel('test'); +// console.log(test.getValue()); -// start api requests with given keyword -// wikiSearch('database'); // database can be replaced with a random name to start with - -/** - * Routes - */ +if(!config.creds.http_server) { + var Scraper = require("./app/scraping"); + var scraper = new Scraper(); -// Set up our routes -server.get('/relations/:name', apiprovider.getRelations); + // Start Cronjob + scraper.wikiSearch('database'); +} /** * Server */ -// start the server -server.listen(config.creds.server_port, function() { - console.log('%s listening at %s', server.name, server.url); -}); \ No newline at end of file +// Start HTTP API RESTFUL Server +if(config.creds.http_server) { + var Http = require("./app/http"); + var http = new Http(); +} diff --git a/app/analyse.js b/app/analyse.js new file mode 100644 index 0000000..e058a52 --- /dev/null +++ b/app/analyse.js @@ -0,0 +1,84 @@ +var Tools = require('../lib/tools'); +var Db = require('./db'); +var config = require('../config.js'); +var $ = require("jquery"); + +var Analyse = function() { + tools = Tools; + // tools = new Tools(); + db = new Db(); +}; + +Analyse.prototype.scanTextBlock = function(snippet,counter) { + if(config.creds.debug) { + console.log('scanTextBlock'); + } + // split the text block to words + var words = tools.tokenize(snippet); + + if(config.creds.debug) { + console.log('Count of words in snippet ('+counter+'): '+words.length); + } + + // create empty object + var obj = {}; + // var multi = client.multi(); + db.enableMulti(); + + // loop all words + for (var i = words.length - 1; i >= 0; i--) { + + // count all seen words + if(typeof obj[words[i].toLowerCase()] == 'undefined') + obj[words[i].toLowerCase()] = 1; + else + obj[words[i].toLowerCase()]++; + + // add every word to the queue to spread the scrape + // multi.sadd('____sites2do____',words[i].toLowerCase()); + db.addPageToQueue(words[i].toLowerCase()); + + // if(config.creds.debug) + // console.log(words[i].toLowerCase()+'¥ - '+words[j].toLowerCase()+' - '+similar_text(words[i].toLowerCase(),words[j].toLowerCase(),1)); + } + + // var base; + + $.each(obj, function(index, value) { + + // skip if not valid + if(typeof index == 'undefined' || typeof index.toLowerCase == 'undefined') + return; + + // create new obj from class Base, make sure to work with lowercase only + // base = new Base(index.toLowerCase()); + + // loop all words + $.each(obj, function(index2, value2) { + if(index != index2) { + // add relation, value2 is the counter of how often the word was seen in the recent textblock + // base.pushRelation(index2.toLowerCase(),value2); + db.addRelation(index.toLowerCase(),index2.toLowerCase(),value2); + } + }); + + // base.save(); + + // add to our general 'ALL' collection, to identify the most used words of all + // multi.sadd('____all____', index.toLowerCase()); // add keyword + // multi.incrby('____all____'+':'+index.toLowerCase(), value); // track its density + db.addToGlobalCounter(value,index.toLowerCase()); + + }); + + // flush changes to database + db.flush(function(err, replies) { + return true; + }); + + // multi.exec(function(err, replies) { + // return true; + // }); +}; + +module.exports = Analyse; \ No newline at end of file diff --git a/app/db.js b/app/db.js new file mode 100644 index 0000000..32aa4f3 --- /dev/null +++ b/app/db.js @@ -0,0 +1,110 @@ +var Tools = require('../lib/tools'); +var config = require('../config.js'); +var redis = require("redis"); + +/* Used variables names + * ____sites____ = wikipedia page titles which were scraped already + * ____sites2do____ = wikipedia page titles which are queued to scrape + * ____all____ = a collection of all ever seen words with a increment number + */ + +var Db = function() { + // create redis client + tools = Tools; + that = this; + client = redis.createClient(); + isMulti = false; +}; + +Db.prototype.enableMulti = function() { + multi = client.multi(); + isMulti = true; +}; + +Db.prototype.disableMulti = function() { + multi = null; + isMulti = false; +}; + +Db.prototype.addPageToQueue = function(s) { + if(config.creds.debug) { + console.log('addPageToQueue "'+s+'"'); + } + if(isMulti) { + multi.sadd('____sites2do____',s); + } + else { + client.sadd('<',s); + } +}; + +Db.prototype.removePageFromQueue = function(s) { + client.srem('____sites2do____',s); +}; + +Db.prototype.getRandomItemFromQueue = function(callback) { + client.srandmember('____sites2do____', function (err, result) { + return callback(result); + }); +}; + +Db.prototype.addPageAsDone = function(s, callback) { + client.sadd('____sites____', s, callback); +}; + +Db.prototype.removePageAsDone = function(s, callback) { + client.srem('____sites____', s, callback); +}; + +Db.prototype.addRelation = function(owner,relation,i) { + if(isMulti) { + multi.sadd(owner, relation); + if(typeof i == 'undefined') { + i = 1; + } + multi.incrby(owner+':'+relation, i); + } + else { + client.sadd(owner, relation); + if(typeof i == 'undefined') { + i = 1; + } + client.incrby(owner+':'+relation, i); + } +}; + +Db.prototype.addToGlobalCounter = function(owner,s) { + if(isMulti) { + // add to our general 'ALL' collection, to identify the most used words of all + multi.sadd('____all____', s); // add keyword + multi.incrby('____all____'+':'+s, owner); // track its density + } + else { + client.sadd('____all____', s); // add keyword + client.incrby('<'+':'+s, owner); // track its density + } +}; + +Db.prototype.getTopRelations = function(owner, callback, res) { + client.sort('____all____', "by", "____all____:*", 'LIMIT', 0, 500, 'DESC', "get", "#", function (err1, items1) { + // get most often realted keywords for the given keyword + client.sort(owner, "by", owner+":*", 'LIMIT', 0, 120, 'DESC', "get", "#", function (err2, items2) { + // remove the noise by removing the most often used keywords + callback(tools.inAButNotInB(items2,items1),res); + }); + }); +}; + +Db.prototype.flush = function(callback) { + if(false === isMulti || multi === null) { + console.log('Error - you must use enableMulti() before flush().'); + return; + } + callback = callback || null; + multi.exec(callback); + // that.disableMulti(); + multi = null; + isMulti = false; +}; + +module.exports = Db; \ No newline at end of file diff --git a/app/http.js b/app/http.js new file mode 100644 index 0000000..5ff51b0 --- /dev/null +++ b/app/http.js @@ -0,0 +1,50 @@ +var restify = require('restify'); +var express = require('express'); +var Db = require('./db'); +var config = require('../config'); + +var Http = function() { + server = restify.createServer(); + server.use(restify.bodyParser()); + db = new Db(); + that = this; + + // initialise routes + that.initRoutes(); + that.init(); +}; + +Http.prototype.initRoutes = function() { + if(config.creds.debug) { + console.log('initRoutes'); + } + // Set up our routes + server.get('/relations/:name', that.responseRelations); +}; + +Http.prototype.init = function() { + if(config.creds.debug) { + console.log('start server'); + } + // start the server + server.listen(config.creds.server_port, function() { + console.log('%s listening at %s', server.name, server.url); + }); +}; + +Http.prototype.responseRelations = function(req, res, next) { + if(config.creds.debug) { + console.log('responseRelations for "'+req.params.name+'"'); + } + console.log('Will deliver top relations for requested word "'+req.params.name+'".'); + db.getTopRelations(req.params.name, that.doResponse, res); +}; + +Http.prototype.doResponse = function(data, res) { + if(config.creds.debug) { + console.log('doResponse'); + } + res.send(data); +}; + +module.exports = Http; \ No newline at end of file diff --git a/app/model.js b/app/model.js new file mode 100644 index 0000000..e957424 --- /dev/null +++ b/app/model.js @@ -0,0 +1,13 @@ +var Model = function(s) { + val = s || null; +}; + +Model.prototype.getValue = function() { + return val; +}; + +Model.prototype.setValue = function(s) { + val = s; +}; + +module.exports = Model; \ No newline at end of file diff --git a/app/scraping.js b/app/scraping.js new file mode 100644 index 0000000..772ef97 --- /dev/null +++ b/app/scraping.js @@ -0,0 +1,149 @@ +var Db = require('./db'); +var Analyse = require('./analyse'); +var restify = require('restify'); +var config = require('../config.js'); +var $ = require("jquery"); + +var Scraper = function() { + db = new Db(); + analyse = new Analyse(); + that = this; + debug_once = false; + wikipedia = restify.createJsonClient({ + url: 'http://'+config.creds.lang+'.wikipedia.org', + version: '*' + }); +}; + +Scraper.prototype.wikiSearch = function(s) { + if(config.creds.debug) { + console.log('wikiSearch'); + } + + // check if not empty string + if(typeof s == 'undefined' || s === '' || debug_once) { + console.log('wikiSearch - the given string is empty or undefined!'); + console.log('node will exit now.'); + process.exit(1); + return; + } + + wikipedia.get('/w/api.php?action=opensearch&search='+escape(s)+'&format=json&limit=3', function(err, req, res, data) { + if(typeof data[1] == 'undefined' || typeof data[1][0] == 'undefined') { + if(config.creds.debug) { + console.log('No page found in wikipedia for '+req.path); + } + db.removePageFromQueue(s); + that.goToNext(); + return; + } + + // get first matching result + var firstTitle = data[1][0]; + + // set first result as done + db.addPageAsDone(firstTitle,function (err, result) { + if(config.creds.debug) { + console.log('addPageAsDone callback'); + } + if(result) { + that.wikiGrab(firstTitle); + db.removePageFromQueue(firstTitle); + } + else { + if(config.creds.debug) { + console.log(firstTitle+' was crawled already!'); + } + that.goToNext(); + return false; + } + }); + + // add all sites to queue + for (var i = data[1].length - 1; i >= 0; i--) { + console.log('Added '+data[1][i]+' to queue!'); + db.addPageToQueue(data[1][i]); + } + + if(config.creds.debug) { + debug_once = true; + } + }); +}; + +Scraper.prototype.wikiGrab = function(s) { + if(config.creds.debug) { + console.log('wikiGrab with '+s); + } + wikipedia.get('/w/api.php?rvprop=content&format=json&prop=revisions|categories&rvprop=content&action=query&titles='+escape(s), function(err, req, res, data) { + if(typeof data.query == 'undefined') { + that.goToNext(); + return false; + } + + // check if valid content + if(typeof data.query.pages[Object.keys(data.query.pages)[0]].revisions == 'undefined') { + that.goToNext(); + return false; + } + + // get the main content of the wikipedia page + var rawtext = data.query.pages[Object.keys(data.query.pages)[0]].revisions[0]["*"]; + // now split the whole content into text blocks + var parts = rawtext.split(/\n|\r/); + var snippets = []; + + if(config.creds.debug) { + console.log('going to http://'+config.creds.lang+'.wikipedia.org/wiki/'+s); + } + + // loop all text blocks and pull these with more than config.creds.min_text_block_length (default: 120) chars + for (var i = parts.length - 1; i >= 0; i--) { + if(parts[i].length > config.creds.min_text_block_length) { + snippets.push(parts[i]); + } + } + + console.log('Will now process "'+s+'" with '+snippets.length+' text blocks.'); + + if(snippets.length > 0) { + // give the loop worker something to do + that.loopWorker(snippets); + } + else { + // restart fetch + that.goToNext(); + } + + }); +}; + +Scraper.prototype.loopWorker = function(snippets) { + if(config.creds.debug) { + console.log('loopWorker'); + } + // when snippetbox is empty, restart fetch + if(snippets.length === 0) { + if(config.creds.debug) { + console.log('Finished all Snippets!'); + } + that.goToNext(); + return; + } + + // analyze full text block + $.when(analyse.scanTextBlock(snippets.pop(),snippets.length)).done(function() { + // set a timeout to be gently to the memory and cpu + // (can be changed in the config file) + var t=setTimeout(function(){that.loopWorker(snippets);},config.creds.sleeptime); + }); +}; + +Scraper.prototype.goToNext = function() { + if(config.creds.debug) { + console.log('goToNext'); + } + db.getRandomItemFromQueue(that.wikiSearch); +}; + +module.exports = Scraper; \ No newline at end of file diff --git a/classes/apiprovider.js b/classes/apiprovider.js deleted file mode 100644 index 7826482..0000000 --- a/classes/apiprovider.js +++ /dev/null @@ -1,88 +0,0 @@ -module.exports = function(client) { - // var that = this; - - return { - /** - * function inAButNotInB will remove all items from array a which are in array b - * depending on underscore.js - * - * @param - * @return - */ - inAButNotInB: function (A, B) { - return _.filter(A, function (d) { - return !_.contains(B, d); - }); - }, - - Base: Base, - - /** - * function doResponse will send the response to the client - * - * @param string data - * @return res - */ - doResponse: function (data, res) { - res.send(data); - }, - - /** - * function getRelations will take action as a router function to deliver all relations to the requested keyword - * - * @param string req.params.name - * @return boolean - */ - getRelations: function (req, res, next) { - var base = new Base(req.params.name,client); - base.setRes(res); - base.getTopRelations(); - } - } -}; - -/** - * class Base will get handle database-actions related to one keyword - * - * @param string val - * @return boolean - */ -Base = function (val,client) { - // Store variables - var that = this, - multi_in = client.multi(), // to pipeline actions for redis - res; - - // to set the restify response, a bit hacky actually - this.setRes = function(val) { - res = val; - }; - - // process the pipelined actions in redis - this.save = function() { - multi_in.exec(); - }; - - // get all relationes, without the noise - this.getTopRelations = function() { - // get most often used keywords (limit 500) - client.smembers('____all____', function (err1, items1) { - console.log(items1); - return; - // get most often realted keywords for the given keyword - client.sort(val, "by", val+":*", 'LIMIT', 0, 120, 'DESC', "get", "#", function (err2, items2) { - // remove the noise by removing the most often used keywords - doResponse(inAButNotInB(items2,items1),res); - }); - }); - }; - - // add word and count up - this.pushRelation = function(rel, incr) { - multi_in.sadd(val, rel); - if(typeof incr == 'undefined') { - incr = 1; - } - multi_in.incrby(val+':'+rel, incr); - }; -}; \ No newline at end of file diff --git a/classes/textanalyze.js b/classes/textanalyze.js deleted file mode 100644 index 8d7625e..0000000 --- a/classes/textanalyze.js +++ /dev/null @@ -1,70 +0,0 @@ -module.exports = function(client) { - - return { - /** - * function analyzeText will get the content for the given wikipedia page title - * - * @param string title - * @return boolean - */ - analyzeText: function (snippet,counter) { - // split the text block to words - var words = tools.tokenize(snippet); - - if(config.creds.debug) - console.log('Count of words in snippet ('+counter+'): '+words.length); - - // create empty object - var obj = {}; - - var multi = client.multi(); - - // loop all words - for (var i = words.length - 1; i >= 0; i--) { - - // count all seen words - if(typeof obj[words[i].toLowerCase()] == 'undefined') - obj[words[i].toLowerCase()] = 1; - else - obj[words[i].toLowerCase()]++; - - // add every word to the queue to spread the scrape - multi.sadd('____sites2do____',words[i].toLowerCase()); - - // if(config.creds.debug) - // console.log(words[i].toLowerCase()+'¥ - '+words[j].toLowerCase()+' - '+similar_text(words[i].toLowerCase(),words[j].toLowerCase(),1)); - } - - var base; - - $.each(obj, function(index, value) { - - // skip if not valid - if(typeof index == 'undefined' || typeof index.toLowerCase == 'undefined') - return; - - // create new obj from class Base, make sure to work with lowercase only - base = new Base(index.toLowerCase()); - - // loop all words - $.each(obj, function(index2, value2) { - if(index != index2) { - // add relation, value2 is the counter of how often the word was seen in the recent textblock - base.pushRelation(index2.toLowerCase(),value2); - } - }); - - base.save(); - - // add to our general 'ALL' collection, to identify the most used words of all - multi.sadd('____all____', index.toLowerCase()); // add keyword - multi.incrby('____all____'+':'+index.toLowerCase(), value); // track its density - - }); - - multi.exec(function(err, replies) { - return true; - }); - } - } -}; \ No newline at end of file diff --git a/classes/wiki.js b/classes/wiki.js deleted file mode 100644 index 7581d77..0000000 --- a/classes/wiki.js +++ /dev/null @@ -1,132 +0,0 @@ -module.exports = function(client) { - return { - /** - * function wikiSearch will start the main processes to search for the best wikipedia page for the given string - * - * @param string term - * @return boolean - */ - wikiSearch: function (term) { - // do api call - wikipedia.get('/w/api.php?action=opensearch&search='+escape(term)+'&format=json&limit=3', function(err, req, res, data) { - - if(typeof data[1] == 'undefined' || typeof data[1][0] == 'undefined') { - if(config.creds.debug) - console.log('No page found in wikipedia for '+req.path); - client.srem('____sites2do____',term); - goToNext(); - return; - } - - // get first matching result - var firstTitle = data[1][0]; - - // set first result as done - client.sadd('____sites____', firstTitle, function (err, result) { - if(result) { - wikiGrab(firstTitle); - client.srem('____sites2do____',firstTitle); - } - else { - if(config.creds.debug) - console.log(firstTitle+' was crawled already!'); - goToNext(); - return false; - } - }); - - // add all sites to queue - for (var i = data[1].length - 1; i >= 0; i--) { - client.sadd('____sites2do____',data[1][i]); - } - - }); - }, - - /** - * function wikiGrab will get the content for the given wikipedia page title - * - * @param string title - * @return boolean - */ - wikiGrab: function (title) { - // do the api call - wikipedia.get('/w/api.php?rvprop=content&format=json&prop=revisions|categories&rvprop=content&action=query&titles='+escape(title), function(err, req, res, data) { - if(typeof data.query == 'undefined') { - goToNext(); - return false; - } - - // check if valid content - if(typeof data.query.pages[Object.keys(data.query.pages)[0]].revisions == 'undefined') { - goToNext(); - return false; - } - - // get the main content of the wikipedia page - var rawtext = data.query.pages[Object.keys(data.query.pages)[0]].revisions[0]["*"]; - // now split the whole content into text blocks - var parts = rawtext.split(/\n|\r/); - var snippets = []; - - if(config.creds.debug) - console.log('going to http://'+config.creds.lang+'.wikipedia.org/wiki/'+title); - - // loop all text blocks and pull these with more than config.creds.min_text_block_length (default: 120) chars - for (var i = parts.length - 1; i >= 0; i--) { - if(parts[i].length > config.creds.min_text_block_length) { - snippets.push(parts[i]); - } - } - - if(snippets.length > 0) { - // give the loop worker something to do - loopWorker(snippets); - } - else { - // restart fetch - goToNext(); - } - - }); - }, - - /** - * function loopWorker will process all snippets gently for your system - * - * @param array snippets - * @return - */ - loopWorker: function (snippets) { - // when snippetbox is empty, restart fetch - if(snippets.length === 0) { - if(config.creds.debug) - console.log('Count of snippets: '+snippets.length); - goToNext(); - return; - } - - // analyze full text block - $.when(text.analyzeText(snippets.pop(),snippets.length)).done(function() { - // set a timeout to be gently to the memory and cpu - // (can be changed in the config file) - var t=setTimeout(function(){loopWorker(snippets);},config.creds.sleeptime); - }); - }, - - /** - * function goToNext will move on to a random element to search for in the queue ____sites2do____ which is stored in redis - * - * @param - * @return - */ - goToNext: function () { - if(config.creds.debug) - console.log('NEXT'); - client.smembers('____sites2do____', function (err, result) { - var randomnr=Math.floor(Math.random()*result.length); - wikiSearch(result[randomnr]); - }); - } - } -}; \ No newline at end of file diff --git a/config.js b/config.js index db832b8..ed9e38a 100644 --- a/config.js +++ b/config.js @@ -1,11 +1,12 @@ // Don't commit this file to your public repos exports.creds = { + http_server: false, // default: false server_port: 8080, // default: 8080 sleeptime: 0, // sleep time in ms between the iteration of all textblocks mongoose_auth_local: 'mongodb://localhost/opensemanticapi', // default: mongodb://localhost/opensemanticapi min_text_block_length: 120, // default: 120 - debug: true, // default: true + debug: false, // default: true lang: 'en' // default: english // lang: 'de' // german - // lang: 'es' // spansih + // lang: 'es' // spanish }; \ No newline at end of file diff --git a/lib/tools.js b/lib/tools.js index d111e8f..4012db4 100644 --- a/lib/tools.js +++ b/lib/tools.js @@ -1,5 +1,11 @@ +var _ = require("underscore"); module.exports = { + inAButNotInB: function (A, B) { + return _.filter(A, function (d) { + return !_.contains(B, d); + }); + }, similar_text: function (first, second, percent) { // http://kevin.vanzonneveld.net // + original by: Rafał Kukawski (http://blog.kukawski.pl)