Skip to content

Commit

Permalink
Merge branch 'leveldb'
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Klein committed Sep 8, 2013
2 parents 1861bd1 + 3ff9463 commit a63259c
Show file tree
Hide file tree
Showing 10 changed files with 462 additions and 329 deletions.
8 changes: 7 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,14 @@ The following examples where given after the system was collecting for about one

* install npm and node if you have not already (http://howtonode.org/introduction-to-npm or http://nodejs.org/)
* install / start your redis server (http://redis.io/topics/quickstart) on a disk with several free GB

* clone this repo "git clone https://github.com/monbro/opensemanticapi.git"
* change config if needed in "/config.js"
* NOTE: depending on what you want to run (the scraper / cronjob or the http api server), set 'http_server' to false (scraper) or true (http api server)
* open the repository folder in your console
* enter "npm install", it will install all dependencies automatically
* start the node server with "node app.js"
* now it should print what it is collecting
* now it should print what it is collecting or what http route is requested
* the longer it collects data the better the results should be
* now you can access the relations through your browser, e.g. http://localhost:8080/relations/database, or directly by accessing your redis server

Expand All @@ -49,5 +51,9 @@ The following examples where given after the system was collecting for about one
* improve performance of script
* allow certain configurations (half done)
* try to use http://yeoman.io/ with its structure for more compatibility and understanding
* write tests
* connect to travis-ci
* better folder structure
* write version using level db or similar

This software is published under the MIT-License. See 'license' for more information.
341 changes: 23 additions & 318 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,339 +9,44 @@
* @version 0.1
*/

/* Redis key names used
* ____sites____ = wikipedia page titles which were scraped already
* ____sites2do____ = wikipedia page titles and words which are queued to be scraped
* ____all____ = a collection of all words ever seen, each with an incrementing counter
*/

/**
* Modules
*/
var restify = require('restify');
var express = require('express');
var config = require('./config.js');
var tools = require('./lib/tools');
var redis = require("redis");
// var mongoose = require('mongoose');
var _ = require("underscore");
var $ = require("jquery");

/**
 * Objects
 */

// redis client used for the crawl queue and the word relations
var client = redis.createClient();

// restify server which serves the http api; request bodies go through bodyParser
var server = restify.createServer();
server.use(restify.bodyParser());

// restify json client pointed at the wikipedia host of the configured language
var wikipediaHost = 'http://'+config.creds.lang+'.wikipedia.org';
var wikipedia = restify.createJsonClient({
    url: wikipediaHost,
    version: '*'
});

/**
 * Run
 */

// kick off the crawl with the seed keyword
wikiSearch('database');

/**
* Helper functions
*/

/**
 * function wikiSearch looks up the best matching wikipedia page for the given
 * term, registers it in ____sites____ and hands it over to wikiGrab
 *
 * All titles returned by the opensearch call are added to the queue
 * ____sites2do____. Whenever the term cannot be processed the crawl moves on
 * via goToNext().
 *
 * @param string term keyword or page title to search for
 * @return undefined (all work happens in asynchronous callbacks)
 */
function wikiSearch(term) {
    // encodeURIComponent emits UTF-8 percent-encoding and also encodes '+', '/' and '&',
    // which escape() either mangles (non-ASCII) or leaves untouched
    var path = '/w/api.php?action=opensearch&search='+encodeURIComponent(term)+'&format=json&limit=3';

    wikipedia.get(path, function(err, req, res, data) {

        // failed request or unusable body: leave the term queued and move on instead of crashing
        if(err || !data) {
            if(config.creds.debug)
                console.log('Wikipedia search failed for '+term+': '+err);
            goToNext();
            return;
        }

        if(typeof data[1] == 'undefined' || typeof data[1][0] == 'undefined') {
            if(config.creds.debug)
                console.log('No page found in wikipedia for '+req.path);
            client.srem('____sites2do____',term);
            goToNext();
            return;
        }

        // data[1] holds the matching page titles, the first one is the best match
        var titles = data[1];
        var firstTitle = titles[0];

        // sadd reports 1 when the title was new, 0 when it was registered before
        client.sadd('____sites____', firstTitle, function (err, result) {
            if(!err) {
                // the title is registered as crawled now, so neither it nor the term that
                // led to it should be drawn from the queue again
                client.srem('____sites2do____',firstTitle);
                if(titles.indexOf(term) === -1)
                    client.srem('____sites2do____',term);
            }

            if(result) {
                wikiGrab(firstTitle);
            }
            else {
                if(config.creds.debug)
                    console.log(firstTitle+' was crawled already!');
                goToNext();
            }
        });

        // add all sites to queue
        for (var i = titles.length - 1; i >= 0; i--) {
            client.sadd('____sites2do____',titles[i]);
        }

    });
}

/**
 * function wikiGrab fetches the raw content of the given wikipedia page, cuts
 * it into text blocks and passes the long enough ones to loopWorker
 *
 * If the request fails, the page has no usable revision or no block is longer
 * than config.creds.min_text_block_length, the crawl moves on via goToNext().
 *
 * @param string title wikipedia page title
 * @return undefined (all work happens in the asynchronous callback)
 */
function wikiGrab(title) {
    // encodeURIComponent emits UTF-8 percent-encoding and encodes '+', '/' and '&' inside titles
    var path = '/w/api.php?rvprop=content&format=json&prop=revisions|categories&rvprop=content&action=query&titles='+encodeURIComponent(title);

    wikipedia.get(path, function(err, req, res, data) {
        // failed request or reply without a page list
        if(err || !data || typeof data.query == 'undefined' || !data.query.pages) {
            goToNext();
            return;
        }

        // only the first page of the reply is used
        var pages = data.query.pages;
        var page = pages[Object.keys(pages)[0]];

        // check if valid content
        if(!page || typeof page.revisions == 'undefined' || !page.revisions[0]) {
            goToNext();
            return;
        }

        // the main content of the wikipedia page is stored under the "*" key
        var rawtext = page.revisions[0]["*"];
        if(typeof rawtext != 'string') {
            goToNext();
            return;
        }

        // now split the whole content into text blocks
        var parts = rawtext.split(/\n|\r/);
        var snippets = [];

        if(config.creds.debug)
            console.log('going to http://'+config.creds.lang+'.wikipedia.org/wiki/'+title);

        // collect, last block first, every block with more than
        // config.creds.min_text_block_length chars
        for (var i = parts.length - 1; i >= 0; i--) {
            if(parts[i].length > config.creds.min_text_block_length) {
                snippets.push(parts[i]);
            }
        }

        if(snippets.length > 0) {
            // give the loop worker something to do
            loopWorker(snippets);
        }
        else {
            // restart fetch
            goToNext();
        }

    });
}

/**
 * function loopWorker works through the given snippets one at a time, pausing
 * config.creds.sleeptime milliseconds between two snippets
 *
 * The array is consumed from its end; once it is empty the crawl moves on
 * via goToNext().
 *
 * @param array snippets text blocks still to be analyzed (is emptied in place)
 * @return undefined
 */
function loopWorker(snippets) {
    var nothingLeft = snippets.length === 0;

    // when snippetbox is empty, restart fetch
    if(nothingLeft) {
        if(config.creds.debug) {
            console.log('Count of snippets: '+snippets.length);
        }
        goToNext();
        return;
    }

    // take the last snippet; the counter passed along is the number of snippets left after it
    var current = snippets.pop();
    var remaining = snippets.length;

    // the pause keeps memory and cpu usage gentle (can be changed in the config file)
    function scheduleNextRound() {
        setTimeout(function() {
            loopWorker(snippets);
        }, config.creds.sleeptime);
    }

    $.when(analyzeText(current, remaining)).done(scheduleNextRound);
}

/**
 * function goToNext draws a random member from the redis queue
 * ____sites2do____ and starts a wikipedia search for it
 *
 * @param
 * @return undefined
 */
function goToNext() {
    if(config.creds.debug) {
        console.log('NEXT');
    }

    // whatever redis hands back is passed on to wikiSearch unchanged
    var continueWith = function (err, queuedEntry) {
        wikiSearch(queuedEntry);
    };

    client.srandmember('____sites2do____', continueWith);
}

/**
 * function analyzeText counts the words of one text block and stores, for
 * every word, its relation to every other word of that block in redis
 *
 * Every word is also queued in ____sites2do____ and counted in ____all____.
 * NOTE: the redis writes are only dispatched here; the function returns
 * before they have completed.
 *
 * @param string snippet text block to analyze
 * @param number counter only used for the debug output
 * @return undefined
 */
function analyzeText(snippet,counter) {

    // split the text block to words
    var words = tools.tokenize(snippet);

    if(config.creds.debug)
        console.log('Count of words in snippet ('+counter+'): '+words.length);

    // prototype-less map: words such as "constructor" must not collide with
    // properties inherited from Object.prototype
    var counts = Object.create(null);

    var multi = client.multi();

    // loop all words, lowercase only
    for (var i = words.length - 1; i >= 0; i--) {
        var word = words[i].toLowerCase();

        // count all seen words
        counts[word] = (counts[word] || 0) + 1;

        // add every word to the queue to spread the scrape
        multi.sadd('____sites2do____',word);
    }

    var uniqueWords = Object.keys(counts);

    uniqueWords.forEach(function(current) {

        // create new obj from class Base for the current word
        var base = new Base(current);

        // relate the current word to every other word of the block; the increment
        // is how often the other word was seen in this block
        uniqueWords.forEach(function(other) {
            if(other != current) {
                base.pushRelation(other, counts[other]);
            }
        });

        base.save();

        // add to our general 'ALL' collection, to identify the most used words of all
        multi.sadd('____all____', current); // add keyword
        multi.incrby('____all____'+':'+current, counts[current]); // track its density

    });

    multi.exec(function(err, replies) {
        if(err && config.creds.debug)
            console.log('Could not store the words of the snippet: '+err);
    });
}

/**
* function inAButNotInB will remove all items from array a which are in array b
* depending on underscore.js
*
* @param
* @return
* load config
*/
function inAButNotInB(A, B) {
    // an entry of A survives only when underscore cannot find it in B
    var isMissingFromB = function (candidate) {
        return !_.contains(B, candidate);
    };

    return _.filter(A, isMissingFromB);
}
var config = require('./config');

/**
* class Base will get handle database-actions related to one keyword
*
* @param string val
* @return boolean
* Basic Objects
*/
function Base(val) {

// Store variables
var that = this,
multi_in = client.multi(), // to pipeline actions for redis
res;
// our basic app object
var App = function() {};

// to set the restify response, a bit hacky actually
this.setRes = function(val) {
res = val;
};
// our basic app
var app = new App();

// process the pipelined actions in redis
this.save = function() {
multi_in.exec();
};
var Model = require('./app/model');

// get all relations, without the noise
this.getTopRelations = function() {
// get most often used keywords (limit 500)
client.sort('____all____', "by", "____all____:*", 'LIMIT', 0, 500, 'DESC', "get", "#", function (err1, items1) {
// get the most often related keywords for the given keyword
client.sort(val, "by", val+":*", 'LIMIT', 0, 120, 'DESC', "get", "#", function (err2, items2) {
// remove the noise by removing the most often used keywords
doResponse(inAButNotInB(items2,items1),res);
});
});
};
App.prototype.getModel = function(s) {
return new Model(s);
};

// add word and count up
this.pushRelation = function(rel, incr) {
multi_in.sadd(val, rel);
if(typeof incr == 'undefined') {
incr = 1;
}
multi_in.incrby(val+':'+rel, incr);
};
}
// var test = app.getModel('test');
// console.log(test.getValue());

/**
 * function doResponse hands the given payload to the response object
 *
 * @param mixed data payload passed on to res.send
 * @param object res response object exposing send()
 * @return undefined
 */
function doResponse(data, res) {
    var payload = data;
    res.send(payload);
}
if(!config.creds.http_server) {
var Scraper = require("./app/scraping");
var scraper = new Scraper();

/**
* function getRelations will take action as a router function to deliver all relations to the requested keyword
*
* @param string req.params.name
* @return boolean
*/
function getRelations(req, res, next) {
var base = new Base(req.params.name);
base.setRes(res);
base.getTopRelations();
// Start Cronjob
scraper.wikiSearch('database');
}

/**
 * Routes
 */

// GET /relations/:name is answered by getRelations
server.get('/relations/:name', getRelations);

/**
 * Server
 */

// listen on the configured port and report name and url once the server is up
server.listen(config.creds.server_port, function announceListening() {
    console.log('%s listening at %s', server.name, server.url);
});
// Start HTTP API RESTFUL Server
if(config.creds.http_server) {
var Http = require("./app/http");
var http = new Http();
}
Loading

0 comments on commit a63259c

Please sign in to comment.