From fce9fcef4cec7eb6c00ff74bcb2a552044f0465e Mon Sep 17 00:00:00 2001 From: Kasun Perera Date: Mon, 23 Sep 2013 10:57:05 +0530 Subject: [PATCH] Tool for processing Wikipedia Categories This contains all the source code written as part of the Google Summer of Code program- 2013 Project- Type inference to extend coverage Student- Kasun Perera Mentor- Marco Fossati --- WikipediaCategoryProcessor/.gitignore | 4 + WikipediaCategoryProcessor/pom.xml | 69 + .../kasun/categoryprocessor/Category.java | 25 + .../kasun/categoryprocessor/CategoryDB.java | 286 ++++ .../categoryprocessor/CategoryLinksDB.java | 363 +++++ .../categoryprocessor/CategoryProcesor.java | 130 ++ .../categoryprocessor/DB_connection.java | 30 + .../kasun/categoryprocessor/DataProcesor.java | 77 + .../kasun/categoryprocessor/EdgeDB.java | 203 +++ .../kasun/categoryprocessor/Edges.java | 122 ++ .../dbpedia/kasun/categoryprocessor/Node.java | 133 ++ .../kasun/categoryprocessor/NodeDB.java | 268 ++++ .../dbpedia/kasun/categoryprocessor/Page.java | 51 + .../kasun/categoryprocessor/PageDB.java | 296 ++++ .../categoryprocessor/languageLinksDB.java | 64 + .../kasun/extractdata/DataExtractor.java | 72 + .../dbpedia/kasun/extractdata/DoSearch.java | 98 ++ .../dbpedia/kasun/freebasequery/Mqlread.java | 74 + .../dbpedia/kasun/freebasequery/QueryFB.java | 71 + .../java/org/dbpedia/kasun/indexer/Index.java | 415 +++++ .../org/dbpedia/kasun/rdf/RdfGenarator.java | 143 ++ .../org/dbpedia/kasun/searcher/Search.java | 267 ++++ .../dbpedia/kasun/wikiquery/ReadXMLFile.java | 134 ++ .../kasun/wikiquery/RevisionHistory.java | 110 ++ .../dbpedia/kasun/wikiquery/WikiQuery.java | 49 + .../org/yago/javatools/administrative/D.java | 417 +++++ .../javatools/administrative/Elements.java | 64 + .../yago/javatools/datatypes/FinalMap.java | 41 + .../yago/javatools/datatypes/FinalSet.java | 73 + .../java/org/yago/javatools/parsers/Char.java | 1404 +++++++++++++++++ .../org/yago/javatools/parsers/NounGroup.java | 263 +++ 
.../yago/javatools/parsers/PlingStemmer.java | 923 +++++++++++ .../javatools/parsers/PositionTracker.java | 288 ++++ .../wikipediacategoryprocessor/AppTest.java | 38 + 34 files changed, 7065 insertions(+) create mode 100644 WikipediaCategoryProcessor/.gitignore create mode 100644 WikipediaCategoryProcessor/pom.xml create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Category.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryDB.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryLinksDB.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryProcesor.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DB_connection.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DataProcesor.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/EdgeDB.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Edges.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Node.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/NodeDB.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Page.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/PageDB.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/languageLinksDB.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/extractdata/DataExtractor.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/extractdata/DoSearch.java create mode 100644 
WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/Mqlread.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/QueryFB.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/indexer/Index.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/rdf/RdfGenarator.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/searcher/Search.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/ReadXMLFile.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/RevisionHistory.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/WikiQuery.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/D.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/Elements.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalMap.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalSet.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/Char.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/NounGroup.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PlingStemmer.java create mode 100644 WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PositionTracker.java create mode 100644 WikipediaCategoryProcessor/src/test/java/org/karsha/wikipediacategoryprocessor/AppTest.java diff --git a/WikipediaCategoryProcessor/.gitignore b/WikipediaCategoryProcessor/.gitignore new file mode 100644 index 0000000..5168a9a --- /dev/null +++ b/WikipediaCategoryProcessor/.gitignore @@ -0,0 +1,4 @@ +/target/ +/nbactions.xml 
+/nbactions-release-profile.xml + diff --git a/WikipediaCategoryProcessor/pom.xml b/WikipediaCategoryProcessor/pom.xml new file mode 100644 index 0000000..f85ebb9 --- /dev/null +++ b/WikipediaCategoryProcessor/pom.xml @@ -0,0 +1,69 @@ + + + 4.0.0 + + org.karsha + WikipediaCategoryProcessor + 1.0-SNAPSHOT + jar + + WikipediaCategoryProcessor + http://maven.apache.org + + + UTF-8 + + + + + junit + junit + 3.8.1 + test + + + mysql + mysql-connector-java + 5.1.25 + + + org.apache.lucene + lucene-core + 4.3.1 + + + com.jayway.jsonpath + json-path + 0.8.1 + + + + org.apache.clerezza.ext + org.json.simple + 0.3-incubating + + + com.google.api-client + google-api-client + 1.16.0-rc + + + + org.apache.lucene + lucene-queries + 4.3.1 + + + org.apache.lucene + lucene-queryparser + 4.3.1 + + + org.apache.lucene + lucene-analyzers-common + 4.3.1 + + + + diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Category.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Category.java new file mode 100644 index 0000000..8d5442c --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Category.java @@ -0,0 +1,25 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + + + +/** + * + * Date Author Changes + * Jul 20, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.categoryprocessor; + + + +/** + * TODO- describe the purpose of the class + * + */ +public class Category { + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryDB.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryDB.java new file mode 100644 index 0000000..cf51917 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryDB.java @@ -0,0 +1,286 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * Date Author Changes Jul 6, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.categoryprocessor; + + +import java.io.*; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; + +/** + * TODO- describe the purpose of the class + * + */ +public class CategoryDB +{ + + + + public static int getCategoryPageCount( int threshold ) + { + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "SELECT COUNT(*) FROM `page_category` WHERE `cat_subcats`=0 AND `cat_pages`< ? 
"; + + + try + { + ps = connection.prepareStatement( query ); + ps.setInt( 1, threshold ); + + rs = ps.executeQuery(); + int nodeId = 0; + while ( rs.next() ) + { + nodeId = rs.getInt( 1 ); + } + return nodeId; + } catch ( SQLException e ) + { + e.printStackTrace(); + return 0; + } + + } + + public static void getCategoryByName(String line) throws IOException + { + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + FileWriter outFile; + + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + String temp = null; + + + + // System.out.println(line); + // System.out.println(temp); + + String query = "SELECT cat_id, cat_title,cat_pages,cat_subcats,cat_files,cat_hidden FROM `category` WHERE `cat_title` LIKE ? "; +//String query = "SELECT cat_id, cat_title,cat_pages,cat_subcats,cat_files,cat_hidden FROM `category` WHERE `cat_title` = ? "; +//String query = "SELECT cat_id, cat_title,cat_pages,cat_subcats,cat_files,cat_hidden FROM `category` WHERE `cat_title` ="+catTitle; + + + try + { + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + ps.setString( 1, line ); + rs = ps.executeQuery(); + int count = 0; + + if ( rs.next() ) + { + do + { + //outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\category_match_article_pages.txt", true ); + //outFile.append( rs.getString( "cat_id" ) + "\t" + rs.getString( "cat_title" ) + "\t" + rs.getString( "cat_pages" ) + "\t" + rs.getString( "cat_subcats" ) + "\t" + rs.getString( "cat_files" ) + "\t" + rs.getString( "cat_hidden" ) + "\n" ); + // outFile.close(); + insertCategory( rs.getInt( "cat_id"), rs.getString( "cat_title" ), rs.getInt( "cat_pages"), rs.getInt( "cat_subcats"), rs.getInt( "cat_files"), rs.getBoolean( "cat_hidden" ) ); + count++; + if(count>1){ + System.out.println( count+" count is over one " + line); + } + } while ( rs.next() ); + } else + { + + outFile = new FileWriter( "F:\\Blogs\\GSOC 
2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categories_not_found_in_category_table_2.txt", true ); + outFile.append( line+ "\n" ); + outFile.close(); + + //System.out.println( line ); + // No data + } + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + + + + } + + + public static void getCategoryDirectedByArticlePage(String line) throws IOException + { + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + + + String lineArr[]; + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + lineArr=line.split("\t"); + + // System.out.println(line); + // System.out.println(temp); + String query = "SELECT cl_from, cl_to, cl_type FROM `categorylinks` WHERE `cl_from` =" + lineArr[0].trim() ; + + + try + { + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + //ps.setString( 1, catTitle ); + rs = ps.executeQuery(); + int count = 0; + + if ( rs.next() ) + { + do + { + FileWriter outFile = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categorylinks_match_article_pages_v1.txt", true ); + outFile.append( rs.getInt( "cl_from" ) + "\t" + rs.getString( "cl_to" ) + "\t" + rs.getString( "cl_type" ) + "\n" ); + outFile.close(); + count++; + } while ( rs.next() ); + } else + { + + FileWriter outFileCatNotFound = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categorylinks_not_found_article_pages_v1.txt", true ); + outFileCatNotFound.append( line + "\n" ); + outFileCatNotFound.close(); + + //System.out.println( line +"\t no category found"); + // No data + } + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + //} + //} + + + + } + + public static void getCategoryLinkByCatName(String line) throws IOException + { + DB_connection con = new DB_connection(); + Connection 
connection = con.dbConnect(); + + + + // String lineArr[]; + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + // lineArr=line.split("\t"); + + // System.out.println(line); + // System.out.println(temp); + String query = "SELECT cl_from FROM `categorylinks` WHERE `cl_to` LIKE " + line.trim() ; + + + try + { + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + //ps.setString( 1, catTitle ); + rs = ps.executeQuery(); + int count = 0; + + if ( rs.next() ) + { + do + { + + //if caegory does not have + if(!PageDB.isArticlePage( rs.getInt("cl_from") )){ + + } +// FileWriter outFile = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categorylinks_match_article_pages_v1.txt", true ); +// outFile.append( rs.getInt( "cl_from" ) + "\t" + rs.getString( "cl_to" ) + "\t" + rs.getString( "cl_type" ) + "\n" ); +// outFile.close(); +// count++; + } while ( rs.next() ); + } + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + //} + //} + + + + } + + public static void insertCategory( int cat_id,String cat_title, int cat_pages,int cat_subcats,int cat_files,boolean cat_hidden) + { + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + /* + * `cat_id` int(10) unsigned NOT NULL AUTO_INCREMENT, + `cat_title` varbinary(255) NOT NULL DEFAULT '', + `cat_pages` int(11) NOT NULL DEFAULT '0', + `cat_subcats` int(11) NOT NULL DEFAULT '0', + `cat_files` int(11) NOT NULL DEFAULT '0', + `cat_hidden` tinyint(1) unsigned NOT NULL DEFAULT '0', + */ + + String query = "INSERT IGNORE INTO page_category(cat_id,cat_title,cat_pages,cat_subcats,cat_files,cat_hidden) VALUES (?,?,?,?,?,?)"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt(1, cat_id); + ps.setString( 2, cat_title); + ps.setInt(3, cat_pages); + ps.setInt( 4, 
cat_subcats); + ps.setInt( 5, cat_files); + ps.setBoolean( 6, cat_hidden); + updateQuery = ps.executeUpdate(); + + connection.close(); + + } + catch(SQLException e) + { + e.printStackTrace(); + // return null; + } + + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryLinksDB.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryLinksDB.java new file mode 100644 index 0000000..a7af666 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryLinksDB.java @@ -0,0 +1,363 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * + * Date Author Changes Aug 13, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.categoryprocessor; + + +import java.io.*; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import org.apache.lucene.queryparser.classic.ParseException; +import org.dbpedia.kasun.searcher.Search; + +/** + * TODO- describe the purpose of the class + * + */ +public class CategoryLinksDB +{ + + public static void getCategoryByPageID() throws IOException + { + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + FileWriter outFile; + FileWriter outFile1; + int pageID; + String leafcategory; + + + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + String temp = null; + + + + + // System.out.println(line); + // System.out.println(temp); + + // String query = "SELECT cl_to FROM `categorylinks` WHERE `cl_from` = ? 
"; + + // String query = "SELECT `cl_to` FROM `category_only_page` JOIN `categorylinks` ON `category_only_page`.`page_id` = `categorylinks`.`cl_from` WHERE `page_title` = '"+leafcategory+"'"; + + try + { + + + File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\page_id_page_title_leaf_categories_page_less_than_90.txt" ); + + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( catPagesFile ) ); + //FileWriter outFile; + // FileWriter outFileCatNotFound; + + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + String splitLine[] = line.split( "\t" ); + leafcategory = splitLine[1].trim(); + pageID = Integer.valueOf( splitLine[0] ); + + String query = "SELECT `cl_to` FROM `categorylinks` WHERE `cl_from` = " + splitLine[0].trim(); + + + + + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + // ps.setInt( 1, pageID ); + rs = ps.executeQuery(); + int count = 0; + + if ( rs.next() ) + { + NodeDB.insertNode( pageID, leafcategory ); + // int childID= NodeDB.getCategoryId( leafcategory ); + do + { + //outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\category_match_article_pages.txt", true ); + //outFile.append( rs.getString( "cat_id" ) + "\t" + rs.getString( "cat_title" ) + "\t" + rs.getString( "cat_pages" ) + "\t" + rs.getString( "cat_subcats" ) + "\t" + rs.getString( "cat_files" ) + "\t" + rs.getString( "cat_hidden" ) + "\n" ); + // outFile.close(); + //insertCategory( rs.getInt( "cat_id"), rs.getString( "cat_title" ), rs.getInt( "cat_pages"), rs.getInt( "cat_subcats"), rs.getInt( "cat_files"), rs.getBoolean( "cat_hidden" ) ); + int parentID = PageDB.getPageId( rs.getString( "cl_to" ).trim() ); + if ( parentID > 0 ) + { + NodeDB.insertNode( parentID, rs.getString( "cl_to" ).trim() ); + // int parentID= NodeDB.getCategoryId( rs.getString( "cl_to" 
) ); + + EdgeDB.insertEdge( parentID, pageID ); + } else + { + outFile1 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\Parent_child_not_inderted_to_node_table.txt", true ); + outFile1.append( rs.getString( "cl_to" ).trim() + "\n" ); + outFile1.close(); + } + count++; + + } while ( rs.next() ); + } else + { + + outFile = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categories_pages_not_found_in_page_table.txt", true ); + outFile.append( pageID + "\t" + leafcategory + "\n" ); + outFile.close(); + + //System.out.println( line ); + // No data + } + + System.out.println( count ); + } + } + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + + + + } + + public static void insertParentChild() throws IOException, ParseException + { + + + FileWriter outFile; + FileWriter outFile1; + FileWriter outFile2; + int pageID; + // int catID; + String leafcategory; + + + + int updateQuery = 0; + String temp = null; + + + + + try + { + + + File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\leaf_categories_page_less_than_90.txt" ); + + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( catPagesFile ) ); + //FileWriter outFile; + // FileWriter outFileCatNotFound; + + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + String splitLine[] = line.split( "\t" ); + leafcategory = splitLine[1].trim(); + // catID= ; + pageID = PageDB.getPageId( leafcategory ); + + if ( pageID > 0 ) + { + NodeDB.insertNode( pageID, leafcategory ); + + /* + * search index and get the cl_to by pageID + */ + + ArrayList listOfClTo = Search.SearchCatPageLinks( pageID ); + + for ( int i = 0; i < listOfClTo.size(); i++ ) + { + + int parentID = PageDB.getPageId( listOfClTo.get( i ) ); + if ( parentID > 0 
) + { + NodeDB.insertNode( parentID, listOfClTo.get( i ) ); + // int parentID= NodeDB.getCategoryId( rs.getString( "cl_to" ) ); + + EdgeDB.insertEdge( parentID, pageID ); + } else + { + outFile1 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\data_not_inserted_node_table\\Parent_child_not_inderted_to_node_table_V2.txt", true ); + outFile1.append( listOfClTo.get( i ) + "\n" ); + outFile1.close(); + } + // count++; + + } + } else + { + outFile2 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\data_not_inserted_node_table\\Child_nodes_not_inderted_to_node_table_V2.txt", true ); + outFile2.append( line + "\n" ); + outFile2.close(); + } + } + } + } catch ( Exception e ) + { + e.printStackTrace(); + // return 0; + } + + + + } + + public static void insertParentChildModified() throws IOException, ParseException + { + + + FileWriter outFile; + FileWriter outFile1; + FileWriter outFile2; + + // int catID; + String leafcategory; + + + + int updateQuery = 0; + String temp = null; + +int count=0; + + + try + { + + + File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\leaf_categories_page_less_than_90_edited_4.txt" ); + + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( catPagesFile ) ); + //FileWriter outFile; + // FileWriter outFileCatNotFound; + + // HashMap pageMap = PageDB.getAllPages(); + + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + String splitLine[] = line.split( "\t" ); + leafcategory = splitLine[1].trim(); + // catID= ; + // pageID = PageDB.getPageId( leafcategory ); + int pageID=0; + LinkedList pageIdList= Search.SearchCategoryPages( leafcategory ); + if(!pageIdList.isEmpty() ){ + pageID =pageIdList.get(0); + } + + if ( pageID > 0 ) + { + NodeDB.insertNode( pageID, leafcategory ); + + /* + * 
search index and get the cl_to by pageID + */ + + ArrayList listOfClTo = Search.SearchCatPageLinks( pageID ); + + for ( int i = 0; i < listOfClTo.size(); i++ ) + { + int parentID = 0; + // int parentID = PageDB.getPageId( listOfClTo.get( i ) ); + + LinkedList parentIdList= Search.SearchCategoryPages( listOfClTo.get( i ) ); + if(!parentIdList.isEmpty() ){ + parentID =parentIdList.get(0); + } + if ( parentID > 0 ) + { + NodeDB.insertNode( parentID, listOfClTo.get( i ) ); + // int parentID= NodeDB.getCategoryId( rs.getString( "cl_to" ) ); + + EdgeDB.insertEdge( parentID, pageID ); + } else + { + outFile1 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\data_not_inserted_node_table\\Parent_child_not_inderted_to_node_table_V2.txt", true ); + outFile1.append( listOfClTo.get( i ) + "\n" ); + outFile1.close(); + } + // count++; + + } + } else + { + outFile2 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\data_not_inserted_node_table\\Child_nodes_not_inderted_to_node_table_V2.txt", true ); + outFile2.append( line + "\n" ); + outFile2.close(); + } + } + count++; + System.out.println(count); + } + } catch ( Exception e ) + { + e.printStackTrace(); + // return 0; + } + + + + } + + public static ArrayList getPagesLinkedByCatName( String catName ) + { + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + ArrayList listOfPages= new ArrayList(); + + String query = "select cl_from from categorylinks where cl_to=?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setString( 1, catName); + + rs = ps.executeQuery(); + + while (rs.next()) + { + listOfPages.add(rs.getInt( "cl_from" ) ); + } + connection.close(); + return listOfPages; + } + catch(SQLException e) + { + e.printStackTrace(); + return null; + } + + } +} diff --git 
a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryProcesor.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryProcesor.java new file mode 100644 index 0000000..9db297b --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryProcesor.java @@ -0,0 +1,130 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +package org.dbpedia.kasun.categoryprocessor; + + +import java.io.*; +import java.util.Scanner; +import org.apache.lucene.queryparser.classic.ParseException; + +/** + * + * Date Author Changes Jun 28, 2013 Kasun Perera Created + * + */ +public class CategoryProcesor +{ + + /** + * @param args the command line arguments + */ + public static void main( String[] args ) throws IOException, ParseException + { + + Edges edge= new Edges(); + edge.findProminetNodes(); + // CategoryLinksDB.insertParentChildModified(); + // PageDB.getAllPages(); + + /* + // inser category_only_pages + + //File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_dir\\pages_page_namespace_14_new_complete_line.txt" ); + + + + File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\leaf_categories_page_less_than_90.txt" ); + + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( catPagesFile ) ); + //FileWriter outFile; + // FileWriter outFileCatNotFound; + FileWriter outFile = new FileWriter("F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\page_id_page_title_leaf_categories_page_less_than_90.txt", true); + + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + String splitLine[]= line.split("\t"); + int pageId= PageDB.getPageId( splitLine[1].trim() ); + 
outFile.append( pageId +"\t"+splitLine[1].trim()+"\n" ); + // CategoryLinksDB.getCategoryByPageID( ); + + + } + } + + outFile.close(); + + */ + + + // CategoryDB.getCategoryByName(); + /* + File uniqueCatNamesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categories_not_found_in_category_table_ca_replaced_part_3.txt" ); + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( uniqueCatNamesFile ) ); + //FileWriter outFile; + // FileWriter outFileCatNotFound; + + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + // CategoryDB.getCategoryDirectedByArticlePage(line); + CategoryDB.getCategoryByName(line); + } + } + + */ + + /* + System.out.println("Threshold \t" +"Page Count"); + // TODO code application logic here + + +for(int i=1; i<100000; i++){ + FileWriter outFile = new FileWriter("F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\page_threshold_values.txt", true); + + int pageCount= CategoryDB.getCategoryPageCount( i ); + outFile.append(i+"\t" +pageCount+"\n"); + // System.out.println(i+"\t" +pageCount); + + outFile.close(); +} + */ + /* + + Scanner fileScanner = null; + Scanner childFileScanner= null; + Scanner parentFileScanner= null; + try + { + + //fileScanner = new Scanner( new File( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\preview.txt" ) ).useDelimiter("\\>*.\\<*"); + // fileScanner = new Scanner( new File( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\preview.txt" ) ); + fileScanner = new Scanner( new File( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\dbpedia_categories\\article_categories_en.nt" ) ); + + DataProcesor.inserDataToDB( fileScanner ); + parentFileScanner = new Scanner( new File( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\program_outputs\\parents.txt" )); + childFileScanner = new Scanner( new 
File( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\program_outputs\\all_children.txt") ); + + Node.sortChildren( parentFileScanner, childFileScanner ); + + } catch ( FileNotFoundException e ) + { + e.printStackTrace(); + } + + */ + //read category file and insert data to the database + + + //read leaf node file and update the database + + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DB_connection.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DB_connection.java new file mode 100644 index 0000000..3add8d0 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DB_connection.java @@ -0,0 +1,30 @@ +/* + * DO NOT MODIFY THIS FILE (it is already completed and should not be changed). + */ + +package org.dbpedia.kasun.categoryprocessor; +import java.sql.*; + +public class DB_connection { + public DB_connection() {}; + // "jdbc:mysql://localhost:3306/TweetComparison","root","nbuser" + //public Connection dbConnect(String db_connect_string, String db_userid, String db_password) { + public Connection dbConnect() { + + Connection conn = null; + try { + Class.forName("com.mysql.jdbc.Driver").newInstance(); + //conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/kasun","kasun","kasun_perrera_kk"); + conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/wiki_categories","root","nbuser"); + } catch (InstantiationException e) { + e.printStackTrace(); + } catch (IllegalAccessException e) { + e.printStackTrace(); + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } catch (SQLException e) { + e.printStackTrace(); + } + return conn; + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DataProcesor.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DataProcesor.java new file mode 100644 index 0000000..c1c10eb --- /dev/null +++ 
b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DataProcesor.java @@ -0,0 +1,77 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + + + +/** + * + * Date Author Changes + * Jun 29, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.categoryprocessor; + + +import java.io.FileWriter; +import java.io.IOException; +import java.util.Scanner; + + + +/** + * TODO- describe the purpose of the class + * + */ +public class DataProcesor { + + public static void inserDataToDB(Scanner fileScanner) throws IOException{ + + FileWriter outFile1 ; + FileWriter outFile2 ; + String line; + while ( fileScanner.hasNextLine() ) + { + // System.out.println(fileScanner.nextLine()); + //split the line by space, will get triples separated + line=fileScanner.nextLine(); + String[] typle=line.split("\\ "); + int parentId; + int childId; + + if(!typle[0].trim().equals("#")&&typle.length>2){ + // begin index=28 + String parent= typle[0].substring( 29, typle[0].length()-1 ); + // + String child= typle[2].substring( 38, typle[2].length()-1 ); + // System.out.println( "Line: " +line); + // System.out.println( "Parent: "+parent+" "+"child: "+ child ); + outFile1 = new FileWriter( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\program_outputs\\parents.txt", true ); + outFile2 = new FileWriter( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\program_outputs\\all_children.txt", true ); + + + //insert parent and child to the node- duplicate enties are handle by the SQL + // NodeDB.insertNode( parent ); + + outFile1.append(parent+"\n"); + // NodeDB.insertNode( child); + outFile2.append(child+"\n"); + //get child and parent Ids + parentId=NodeDB.getCategoryId( parent ); + childId= NodeDB.getCategoryId( child); + + // + EdgeDB.insertEdge( parentId, childId ); + + outFile1.close(); + outFile2.close(); + } + + + } + + } + +} diff --git 
a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/EdgeDB.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/EdgeDB.java new file mode 100644 index 0000000..c706c59 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/EdgeDB.java @@ -0,0 +1,203 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + + + +/** + * + * Date Author Changes + * Jun 29, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.categoryprocessor; + + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; + + + +/** + * TODO- describe the purpose of the class + * + */ +public class EdgeDB { + + public static void insertEdge(int parentId, int chidId){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "INSERT IGNORE INTO edges(parent_id,child_id) VALUES (?, ?)"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt(1, parentId); + ps.setInt(2, chidId); + updateQuery = ps.executeUpdate(); + +// while (rs.next()) +// { +// } + + } + catch(SQLException e) + { + e.printStackTrace(); + // return null; + } + + } + + public static ArrayList getChildren(int parenId){ + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + //TO-DO rewrite the query + String query = "SELECT child_id FROM edges WHERE parent_id=?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt( 1, parenId); + + rs = ps.executeQuery(); + + + ArrayList childrenList= new ArrayList(); + + + while (rs.next()) + { + childrenList.add(rs.getInt("child_id")); + } + return childrenList; + } + catch(SQLException 
e) + { + e.printStackTrace(); + return null; + } + + + } + public static ArrayList getParent(int leafNode){ + + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + //TO-DO rewrite the query + String query = "SELECT parent_id FROM edges WHERE child_id =?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt( 1, leafNode); + + rs = ps.executeQuery(); + + + ArrayList parents= new ArrayList(); + + + while (rs.next()) + { + parents.add(rs.getInt(1)); + } + return parents; + } + catch(SQLException e) + { + e.printStackTrace(); + return null; + } + + + + + + } + + public static ArrayList getChilren(int parentId){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "select parent_id,child_id from edges where parent_id=?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt(1, parentId); + + rs = ps.executeQuery(); + ArrayList chidId= new ArrayList(); + while (rs.next()) + { + chidId.add(rs.getInt("child_id") ); + } + return chidId; + } + catch(SQLException e) + { + e.printStackTrace(); + return null; + } + + } + + public static ArrayList getDisinctleafNodes(){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "SELECT distinct `child_id` FROM edges WHERE `child_id` NOT IN (SELECT `parent_id` FROM edges )"; + + + try + { + ps = connection.prepareStatement(query); + // ps.setInt(1, parentId); + + rs = ps.executeQuery(); + ArrayList leafId= new ArrayList(); + while (rs.next()) + { + leafId.add(rs.getInt("child_id") ); + } + return leafId; + } + catch(SQLException e) + { + e.printStackTrace(); + return null; + } + + } + +} diff --git 
a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Edges.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Edges.java new file mode 100644 index 0000000..8542932 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Edges.java @@ -0,0 +1,122 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * Date Author Changes Jun 29, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.categoryprocessor; + + +import java.io.FileWriter; +import java.io.IOException; +import java.util.*; + +/** + * TODO- describe the purpose of the class + * + */ +public class Edges +{ + + ArrayList leafNodes = new ArrayList(); + + private int parentId; + + private int childId; + + public int getChildId() + { + return this.childId; + } + + public int getParentId() + { + return this.parentId; + } + + public void setParentId( int parentId ) + { + this.parentId = parentId; + } + + public void setChildId( int childId ) + { + this.childId = childId; + } + + public void findProminetNodes( ) throws IOException + { + // input leaf nodelit as a file to enhance memoery useage + //all leaf nodes + + HashSet prominetNodeList= new HashSet(); + + + //get all leaf nodes + leafNodes=EdgeDB.getDisinctleafNodes(); + + + //creating a clode of leafnodes + ArrayList leafNodesClone = new ArrayList( leafNodes.size() ); + for ( Integer p : leafNodes ) + { + leafNodesClone.add( p ); + } + + + for ( int i = 0; i < leafNodes.size(); i++ ) + { + + //to check whether leaf becomes prominet node + boolean isLeafProminent=true; + + //To-Do here need to remove the leaf nodes added from the arry list + + //get parents of the selected leafnode(there could be one or more parents) + ArrayList parentId = EdgeDB.getParent( leafNodes.get( i ) ); + + for ( int j = 0; j < parentId.size(); j++ ) + { + //get the children of parent node and check all children are leaf 
nodes + ArrayList childnodes = EdgeDB.getChildren( parentId.get( j ) ); + + //boolean prominentNode = isProminent( childnodes ); + //check whether all children are leafs + if(isLeaf( childnodes )){ + + //duplicates automatically removed + prominetNodeList.add( parentId.get( j ) ); + isLeafProminent=false; + + } + } + + if(isLeafProminent){ + prominetNodeList.add( leafNodes.get( i ) ); + } + } + + + // FileWriter outFile4 = new FileWriter( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\program_outputs\\promiment_nodes.txt", true ); + //insert this in to the database + NodeDB.updateProminetNode(prominetNodeList ); + + + } + + private boolean isLeaf( ArrayList childnodes ) + { + boolean status = true; + for ( int k = 0; k < childnodes.size(); k++ ) + { + if ( !leafNodes.contains(childnodes.get( k ) ) ) + { + status = false; + break; + } + } + return status; + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Node.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Node.java new file mode 100644 index 0000000..595b8a5 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Node.java @@ -0,0 +1,133 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
/**
 * Value object for one category node, plus a static helper that derives the
 * leaf-node list from the parent and child name dumps.
 */
public class Node
{

    private int nodeId;

    private String categoryName;

    private boolean isProminent;

    private boolean isLeaf;

    // Score from inter-language links.
    private double scoreInterLangu;

    // Score from edit history.
    private double scoreEditHisto;

    public void setNodeId( int nodeId )
    {
        this.nodeId = nodeId;
    }

    public void setCategoryName( String catName )
    {
        this.categoryName = catName;
    }

    public void setIsProminent( boolean value )
    {
        this.isProminent = value;
    }

    public void setIsLeaf( boolean value )
    {
        this.isLeaf = value;
    }

    public void setScoreInterlangu( double score )
    {
        this.scoreInterLangu = score;
    }

    public void setScoreEditHisto( double score )
    {
        this.scoreEditHisto = score;
    }

    public int getNodeId()
    {
        return this.nodeId;
    }

    public String getCategoryName()
    {
        return this.categoryName;
    }

    public boolean getIsProminent()
    {
        return this.isProminent;
    }

    public boolean getIsLeaf()
    {
        return this.isLeaf;
    }

    public double getScoreInterlangu()
    {
        return this.scoreInterLangu;
    }

    public double getScoreEditHisto()
    {
        return this.scoreEditHisto;
    }

    /**
     * Appends to leaf_nodes.txt every category that appears as a child but
     * never as a parent.
     *
     * Fixes: uses HashSet instead of the HashMap&lt;line,line&gt; workaround the
     * original's own TODO flagged, and closes the output file via
     * try-with-resources (previously leaked on exception).
     *
     * @param parentFileScanner scanner over parents.txt, one name per line
     * @param childFileScanner  scanner over all_children.txt, one name per line
     * @throws IOException if the output file cannot be written
     */
    public static void sortChildren( Scanner parentFileScanner, Scanner childFileScanner ) throws IOException
    {
        HashSet<String> parents = new HashSet<String>();
        HashSet<String> children = new HashSet<String>();

        while ( parentFileScanner.hasNextLine() )
        {
            parents.add( parentFileScanner.nextLine() );
        }

        while ( childFileScanner.hasNextLine() )
        {
            children.add( childFileScanner.nextLine() );
        }

        // A category that is also a parent is not a leaf.
        children.removeAll( parents );

        try ( FileWriter outFile = new FileWriter( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\program_outputs\\leaf_nodes.txt", true ) )
        {
            // TODO (carried over): write these to the database instead.
            for ( String leaf : children )
            {
                outFile.append( leaf + "\n" );
            }
        }
    }
}
+ */ + + + +/** + * + * Date Author Changes + * Jun 29, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.categoryprocessor; + + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.HashSet; + + + +/** + * TODO- describe the purpose of the class + * + */ +public class NodeDB { + + public static void insertNode( int nodeID, String categoryName){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "INSERT IGNORE INTO node(node_id,category_name,is_leaf,is_prominent) VALUES (?,?,?,?)"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt( 1, nodeID); + ps.setString( 2, categoryName); + ps.setBoolean( 3, false); + ps.setBoolean( 4, false); + updateQuery = ps.executeUpdate(); + +// while (rs.next()) +// { +// } + + } + catch(SQLException e) + { + e.printStackTrace(); + // return null; + } + + } + + public static int getCategoryId(String cateName){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "select node_id,category_name from node where category_name=?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setString( 1, cateName); + + rs = ps.executeQuery(); + int nodeId=0; + while (rs.next()) + { + nodeId=rs.getInt("node_id"); + } + return nodeId; + } + catch(SQLException e) + { + e.printStackTrace(); + return 0; + } + + } + + + public static String getCategoryName(int categoryId){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "select category_name from node where node_id=?"; + + + try + { + ps = 
connection.prepareStatement(query); + ps.setInt( 1, categoryId); + + rs = ps.executeQuery(); + String nodeName = null; + while (rs.next()) + { + nodeName=rs.getString( "category_name"); + } + return nodeName; + } + catch(SQLException e) + { + e.printStackTrace(); + return null; + } + + } + + public static void updateNode(ArrayList categoryName){ + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "UPDATE node SET is_leaf=? WHERE category_name=?"; + + + try + { + for(int i=0; i prominentNodes){ + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "UPDATE node SET is_prominent=? WHERE node_id=?"; + + + try + { + for (Integer i : prominentNodes) { + ps = connection.prepareStatement(query); + ps.setBoolean( 1, true); + ps.setInt( 2, i ); + updateQuery = ps.executeUpdate(); + } +// while (rs.next()) +// { +// } + + } + catch(SQLException e) + { + e.printStackTrace(); + // return null; + } + + } + + public static ArrayList getCategoriesByHead( String head ) + { + ArrayList categoryList =new ArrayList(); + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "select node_id, category_name from node where head_of_name=?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setString( 1, head); + + rs = ps.executeQuery(); + + while (rs.next()) + { + categoryList.add( rs.getString( "category_name") ); + //nodeName=rs.getString( "category_name"); + } + connection.close(); + return categoryList; + } + catch(SQLException e) + { + e.printStackTrace(); + return null; + } + + + } + + static void updateProminetNode( Integer s ) + { + throw new UnsupportedOperationException( "Not yet 
/**
 * Plain value holder for one row of the Wikipedia "page" table:
 * id, title and namespace.
 */
public class Page
{

    private int pageId;          // page_id column

    private String pageName;     // page_title column

    private int pageNameSpace;   // page_namespace column

    /** @return the page id */
    public int getPageID()
    {
        return pageId;
    }

    /** @return the page namespace code */
    public int getPageNamespace()
    {
        return pageNameSpace;
    }

    /** @return the page title */
    public String getPageName()
    {
        return pageName;
    }

    /** Sets the page id. */
    public void setPageID( int pageID )
    {
        pageId = pageID;
    }

    /** Sets the page namespace code (note the misspelled legacy name). */
    public void setPageNameSapce( int pageNameSpace )
    {
        this.pageNameSpace = pageNameSpace;
    }

    /** Sets the page title. */
    public void setPageName( String pageName )
    {
        this.pageName = pageName;
    }
}
+ */ + + + +/** + * Date Author Changes + * Aug 3, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.categoryprocessor; + + +import java.io.*; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IntField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.NIOFSDirectory; +import org.apache.lucene.util.Version; + + + +/** + * TODO- describe the purpose of the class + * + */ +public class PageDB { + + public static boolean isArticlePage(int pageId){ + boolean state= false; + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + + + String lineArr[]; + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + + // System.out.println(line); + // System.out.println(temp); + String query = "SELECT page_namespace FROM `page` WHERE `page_id` = " + pageId ; + + + try + { + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + //ps.setString( 1, catTitle ); + rs = ps.executeQuery(); + int count = 0; + + while( rs.next() ) + { + if(rs.getInt( "page_namespace" )== 0 ){ + state=true; + break; + } + } + + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + + + return state; + } + + public static int getPageId(String catPageTitle){ + int resultId = 0; + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + + + String lineArr[]; + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + + // 
System.out.println(line); + // System.out.println(temp); + + String query = "SELECT page_id FROM `category_only_page` WHERE `page_title` = '" + catPageTitle+"'" ; + + + try + { + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + //ps.setString( 1, catTitle ); + rs = ps.executeQuery(); + int count = 0; + + while( rs.next() ) + { + resultId= rs.getInt("page_id"); + } + + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + + + return resultId; + } + public static HashMap getAllPages() throws IOException{ + int resultId = 0; + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + HashMap pagesMap= new HashMap(); + + String lineArr[]; + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + + // System.out.println(line); + // System.out.println(temp); + + String query = "SELECT page_id, page_title FROM `category_only_page`" ; + + + try + { + + + String pathToIndex = "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\index\\categoty_page_candidate_index"; + int noOfDocs = 0; + + IndexWriter iW; + + NIOFSDirectory dir = new NIOFSDirectory( new File( pathToIndex ) ); + //dir = new RAMDirectory() ; + iW = new IndexWriter( dir, new IndexWriterConfig( Version.LUCENE_43, new WhitespaceAnalyzer( Version.LUCENE_43 ) ) ); + + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + //ps.setString( 1, catTitle ); + rs = ps.executeQuery(); + int count = 0; + + while( rs.next() ) + { + + + + Document doc = new Document(); + + + + + doc.add( new TextField( "page_title", rs.getString( "page_title" ), Field.Store.YES ) ); + doc.add( new IntField( "page_id", rs.getInt("page_id"), Field.Store.YES ) ); + + iW.addDocument( doc ); + + + + + // pagesMap.put( rs.getString( "page_title" ), rs.getInt("page_id") ); + // System.out.println(pagesMap.size()); + // resultId= rs.getInt("page_id"); + } + iW.close(); + 
dir.close(); + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + + + return pagesMap; + } + + + public static void insertCategoryPage( String data){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "INSERT IGNORE INTO category_only_page(page_id,page_namespace,page_title,page_restrictions, page_counter,page_is_redirect, page_is_new, page_random, page_touched,page_latest,page_len) VALUES ("+data+")"; + + + + + try + { + ps = connection.prepareStatement(query); + + updateQuery = ps.executeUpdate(); + +// while (rs.next()) +// { +// } + + } + catch(SQLException e) + { + + System.out.println(data); + // e.printStackTrace(); + // return null; + } + + } + + public static Page getPagebyID( int pageId ) + { + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + Page page= new Page(); + + // System.out.println(line); + // System.out.println(temp); + + String query = "SELECT page_id,page_namespace,page_title FROM page WHERE page_id = ?" 
; + + + try + { + ps = connection.prepareStatement( query ); + ps.setInt( 1, pageId ); + //ps.setString( 1, catTitle ); + rs = ps.executeQuery(); + + while( rs.next() ) + { + + page.setPageID(rs.getInt("page_id")); + page.setPageNameSapce( rs.getInt("page_namespace")); + page.setPageName( rs.getString("page_title") ); + + } + + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + + + return page; + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/languageLinksDB.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/languageLinksDB.java new file mode 100644 index 0000000..d339297 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/languageLinksDB.java @@ -0,0 +1,64 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + + + +/** + * + * Date Author Changes + * Aug 31, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.categoryprocessor; + + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; + + + +/** + * Communications with the languagelinks table + * + */ +public class languageLinksDB { + + public static int getLanguageLinksCount(int pageId){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + // int updateQuery = 0; + + String query = "select count(*) from langlinks where ll_from=?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt( 1, pageId); + + rs = ps.executeQuery(); + int nodeId=0; + while (rs.next()) + { + nodeId=rs.getInt(1); + } + return nodeId; + } + catch(SQLException e) + { + e.printStackTrace(); + return 0; + } + + } + + + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/extractdata/DataExtractor.java 
/**
 * Splits the INSERT statements of a Wikipedia SQL dump into one tuple per
 * output line.
 */
public class DataExtractor {

    /**
     * Reads the configured dump file and appends each value tuple to the
     * output file.
     *
     * Data lines start at different offsets per dump file:
     * page #49, categorylinks #43, category #42, langlinks #39 —
     * adjust the threshold in the loop accordingly.
     *
     * Fixes: the output writer is opened once instead of once per dump line,
     * and both streams are closed via try-with-resources (previously leaked).
     *
     * @throws FileNotFoundException if the dump file is missing
     * @throws IOException           if reading or writing fails
     */
    public static void main( String[] args ) throws FileNotFoundException, IOException {
        File categoryLinksDumpFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\Wiki_Category_SQL_tables\\enwiki-20130604-langlinks.sql" );
        File outCategoryLinksDumpFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-langlinks_typles.txt" );

        try (BufferedReader fileReader = new BufferedReader( new FileReader( categoryLinksDumpFile ) );
             FileWriter outFile = new FileWriter( outCategoryLinksDumpFile, true )) {
            String line;
            int count = 0;
            while ( ( line = fileReader.readLine() ) != null ) {
                // Skip the SQL header; data lines start at #39 for langlinks.
                if ( count >= 39 ) {
                    // Tuples inside an INSERT statement are separated by "),(".
                    String[] strArr = line.split( "\\)\\,\\(" );
                    for ( int i = 0; i < strArr.length; i++ ) {
                        if ( i == 0 ) {
                            // The first tuple still carries "INSERT ... (" — strip it.
                            String[] strArr2 = strArr[0].split( "\\(", 2 );
                            outFile.append( strArr2[1] + "\n" );
                        } else {
                            outFile.append( strArr[i] + "\n" );
                        }
                    }
                }
                count++;
            }
        }
    }
}
new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_dir\\category_page_links_view\\page_id_cl_to.txt" ); + Index.indexCategoryPageLinksView( pathToIndex, tuplesFile ); + + + + + + //Index.indexCategory( pathToIndex, categoryTuplesFile ); + +/* + + String pathToIndex1 ="C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index1"; + String pathToIndex2= "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index2"; + String pathToIndex3 = "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index3"; + String pathToIndex4 ="C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index4"; + String pathToIndex5= "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index5"; + String pathToIndex6 = "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index6"; + + //page tuples + File pageTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-page_typles.txt" );File pageTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-page_typles.txt" ); + + + + // File ctLinksTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-categorylinks_typles.txt" ); + + + // Index.indexPage(pathToIndex1,pathToIndex2,pathToIndex3,pathToIndex4,pathToIndex5,pathToIndex6,pageTuplesFile); +//Index.indexPage2(pageTuplesFile); + + // Index.indexCategoryLinks( pathToIndex, ctLinksTuplesFile ); + + + // FileWriter outFile = new FileWriter("C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\pages_page_namespace_0.txt",true); + + /* + * String page_q="0"; String page_field="page_namespace"; Search.searchPage( new File(pathToIndex),page_q, + * page_field,25000000); + */ + + + + // String cateLinksField = 
"cl_from"; + + + // Search.searchCategoryLinks( new File( pathToIndex ), cateLinksField, 200 ); + + String cateLinksField = "cat_title"; + //Search.searchCategory( new File( pathToIndex ), cateLinksField, 2 ); + + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/Mqlread.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/Mqlread.java new file mode 100644 index 0000000..1c3ffe6 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/Mqlread.java @@ -0,0 +1,74 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + + + +/** + * Date Author Changes + * Sep 16, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.freebasequery; + + + +/** + * TODO- describe the purpose of the class + * + */ +import com.google.api.client.http.GenericUrl; +import com.google.api.client.http.HttpRequest; +import com.google.api.client.http.HttpRequestFactory; +import com.google.api.client.http.HttpResponse; +import com.google.api.client.http.HttpTransport; +import com.google.api.client.http.javanet.NetHttpTransport; +import com.jayway.jsonpath.JsonPath; +import java.io.FileInputStream; +import java.util.Properties; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; + +public class Mqlread { + public static Properties properties = new Properties(); + public static void main(String[] args) { + String curcer=curcerQuery(""); + while(curcer!="FALSE"){ + curcer=curcerQuery(curcer); + } + } + + private static String curcerQuery(String curcer){ + + String newCurcer = null; + try { + // properties.load(new FileInputStream("freebase.properties")); + HttpTransport httpTransport = new NetHttpTransport(); + HttpRequestFactory requestFactory = httpTransport.createRequestFactory(); + JSONParser parser = new JSONParser(); + String query = 
"[{\"id\":null,\"name\":null,\"type\":\"/people/person\",\"limit\":100}]"; + GenericUrl url = new GenericUrl("https://www.googleapis.com/freebase/v1/mqlread"); + url.put("query", query); + // url.put("key", properties.get("API_KEY")); + url.put("key","AIzaSyDcHfGTZlVm0KE4KKK9JAM61KBDaXtPiJc"); + url.put("cursor", curcer); + HttpRequest request = requestFactory.buildGetRequest(url); + HttpResponse httpResponse = request.execute(); + JSONObject response = (JSONObject)parser.parse(httpResponse.parseAsString()); + JSONArray results = (JSONArray)response.get("result"); + newCurcer=(String)response.get("cursor"); + + for (Object result : results) { + System.out.println(JsonPath.read(result,"$.name").toString()); + // System.out.println( newCurcer); + } + } catch (Exception ex) { + ex.printStackTrace(); + } + + return newCurcer; + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/QueryFB.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/QueryFB.java new file mode 100644 index 0000000..54b142d --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/QueryFB.java @@ -0,0 +1,71 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + + + +/** + * + * Date Author Changes + * Sep 16, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.freebasequery; + +import com.google.api.client.http.GenericUrl; +import com.google.api.client.http.HttpRequest; +import com.google.api.client.http.HttpRequestFactory; +import com.google.api.client.http.HttpResponse; +import com.google.api.client.http.HttpTransport; +import com.google.api.client.http.javanet.NetHttpTransport; +import com.jayway.jsonpath.JsonPath; +import java.io.FileInputStream; +import java.util.Properties; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; + +/** + * TODO- describe the purpose of the class + * + */ +public class QueryFB { + + public static Properties properties = new Properties(); + public static void main(String[] args) { + int count = 0; + try { + // properties.load(new FileInputStream("freebase.properties")); + HttpTransport httpTransport = new NetHttpTransport(); + HttpRequestFactory requestFactory = httpTransport.createRequestFactory(); + JSONParser parser = new JSONParser(); + GenericUrl url = new GenericUrl("https://www.googleapis.com/freebase/v1/search"); + // url.put("query", "Cee Lo Green"); + //url.put("filter", "(all type:/music/artist created:\"The Lady Killer\")"); + // url.put("filter", "(all type:/people/person)"); + // url.put("filter", "(all type:/location/location)"); + url.put("filter", "(all type:/organization/organization)"); + + + url.put("cursor", "0"); + url.put("limit", "160"); + url.put("indent", "true"); + // url.put("key", properties.get("API_KEY")); + url.put("key","AIzaSyDcHfGTZlVm0KE4KKK9JAM61KBDaXtPiJc"); + HttpRequest request = requestFactory.buildGetRequest(url); + HttpResponse httpResponse = request.execute(); + JSONObject response = (JSONObject)parser.parse(httpResponse.parseAsString()); + JSONArray results = (JSONArray)response.get("result"); + for (Object result : results) { + count++; + 
System.out.println(JsonPath.read(result,"$.name").toString()); + } + System.out.println("total: "+ count); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/indexer/Index.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/indexer/Index.java new file mode 100644 index 0000000..96d0f2f --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/indexer/Index.java @@ -0,0 +1,415 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * + * Date Author Changes Jul 17, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.indexer; + + +import java.io.*; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +//import org.apache.lucene.analysis.; +import org.apache.lucene.document.*; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.NIOFSDirectory; +import org.apache.lucene.util.Version; + +/** + * TODO- describe the purpose of the class + * + */ +public class Index +{ + + public static void indexPage( String pathToIndex, File pageTuplesFile ) throws IOException + { + int noOfDocs = 0; + + IndexWriter iW; + + try + { + // NIOFSDirectory dir = new NIOFSDirectory( new File( pathToIndex ) ); + //dir = new RAMDirectory() ; + // iW = new IndexWriter( dir, new IndexWriterConfig( Version.LUCENE_43, new WhitespaceAnalyzer( Version.LUCENE_43 ) ) ); + + + + + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( pageTuplesFile ) ); + int count = 0; + String line; + FileWriter outFile; + while ( ( line = fileReader.readLine() ) != null ) + { + + String[] strArr = line.split( "\\," ); + if ( strArr.length >= 3 ) + { +// StringReader page_id = new StringReader( strArr[0] ); +// StringReader 
page_namespace = new StringReader( strArr[1] ); +// StringReader page_title = new StringReader( strArr[2] ); + //System.out.println(strArr[0]+strArr[1]+strArr[2]); + + if ( strArr[1].trim().equals( "0" ) ) // FIX: == on Strings is reference comparison and was always false, so nothing was ever written; NOTE(review): output file name says namespace_14 but the check is "0" — confirm intended namespace + { + outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\pages_page_namespace_14_new.txt", true ); + + + outFile.append( strArr[0] + "\t" + strArr[1] + "\t" + strArr[2] + "\n" ); + //System.out.println((i + 1) + ". " + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + outFile.close(); + } + /* + * Document doc = new Document(); + * + * doc.add( new TextField( "page_id", strArr[0], Field.Store.YES ) ); doc.add( new TextField( + * "page_namespace", strArr[1], Field.Store.YES ) ); doc.add( new TextField( "page_title", + * strArr[2], Field.Store.YES ) ); + * + * + * iW.addDocument( doc ); + */ + + } else + { + System.out.println( line + "\n" ); + } + + count++; + } + + + // iW.close(); + // dir.close(); + + } catch ( CorruptIndexException e ) + { + e.printStackTrace(); + } catch ( IOException e ) + { + e.printStackTrace(); + } + } + + public static void readPageTable( File pageTuplesFile ) throws IOException + { + int noOfDocs = 0; + + IndexWriter iW; + + try + { + + + + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( pageTuplesFile ) ); + int count = 0; + String line; + FileWriter outFile; + while ( ( line = fileReader.readLine() ) != null ) + { + + String[] strArr = line.split( "\\," ); + if ( strArr.length >= 3 ) + { + + + if ( strArr[1].trim().equals( "0" ) ) + { + outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\pages_page_namespace_0_new.txt", true ); + + + outFile.append( strArr[0] + "\t" + strArr[1] + "\t" + strArr[2] + "\n" ); + //System.out.println((i + 1) + ". 
" + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + outFile.close(); + } + + + } else + { + System.out.println( line + "\n" ); + } + + count++; + } + + + + } catch ( CorruptIndexException e ) + { + e.printStackTrace(); + } catch ( IOException e ) + { + e.printStackTrace(); + } + } + + public static void indexCategoryLinks( String pathToIndex, File tuplesFile ) throws IOException + { + //String pathToIndex = "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\index\\page_index"; + int noOfDocs = 0; + + IndexWriter iW; + try + { + NIOFSDirectory dir = new NIOFSDirectory( new File( pathToIndex ) ); + //dir = new RAMDirectory() ; + iW = new IndexWriter( dir, new IndexWriterConfig( Version.LUCENE_43, new WhitespaceAnalyzer( Version.LUCENE_43 ) ) ); + + + // File tuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-langlinks_typles.txt" ); + + + + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( tuplesFile ) ); + int count = 0; + String line; + + while ( ( line = fileReader.readLine() ) != null ) + { + + String[] strArr = line.split( "\\," ); + //`cl_from` ,`cl_to`,`cl_sortkey`,`cl_timestamp`,`cl_sortkey_prefix`,`cl_collation`,`cl_type` enum('page','subcat','file') NOT NULL DEFAULT 'page', + + if ( strArr.length >= 7 ) + { + + Document doc = new Document(); + + + + + doc.add( new TextField( "cl_from", strArr[0], Field.Store.YES ) ); + doc.add( new TextField( "cl_to", strArr[1], Field.Store.YES ) ); + doc.add( new TextField( "cl_sortkey", strArr[2], Field.Store.YES ) ); // FIX: field name had a leading space (" cl_sortkey") so Search's d.get("cl_sortkey") always returned null + + doc.add( new TextField( "cl_type", strArr[6], Field.Store.YES ) ); + iW.addDocument( doc ); + } else + { + System.out.println( line + "\n" ); + } + } + + + iW.close(); + dir.close(); + } catch ( CorruptIndexException e ) + { + e.printStackTrace(); + } catch ( IOException e ) + { + e.printStackTrace(); + } + } + + public static void indexCategory( String 
pathToIndex, File tuplesFile ) throws IOException + { + //String pathToIndex = "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index"; + int noOfDocs = 0; + + IndexWriter iW; + try + { + NIOFSDirectory dir = new NIOFSDirectory( new File( pathToIndex ) ); + //dir = new RAMDirectory() ; + iW = new IndexWriter( dir, new IndexWriterConfig( Version.LUCENE_43, new WhitespaceAnalyzer( Version.LUCENE_43 ) ) ); + + + //File pageTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-category_typles.txt" ); + + + + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( tuplesFile ) ); + int count = 0; + String line; + + while ( ( line = fileReader.readLine() ) != null ) + { + + String[] strArr = line.split( "\\," ); + //Data in following order`cat_id`,`cat_title`,`cat_pages`,`cat_subcats` + //we need 0,1,2,3 elements of the string + if ( strArr.length >= 2) + { + + // System.out.println(strArr[0]+"####"+strArr[1]+"####"+strArr[2]+"#####"+strArr[3]+"###"+strArr[4]); + Document doc = new Document(); + + + + + doc.add( new TextField( "cat_id", strArr[0], Field.Store.YES ) ); + doc.add( new TextField( "cat_title", strArr[1], Field.Store.YES ) ); + // doc.add( new IntField( "cat_pages", Integer.parseInt( strArr[2].trim() ), Field.Store.YES ) ); + // doc.add( new IntField( "cat_subcats", Integer.parseInt( strArr[3].trim() ), Field.Store.YES ) ); + // doc.add( new IntField( "cat_files", Integer.parseInt( strArr[4].trim() ), Field.Store.YES ) ); + // doc.add( new TextField( "cat_hidden", strArr[5].substring( 0,1), Field.Store.YES ) ); + + + + iW.addDocument( doc ); + } else + { + System.out.println( line + "\n" ); + } + } + + + iW.close(); + dir.close(); + } catch ( CorruptIndexException e ) + { + e.printStackTrace(); + } catch ( IOException e ) + { + e.printStackTrace(); + } + } + + + public static void indexCategoryPageLinksView( String pathToIndex, File tuplesFile ) 
throws IOException + { + //String pathToIndex = "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\language_links"; + int noOfDocs = 0; + + IndexWriter iW; + try + { + NIOFSDirectory dir = new NIOFSDirectory( new File( pathToIndex ) ); + //dir = new RAMDirectory() ; + iW = new IndexWriter( dir, new IndexWriterConfig( Version.LUCENE_43, new WhitespaceAnalyzer( Version.LUCENE_43 ) ) ); + + + //File pageTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-category_typles.txt" ); + + + + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( tuplesFile ) ); + int count = 0; + String line; + + while ( ( line = fileReader.readLine() ) != null ) + { + + String[] strArr = line.split( "\\t",2 ); + //Data in following order`cat_id`,`cat_title`,`cat_pages`,`cat_subcats` + //we need 0,1,2,3 elements of the string + if ( strArr.length >= 2) + { + + // System.out.println(strArr[0]+"####"+strArr[1]+"####"+strArr[2]+"#####"+strArr[3]+"###"+strArr[4]); + Document doc = new Document(); + + + + + doc.add( new TextField( "page_id", strArr[0].trim(), Field.Store.YES ) ); + doc.add( new TextField( "page_title", strArr[1], Field.Store.YES ) ); + + // doc.add( new IntField( "cat_subcats", Integer.parseInt( strArr[3].trim() ), Field.Store.YES ) ); + // doc.add( new IntField( "cat_files", Integer.parseInt( strArr[4].trim() ), Field.Store.YES ) ); + // doc.add( new TextField( "cat_hidden", strArr[5].substring( 0,1), Field.Store.YES ) ); + + + + iW.addDocument( doc ); + } else + { + System.out.println( line + "\n" ); + } + } + + + iW.close(); + dir.close(); + } catch ( CorruptIndexException e ) + { + e.printStackTrace(); + } catch ( IOException e ) + { + e.printStackTrace(); + } + } + + + + public static void indexInterLanguageLinks( String pathToIndex, File tuplesFile ) throws IOException + { + //String pathToIndex = 
"C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\language_links"; + int noOfDocs = 0; + + IndexWriter iW; + try + { + NIOFSDirectory dir = new NIOFSDirectory( new File( pathToIndex ) ); + //dir = new RAMDirectory() ; + iW = new IndexWriter( dir, new IndexWriterConfig( Version.LUCENE_43, new WhitespaceAnalyzer( Version.LUCENE_43 ) ) ); + + + //File pageTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-category_typles.txt" ); + + + + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( tuplesFile ) ); + int count = 0; + String line; + + while ( ( line = fileReader.readLine() ) != null ) + { + + String[] strArr = line.split( "\\,",3 ); + //Data in following order`cat_id`,`cat_title`,`cat_pages`,`cat_subcats` + //we need 0,1,2,3 elements of the string + if ( strArr.length >= 3) + { + + // System.out.println(strArr[0]+"####"+strArr[1]+"####"+strArr[2]+"#####"+strArr[3]+"###"+strArr[4]); + Document doc = new Document(); + + + + + doc.add( new TextField( "ll_from", strArr[0].trim(), Field.Store.YES ) ); + doc.add( new TextField( "ll_lang", strArr[1], Field.Store.YES ) ); + doc.add( new TextField( "ll_title", strArr[2] , Field.Store.YES ) ); + // doc.add( new IntField( "cat_subcats", Integer.parseInt( strArr[3].trim() ), Field.Store.YES ) ); + // doc.add( new IntField( "cat_files", Integer.parseInt( strArr[4].trim() ), Field.Store.YES ) ); + // doc.add( new TextField( "cat_hidden", strArr[5].substring( 0,1), Field.Store.YES ) ); + + + + iW.addDocument( doc ); + } else + { + System.out.println( line + "\n" ); + } + } + + + iW.close(); + dir.close(); + } catch ( CorruptIndexException e ) + { + e.printStackTrace(); + } catch ( IOException e ) + { + e.printStackTrace(); + } + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/rdf/RdfGenarator.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/rdf/RdfGenarator.java new 
file mode 100644 index 0000000..cc56658 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/rdf/RdfGenarator.java @@ -0,0 +1,143 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * + * Date Author Changes Sep 17, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.rdf; + + +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.dbpedia.kasun.categoryprocessor.CategoryLinksDB; +import org.dbpedia.kasun.categoryprocessor.NodeDB; +import org.dbpedia.kasun.categoryprocessor.Page; +import org.dbpedia.kasun.categoryprocessor.PageDB; + +/** + * TODO- describe the purpose of the class + * + */ +public class RdfGenarator +{ + + private static String promintNodeName; + + public static void getCategoriesForHead( String head ) + { + + ArrayList categoriesForHead = NodeDB.getCategoriesByHead( head ); + +for(int j=0; j clFromPageID = CategoryLinksDB.getPagesLinkedByCatName( catName ); + FileWriter outfile; + + for ( int i = 0; i < clFromPageID.size(); i++ ) + { + + try + { + Page page = PageDB.getPagebyID( clFromPageID.get( i ) ); + if ( page.getPageNamespace() == 0 ) + { + //namespace==0 means it's a article page + outfile = new FileWriter( "/home/kasun/rdfresult/rdfoutput.txt", true ); + outfile.append( "<" + page.getPageName() + "> rdf:type <" + promintNodeName + "> \n" ); + outfile.close(); + } else + { + if ( page.getPageNamespace() == 14 ) + { + + //namespace==14 means it's a categorypage recurcive the categorypage + //recursion causes segmentation error go for only fist child + // getPagesForCategory( page.getPageName() ); + getPagesForCategoryFirstChild( page.getPageName() ); + } + } + } catch ( IOException ex ) + { + FileWriter errorfile; + try + { + errorfile = new FileWriter( "/home/kasun/rdfresult/error.txt", true ); + 
errorfile.append( ex.getMessage()+"\n" ); + errorfile.close(); + } catch ( IOException ex1 ) + { + Logger.getLogger( RdfGenarator.class.getName() ).log( Level.SEVERE, null, ex1 ); + } + + } + + } + + clFromPageID.clear(); + } + + public static void getPagesForCategoryFirstChild( String catName ) + { + ArrayList clFromPageID = CategoryLinksDB.getPagesLinkedByCatName( catName ); + FileWriter outfile; + + for ( int i = 0; i < clFromPageID.size(); i++ ) + { + + try + { + Page page = PageDB.getPagebyID( clFromPageID.get( i ) ); + if ( page.getPageNamespace() == 0 ) + { + //namespace==0 means it's a article page + outfile = new FileWriter( "/home/kasun/rdfresult/rdfoutput.txt", true ); + outfile.append( "<" + page.getPageName() + "> rdf:type <" + promintNodeName + "> \n" ); + outfile.close(); + } + /* + else + { + if ( page.getPageNamespace() == 14 ) + { + + //namespace==14 means it's a categorypage recurcive the categorypage + getPagesForCategory( page.getPageName() ); + } + } + * + */ + } catch ( IOException ex ) + { + FileWriter errorfile; + try + { + errorfile = new FileWriter( "/home/kasun/rdfresult/error.txt", true ); + errorfile.append( ex.getMessage()+"\n" ); + errorfile.close(); + } catch ( IOException ex1 ) + { + Logger.getLogger( RdfGenarator.class.getName() ).log( Level.SEVERE, null, ex1 ); + } + + } + + } + + clFromPageID.clear(); + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/searcher/Search.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/searcher/Search.java new file mode 100644 index 0000000..044b9d8 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/searcher/Search.java @@ -0,0 +1,267 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ +/** + * + * Date Author Changes Jul 17, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.searcher; + + +import java.io.*; +import java.util.ArrayList; +import java.util.Date; +import java.util.LinkedList; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopScoreDocCollector; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.NIOFSDirectory; +import org.apache.lucene.util.Version; +import org.apache.lucene.queryparser.classic.ParseException; + +/** + * TODO- describe the purpose of the class + * + */ +public class Search +{ + + public static void searchPage( File indexDir, String q, String filed, int hitsPerPage ) + throws Exception + { + + FileWriter outFile; + //= new FileWriter("C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\pages_page_namespace_0.txt",true); + + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer( Version.LUCENE_43 ); + NIOFSDirectory dir = new NIOFSDirectory( indexDir ); + Query query = new QueryParser( Version.LUCENE_43, filed, analyzer ).parse( q ); + + + + IndexReader reader = IndexReader.open( dir ); + IndexSearcher searcher = new IndexSearcher( reader ); + TopScoreDocCollector collector = TopScoreDocCollector.create( hitsPerPage, true ); + searcher.search( query, collector ); + ScoreDoc[] hits = collector.topDocs().scoreDocs; + + + System.out.println( "Found " + hits.length + " hits." 
); + + + for ( int i = 0; i < hits.length; ++i ) + { + outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\pages_page_namespace_0.txt", true ); + + int docId = hits[i].doc; + Document d = searcher.doc( docId ); + outFile.append( d.get( "page_id" ) + "\t" + d.get( "page_namespace" ) + "\t" + d.get( "page_title" ) + "\n" ); + //System.out.println((i + 1) + ". " + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + outFile.close(); + } + + } + + public static void searchCategoryLinks( File indexDir, String filed, int hitsPerPage ) + throws Exception + { + + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer( Version.LUCENE_43 ); + NIOFSDirectory dir = new NIOFSDirectory( indexDir ); + IndexReader reader = IndexReader.open( dir ); + IndexSearcher searcher = new IndexSearcher( reader ); + + + + + File pageNamespaceResultFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\pages_page_namespace_0_new.txt" ); + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( pageNamespaceResultFile ) ); + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + String[] strArr = line.split( "\\t" ); + FileWriter outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\categorylinks_match_article_pages.txt", true ); + + + TopScoreDocCollector collector = TopScoreDocCollector.create( hitsPerPage, true ); + Query query = new QueryParser( Version.LUCENE_43, filed, analyzer ).parse( strArr[0].trim() ); + searcher.search( query, collector ); + ScoreDoc[] hits = collector.topDocs().scoreDocs; + + + System.out.println( strArr[0] + "\t" + hits.length ); + + + for ( int i = 0; i < hits.length; ++i ) + { + int docId = hits[i].doc; + Document d = searcher.doc( docId ); + outFile.append( d.get( "cl_from" ) + "\t" + d.get( "cl_to" ) + "\t" + d.get( "cl_sortkey" ) 
+ d.get( "cl_type" ) + "\n" ); + //System.out.println((i + 1) + ". " + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + } + outFile.close(); + } + } + + + } + + public static void searchCategory( File indexDir, String filed, int hitsPerPage ) + throws Exception + { + + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer( Version.LUCENE_43 ); + NIOFSDirectory dir = new NIOFSDirectory( indexDir ); + IndexReader reader = IndexReader.open( dir ); + IndexSearcher searcher = new IndexSearcher( reader ); + + + + + File uniqueCatFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\sorted_f2_categorylinks_match_article_pages.txt" ); + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( uniqueCatFile ) ); + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + // String[] strArr = line.split( "\\t" ); + FileWriter outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\categories_match_article_pages.txt", true ); + + + TopScoreDocCollector collector = TopScoreDocCollector.create( hitsPerPage, true ); + Query query = new QueryParser( Version.LUCENE_43, filed, analyzer ).parse( "'" + line.trim() + "'" ); + searcher.search( query, collector ); + ScoreDoc[] hits = collector.topDocs().scoreDocs; + + if ( hits.length == 0 ) + { + System.out.println( line ); + } + + for ( int i = 0; i < hits.length; ++i ) + { + int docId = hits[i].doc; + Document d = searcher.doc( docId ); + outFile.append( d.get( "cat_id" ) + "\t" + d.get( "cat_title" ) + "\n" ); + //System.out.println((i + 1) + ". 
" + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + } + outFile.close(); + } + } + + + } + + public static ArrayList SearchCatPageLinks( int pageID ) throws IOException, ParseException + { + + File indexDir = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\index\\category_page_links_view" ); + String filed = "page_id"; + int hitsPerPage = 100; + + ArrayList clToResults = new ArrayList(); + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer( Version.LUCENE_43 ); + NIOFSDirectory dir = new NIOFSDirectory( indexDir ); + IndexReader reader = IndexReader.open( dir ); + IndexSearcher searcher = new IndexSearcher( reader ); + + + TopScoreDocCollector collector = TopScoreDocCollector.create( hitsPerPage, true ); + Query query = new QueryParser( Version.LUCENE_43, filed, analyzer ).parse( "" + pageID + "" ); + searcher.search( query, collector ); + ScoreDoc[] hits = collector.topDocs().scoreDocs; + + if ( hits.length == 0 ) + { + System.out.println( pageID ); + } else + { + // System.out.println( hits.length ); + for ( int i = 0; i < hits.length; ++i ) + { + int docId = hits[i].doc; + Document d = searcher.doc( docId ); + clToResults.add( d.get( "page_title" ) ); + // outFile.append( d.get( "cat_id" ) + "\t" + d.get( "cat_title" ) + "\n" ); + //System.out.println((i + 1) + ". 
" + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + } + } + reader.close(); + dir.close(); + return clToResults; + } + + public static LinkedList SearchCategoryPages( String pageTitle ) throws IOException, ParseException + { + + File indexDir = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\index\\categoty_page_candidate_index" ); + String filed = "page_title"; + int hitsPerPage = 5; + + + + LinkedList clToResults = new LinkedList(); + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer( Version.LUCENE_43 ); + NIOFSDirectory dir = new NIOFSDirectory( indexDir ); + IndexReader reader = IndexReader.open( dir ); + IndexSearcher searcher = new IndexSearcher( reader ); + FileWriter outFile2; + try{ + + TopScoreDocCollector collector = TopScoreDocCollector.create( hitsPerPage, true ); + Query query = new QueryParser( Version.LUCENE_43, filed, analyzer ).parse( pageTitle ); + searcher.search( query, collector ); + ScoreDoc[] hits = collector.topDocs().scoreDocs; + + if ( hits.length == 0 ) + { + System.out.println( pageTitle ); + } else + { + // System.out.println( hits.length ); + for ( int i = 0; i < hits.length; ++i ) + { + int docId = hits[i].doc; + Document d = searcher.doc( docId ); + clToResults.add( Integer.valueOf( d.get( "page_id" )) ); + // outFile.append( d.get( "cat_id" ) + "\t" + d.get( "cat_title" ) + "\n" ); + //System.out.println((i + 1) + ". 
" + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + } + } + + return clToResults; + } + catch (ParseException e){ + outFile2 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\data_not_inserted_node_table\\pages_can't_parse.txt", true ); + outFile2.append(pageTitle + "\n" ); + outFile2.close(); + + // System.out.println("Can't parse"+ pageTitle); + + return clToResults; + } + finally{ + + reader.close(); + dir.close(); + } + + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/ReadXMLFile.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/ReadXMLFile.java new file mode 100644 index 0000000..786b258 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/ReadXMLFile.java @@ -0,0 +1,134 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * + * Date Author Changes Sep 10, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.wikiquery; + + +/** + * TODO- describe the purpose of the class + * + */ +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.DocumentBuilder; +import org.w3c.dom.Document; +import org.w3c.dom.NodeList; +import org.w3c.dom.Node; +import org.w3c.dom.Element; +import java.io.File; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; + +public class ReadXMLFile +{ + + public static void ReadFile( String filename ) + { + //public static void ReadFile(File fXmlFile) { + try + { + + File fXmlFile = new File( filename ); + DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); + Document doc = dBuilder.parse( fXmlFile ); + + //optional, but recommended + //read this - http://stackoverflow.com/questions/13786607/normalization-in-dom-parsing-with-java-how-does-it-work + 
doc.getDocumentElement().normalize(); + + System.out.println( "Root element :" + doc.getDocumentElement().getNodeName() ); + + NodeList nList = doc.getElementsByTagName( "rev" ); + + System.out.println( "----------------------------" ); + + for ( int temp = 0; temp < nList.getLength(); temp++ ) + { + + Node nNode = nList.item( temp ); + + System.out.println( "Current Element :" + nNode.getNodeName() ); + + if ( nNode.getNodeType() == Node.ELEMENT_NODE ) + { + + + Element eElement = (Element) nNode; + + System.out.println( "Revision22222 id : " + eElement.getAttribute( "revid" ) ); +// System.out.println("First Name : " + eElement.getElementsByTagName("firstname").item(0).getTextContent()); +// System.out.println("Last Name : " + eElement.getElementsByTagName("lastname").item(0).getTextContent()); +// System.out.println("Nick Name : " + eElement.getElementsByTagName("nickname").item(0).getTextContent()); +// System.out.println("Salary : " + eElement.getElementsByTagName("salary").item(0).getTextContent()); + + } + } + } catch ( Exception e ) + { + e.printStackTrace(); + } + } + + public static int ReadFile( Document doc ,String urlParameters, String url) throws UnsupportedEncodingException + { + + int numberOfRevisions=0; + //public static void ReadFile(File fXmlFile) { + try + { + doc.getDocumentElement().normalize(); + + // System.out.println( "Root element :" + doc.getDocumentElement().getNodeName() ); + + NodeList continueNodeList = doc.getElementsByTagName( "revisions" ); + if ( continueNodeList.getLength() > 0 ) + { + Node continueNode = continueNodeList.item( 0 ); + + Element continueElement = (Element) continueNode; + // String urlParameters = "fName=" + URLEncoder.encode( "???", "UTF-8" ) + "&lName=" + URLEncoder.encode( "???", "UTF-8" ); + // String url = "http://en.wikipedia.org/w/api.php?action=query&format=xml&prop=revisions&titles=Mother&rvlimit=max&rvstart=20130604000000&rvcontinue="+continueElement.getAttribute( "rvcontinue" ); + + // 
System.out.println("Calling recursive function using rivision Id "+ continueElement.getAttribute( "rvcontinue" )); + numberOfRevisions=ReadFile(RevisionHistory.excutePost( url+ "&rvcontinue="+continueElement.getAttribute( "rvcontinue" ), urlParameters ),urlParameters, url ); + + // System.out.println( "Continue revision Id : " + continueElement.getAttribute( "rvcontinue" ) ); + } + + NodeList nList = doc.getElementsByTagName( "rev" ); + + // System.out.println( "number of nodes" + nList.getLength()); +/* + for ( int temp = 0; temp < nList.getLength(); temp++ ) + { + + Node nNode = nList.item( temp ); + + // System.out.println( "\nCurrent Element :" + nNode.getNodeName() + " count: " + temp ); + + if ( nNode.getNodeType() == Node.ELEMENT_NODE ) + { + + Element eElement = (Element) nNode; + + System.out.println( "Revision id : " + eElement.getAttribute( "revid" ) ); + + } + } + */ + + return numberOfRevisions+ nList.getLength(); + } catch ( Exception e ) + { + e.printStackTrace(); + return 0; + + } + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/RevisionHistory.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/RevisionHistory.java new file mode 100644 index 0000000..58bdca9 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/RevisionHistory.java @@ -0,0 +1,110 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ +/** + * + * Date Author Changes Sep 10, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.wikiquery; + + +import java.io.*; +import java.net.HttpURLConnection; +import java.net.URL; +import org.w3c.dom.Document; +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; + +/** + * TODO- describe the purpose of the class + * + */ +public class RevisionHistory +{ + + // public static String excutePost( String targetURL, String urlParameters ) + public static Document excutePost( String targetURL, String urlParameters ) + { + URL url; + HttpURLConnection connection = null; + try + { + //Create connection + url = new URL( targetURL ); + connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod( "GET" ); + connection.setRequestProperty( "Accept", "application/xml" ); + + //connection.setRequestProperty( "Content-Length", ""+ Integer.toString( urlParameters.getBytes().length ) ); + // connection.setRequestProperty( "Content-Language", "en-US" ); + + connection.setUseCaches( false ); + connection.setDoInput( true ); + connection.setDoOutput( true ); + + //Send request + DataOutputStream wr = new DataOutputStream( + connection.getOutputStream() ); + wr.writeBytes( urlParameters ); + wr.flush(); + wr.close(); + + //Get Response + InputStream is = connection.getInputStream(); + + DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); +DocumentBuilder db = dbf.newDocumentBuilder(); +Document doc = (Document) db.parse(is); + + +/* + BufferedReader rd = new BufferedReader( new InputStreamReader( is ) ); + String line; + + + // Create temp file. + File temp = File.createTempFile( "pattern", ".xml" ); + + // Delete temp file when program exits. 
+ temp.deleteOnExit(); + + // Write to temp file + BufferedWriter out = new BufferedWriter( new FileWriter( temp ) ); + + + + StringBuffer response = new StringBuffer(); + while ( ( line = rd.readLine() ) != null ) + { + out.write( line + "\n" ); + + System.out.println( line + "\n" ); + response.append( line + "\n" ); + // response.append( '\r' ); + } + rd.close(); + out.close(); + + + */ + return doc; + // return response.toString(); + + } catch ( Exception e ) + { + + e.printStackTrace(); + return null; + + } finally + { + + if ( connection != null ) + { + connection.disconnect(); + } + } + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/WikiQuery.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/WikiQuery.java new file mode 100644 index 0000000..68841c6 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/WikiQuery.java @@ -0,0 +1,49 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + + + +/** + * KarshaAnnotate- Annotation tool for financial documents + * + * + * Date Author Changes + * Sep 10, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.wikiquery; + + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; + + + +/** + * TODO- describe the purpose of the class + * + */ +public class WikiQuery { + + public static void main(String[] args ) throws UnsupportedEncodingException { + + int pageId=83430; + + String urlParameters = "fName=" + URLEncoder.encode("???", "UTF-8") + "&lName=" + URLEncoder.encode("???", "UTF-8"); + //timestamp June 4th, 2013 00:00:00 UTC=20130604000000 + // String url="http://en.wikipedia.org/w/api.php?action=query&format=xml&prop=revisions&titles=Mother&rvlimit=max&rvstart=20130604000000"; + String url="http://en.wikipedia.org/w/api.php?action=query&format=xml&prop=revisions&pageids="+pageId+"&rvlimit=max&rvstart=20130604000000"; + + //pageid + // RevisionHistory.excutePost( url, urlParameters ); + // ReadXMLFile.ReadFile( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\WikipediaCategoryProcessor\\api.xml"); + int totalRevisions= ReadXMLFile.ReadFile(RevisionHistory.excutePost( url, urlParameters ),urlParameters,url); + System.out.println("totalRevisions "+ totalRevisions); + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/D.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/D.java new file mode 100644 index 0000000..d59d5b5 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/D.java @@ -0,0 +1,417 @@ +package org.yago.javatools.administrative; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Collection; +import 
java.util.Collections; +import java.util.Comparator; +import java.util.EnumSet; +import java.util.List; +import java.util.Map; + +import org.yago.javatools.parsers.Char; + +/** + This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). + It is licensed under the Creative Commons Attribution License + (see http://creativecommons.org/licenses/by/3.0) by + the YAGO-NAGA team (see http://mpii.de/yago-naga). + + + + + + This class provides convenience methods for Input/Output. + Allows to do basic I/O with easy procedure calls + -- nearly like in normal programming languages. + Furthermore, the class provides basic set operations for EnumSets, NULL-safe + comparisons and adding to maps.
+ Example: +
+ D.p("This is an easy way to write a string");
+ // And this is an easy way to read one:
+ String s=D.r();
+ 
+ // Here is a cool way to print something inline
+ computeProduct(factor1,(Integer)D.p(factor2));
+ 
+ // Here are some tricks with enums
+ enum T {a,b,c};
+ EnumSet<T> i=D.intersection(EnumSet.of(T.a,T.b),EnumSet.of(T.b,T.c));
+ EnumSet<T> u=D.union(EnumSet.of(T.a,T.b),EnumSet.of(T.b,T.c));
+ 
+ // Here is how to compare things, even if they are NULL
+ D.compare(object1, object2);
+ 
+ // Here is how to add something to maps that contain lists
+ Map<String,List<String>> string2list=new TreeMap<String,List<String>>();
+ D.addKeyValue(string2list,"key","new list element",ArrayList.class); 
+ // now, the map contains "key" -> [ "new list element" ]
+ D.addKeyValue(string2list,"key","again a new list element",ArrayList.class);
+ // now, the map contains "key" -> [ "new list element", "again a new list element" ]  
+
+ // Here is how to add something to maps that contain integers
+ Map<String,Integer> string2list=new TreeMap<String,Integer>();
+ D.addKeyValue(string2list,"key",7); // map now contains "key" -> 7
+ D.addKeyValue(string2list,"key",3); // map now contains "key" -> 10
+
+ 
+ */ +public class D { + + /** Indentation margin. All methods indent their output by indent spaces */ + public static int indent = 0; + + /** Prints spaces */ + protected static void i() { + for (int i = 0; i < indent; i++) + System.out.print(" "); + } + + /** Prints some Objects, returns them */ + public static Object p(Object... a) { + pl(a); + if (a == null || a.length == 0) return (null); + if (a.length == 1) return (a[0]); + return (a); + } + + /** Prints some Objects */ + public static Object println(Object... a) { + return (p(a)); + } + + + + /** Prints some Objects on one line */ + public static void pl(Object... a) { + //System.out.print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); + System.out.print(toString(a)); + } + + /** Prints an array of integers*/ + public static int[] p(int[] a) { + i(); + if (a == null) System.out.print("null-array"); + else for (int i = 0; i < a.length; i++) + System.out.print(a[i] + ", "); + System.out.println(""); + return (a); + } + + /** Prints an array of doubles*/ + public static double[] p(double[] a) { + i(); + if (a == null) System.out.print("null-array"); + else for (int i = 0; i < a.length; i++) + System.out.print(a[i] + ", "); + System.out.println(""); + return (a); + } + + /** Reads a line from the keyboard */ + public static String r() { + String s = ""; + i(); + try { + s = new BufferedReader(new InputStreamReader(System.in)).readLine(); + } catch (Exception whocares) { + } + return (s); + } + + /** Reads a line from the keyboard */ + public static String read() { + return (r()); + } + + /** Reads a long from the keyboard */ + public static String read(String question) { + System.out.print(question+" "); + return (D.read()); + } + + /** Reads a long from the keyboard */ + public static boolean readBoolean(String question) { + System.out.print(question+" "); + return (D.read().startsWith("y")); + } + + /** Reads a long from the keyboard */ + public static long readLong(String question) { + System.out.print(question); + 
return (Long.parseLong(D.r())); + } + + /** Reads a double from the keyboard */ + public static double readDouble(String question) { + System.out.print(question); + return (Double.parseDouble(D.r())); + } + + /** Waits for a number of milliseconds */ + public static void waitMS(long milliseconds) { + try { + Thread.sleep(milliseconds); + } catch (InterruptedException ex) { + } + } + + /** Returns the intersection of two enumsets */ + public static > EnumSet intersection(EnumSet s1, EnumSet s2) { + // We have to clone, since retainAll modifies the set + EnumSet s = s1.clone(); + s.retainAll(s2); + // I tried coding this for arbitrary sets, but it failed because + // the interface Cloneable does not make sure that the clone-method + // is visible (!) + return (s); + } + + /** Returns the union of two enumsets */ + public static > EnumSet union(EnumSet s1, EnumSet s2) { + EnumSet s = s1.clone(); + s.addAll(s2); + return (s); + } + + /** Tells whether the intersection is non-empty */ + public static > boolean containsOneOf(EnumSet s1, EnumSet s2) { + return (!intersection(s1, s2).isEmpty()); + } + + /** Exits with error code 0 */ + public static void exit() { + System.exit(0); + } + + /** Writes a line to a writer. Yes, this is possible */ + public static void writeln(Writer out, Object s) throws IOException { + out.write(s.toString()); + out.write("\n"); + } + + /** Writes a line to a writer. Yes, this is possible */ + public static void writeln(OutputStream out, Object s) throws IOException { + String string = Char.encodeUTF8(s.toString()); + for (int i = 0; i < string.length(); i++) + out.write(string.charAt(i)); + out.write('\n'); + } + + /** Writes a line silently to a writer. 
*/ + public static void silentWriteln(Writer out, Object s) { + try { + out.write(s.toString()); + out.write("\n"); + } catch (Exception e) { + } + } + + /** Executes a command */ + public static void execute(String cmd, File folder) throws Exception { + Process p = Runtime.getRuntime().exec(cmd, null, folder); + BufferedReader bri = new BufferedReader(new InputStreamReader(p.getInputStream())); + BufferedReader bre = new BufferedReader(new InputStreamReader(p.getErrorStream())); + String s1, s2 = null; + while (null != (s1 = bri.readLine()) || null != (s2 = bre.readLine())) { + if (s1 != null) System.out.println(s1); + if (s2 != null) System.err.println(s2); + } + p.waitFor(); + } + + /** Given a map that maps to collections, adds a new key/value pair or introduces the key*/ + @SuppressWarnings({ "unchecked", "rawtypes" }) + public static , L extends Collection> void addKeyValue(Map map, K key, V value, Class collectionType) { + C coll = map.get(key); + if (coll == null) { + try { + map.put(key, coll = (C) collectionType.newInstance()); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + coll.add(value); + } + + /** Given a map that maps to collections, adds a new key/value pair or introduces the key*/ + @SuppressWarnings({ "rawtypes" }) + public static , L extends Collection> void addKeyValues(Map map, K key, C values, Class collectionType) { + for(V val : values) addKeyValue(map,key,val,collectionType); + } + + /** Given a map that maps to integers, adds a new key/value pair or increases the counter*/ + public static void addKeyValue(Map map, K key, int value) { + Integer coll = map.get(key); + if (coll == null) { + map.put(key, value); + return; + + } + map.put(key, coll + value); + } + + /** Given a map that maps to floats, adds a new key/value pair or increases the counter*/ + public static void addKeyValueFlt(Map map, K key, float value) { + Float coll = map.get(key); + if (coll == null) { + map.put(key, value); + return; + } + map.put(key, 
coll + value); + } + + /** Given a map that maps to doubles, adds a new key/value pair or increases the counter*/ + public static void addKeyValueDbl(Map map, K key, double value) { + Double coll = map.get(key); + if (coll == null) { + map.put(key, value); + return; + + } + map.put(key, coll + value); + } + + /** Given a map that maps to comparable objects, sets a key to a given value iff the current value is null or smaller than the given value*/ + public static > void setKeyValueIfGreaterThanCurrent(Map map, K key, V value) { + V coll = map.get(key); + if (coll == null) { + map.put(key, value); + return; + } + if(coll.compareTo(value)<0) + map.put(key, value); + } + + + /** Returns the element of a map or 0*/ + public static int getOrZero(Map map, K key) { + Integer i = map.get(key); + if (i == null) return (0); + return (i); + } + + /** Returns the element of a map or 0*/ + public static double getOrZeroDouble(Map map, K key) { + Double i = map.get(key); + if (i == null) return (0); + return (i); + } + + /** Returns the element of a map or a default value*/ + public static V getOr(Map map, K key, V defValue) { + V i = map.get(key); + if (i == null) return defValue; + return (i); + } + + /** Returns a sorted list of the items*/ + public static List sorted(final Map map) { + List list=new ArrayList(map.keySet()); + Collections.sort(list,new Comparator(){ + + @Override + public int compare(T arg0, T arg1) { + return (map.get(arg1).compareTo(map.get(arg0))); + }}); + return(list); + } + + /** Returns a sorted list of the items*/ + public static List sortedDouble(final Map map) { + List list=new ArrayList(map.keySet()); + Collections.sort(list,new Comparator(){ + + @Override + public int compare(T arg0, T arg1) { + return (map.get(arg1).compareTo(map.get(arg0))); + }}); + return(list); + } + + /** Returns true if two things are equal, including NULL */ + public static boolean equal(E s1, E s2) { + if (s1 == s2) return (true); + if (s1 == null) return (false); + if 
(s2 == null) return (false); + return (s1.equals(s2)); + } + + /** Compares two things, including NULL */ + public static > int compare(E s1, E s2) { + if (s1 == s2) return (0); + if (s1 == null) return (-1); + if (s2 == null) return (1); + return (s1.compareTo(s2)); + } + + /** Compares pairs of comparable things (a1,a2,b1,b2,...), including NULL */ + @SuppressWarnings("unchecked") + public static int comparePairs(Object... o) { + for (int i = 0; i < o.length; i += 2) { + int c = compare((Comparable) o[i], (Comparable) o[i + 1]); + if (c != 0) return (c); + } + return (0); + } + + /** Compares pairs of comparable things (a1,a2,b1,b2,...) for equality, including NULL */ + public static boolean equalPairs(Object... o) { + for (int i = 0; i < o.length; i += 2) { + if (!equal(o[i], o[i + 1])) return (false); + } + return (true); + } + + /** Returns the index of a thing in an array or -1*/ + public static int indexOf(Object o, Object... os) { + for (int i = 0; i < os.length; i++) { + if (D.equal(os[i], o)) return (i); + } + return (-1); + } + + /** TRUE if the first enum is before the second*/ + public static > boolean smaller(Enum e1, Enum e2) { + return (e1.ordinal() < e2.ordinal()); + } + + /** Returns a reasonable String representation of a sequence of things. Handles arrays, deep arrays and NULL.*/ + public static String toString(Object... 
o) { + if (o == null) { + return ("null"); + } + StringBuilder b = new StringBuilder(); + for (int i = 0; i < o.length; i++) { + if (o[i] == null) { + b.append("null"); + continue; + } + if (o[i].getClass().isArray()) { + b.append("["); + if (((Object[]) o[i]).length != 0) { + for (Object obj : (Object[]) o[i]) { + b.append(toString(obj)).append(", "); + } + } + b.append("]"); + } else { + b.append(o[i].toString()); + } + if (i != o.length - 1) b.append(" "); + } + return (b.toString()); + } + + /** Picks one element from a set or NULL*/ + public static T pick(Collection set) { + if(set.isEmpty()) return(null); + return(set.iterator().next()); + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/Elements.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/Elements.java new file mode 100644 index 0000000..79947de --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/Elements.java @@ -0,0 +1,64 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ +/** + * + * Date Author Changes Sep 4, 2013 Kasun Perera Created + * + */ +package org.yago.javatools.administrative; + + +import org.yago.javatools.parsers.NounGroup; + +/** + * TODO- describe the purpose of the class + * + */ +public class Elements +{ + + public static void main( String[] args ) throws Exception + { + + System.out.println( getHead( "booooooooo" ) ); + + } + + public static String getHead( String category ) + { + + String elementList[] = splitObject( new NounGroup( category ).description() ); + if ( elementList == null || elementList.length == 0 ) + { + return ( null ); + } + /* + * lelemnts of the elementList + * [0]"NounGroup: + * [1]Original: "+original+" + * [2]Stemmed: "+stemmed()+" + * [3]Determiner: "+determiner+" + * [4]preModifiers: "+preModifier+" + * [5]Head: "+head+" + * [6]Adjective:"+adjective+" + * [7]Preposition: "+preposition+" + * [8]postModifier:\n"+(postModifier==null?"":postModifier.description())); + * + */ + String head[] = elementList[5].split( ":" ); + if(head.length<1){ + return (null); + } + + return (head[1].trim()); + } + + public static String[] splitObject( Object... a ) + { + String objectlist[] = D.toString( a ).split( "\\n" ); + + return objectlist; + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalMap.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalMap.java new file mode 100644 index 0000000..c13c10e --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalMap.java @@ -0,0 +1,41 @@ +package org.yago.javatools.datatypes; +import java.util.TreeMap; + +/** +This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). +It is licensed under the Creative Commons Attribution License +(see http://creativecommons.org/licenses/by/3.0) by +the YAGO-NAGA team (see http://mpii.de/yago-naga). + + + + + +Provides a nicer constructor for a TreeMap. +Example: +
+   FinalMap f=new FinalMap(
+     "a",1,
+     "b",2,
+     "c",3);
+   System.out.println(f.get("b"));
+   --> 2
+
+*/ +public class FinalMap extends TreeMap{ + /** Constructs a FinalMap from an array that contains key/value sequences */ + @SuppressWarnings("unchecked") + public FinalMap(Object... a) { + super(); + for(int i=0;i f=new FinalMap("a",1,"b",2); + System.out.println(f.get("b")); + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalSet.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalSet.java new file mode 100644 index 0000000..0a27044 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalSet.java @@ -0,0 +1,73 @@ +package org.yago.javatools.datatypes; +import java.util.AbstractList; +import java.util.Arrays; +import java.util.Set; + +import org.yago.javatools.administrative.D; +/** +This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). +It is licensed under the Creative Commons Attribution License +(see http://creativecommons.org/licenses/by/3.0) by +the YAGO-NAGA team (see http://mpii.de/yago-naga). + + + + + +This class provides a very simple container implementation with zero overhead. +A FinalSet bases on a sorted, unmodifiable array. The constructor +can either be called with a sorted unmodifiable array (default constructor) +or with an array that can be cloned and sorted beforehand if desired. +Example: +
+   FinalSet f=new FinalSet("a","b","c");
+   // equivalently: 
+   //   FinalSet f=new FinalSet(new String[]{"a","b","c"});
+   //   FinalSet f=new FinalSet(SHALLNOTBECLONED,ISSORTED,"a","b","c");
+   System.out.println(f.get(1));
+   --> b
+
+*/ +public class FinalSet extends AbstractList implements Set{ + /** Holds the data, must be sorted */ + public T[] data; + /** Constructs a FinalSet from an array, clones and sorts the array if indicated. */ + @SuppressWarnings("unchecked") + public FinalSet(boolean clone,T... a) { + if(clone) { + Comparable[] b=new Comparable[a.length]; + System.arraycopy(a,0,b,0,a.length); + a=(T[])b; + } + Arrays.sort(a); + data=a; + } + /** Constructs a FinalSet from an array that does not need to be cloned */ + public FinalSet(T... a) { + this(false,a); + } + /** Tells whether x is in the container */ + public boolean contains(T x) { + return(Arrays.binarySearch(data,x)>=0); + } + /** Returns the position in the array or -1 */ + public int indexOf(T x) { + int r=Arrays.binarySearch(data,x); + return(r>=0?r:-1); + } + /** Returns the element at position i*/ + public T get(int i) { + return(data[i]); + } + + /** Returns the number of elements in this FinalSet */ + public int size() { + return(data.length); + } + + /** Test routine */ + public static void main(String[] args) { + FinalSet f=new FinalSet("b","a","c"); + D.p(f.get(1)); + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/Char.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/Char.java new file mode 100644 index 0000000..0abf63a --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/Char.java @@ -0,0 +1,1404 @@ +package org.yago.javatools.parsers; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.Map; +import java.util.TreeMap; + +import org.yago.javatools.datatypes.FinalMap; + +/** +This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). +It is licensed under the Creative Commons Attribution License +(see http://creativecommons.org/licenses/by/3.0) by +the YAGO-NAGA team (see http://mpii.de/yago-naga). 
+ + + + + This class provides static methods to decode, encode and normalize Strings.
+ Decoding converts the following codes to Java 16-bit characters (char): +
    +
  • all HTML ampersand codes (like &nbsp;) as specified by the W3C +
  • all backslash codes (like \ b) as specified by the Java language specification +
  • all percentage codes (like %2C) as used in URLs and E-Mails +
  • all UTF-8 codes (like Ä«) as specified in Wikipedia +
+

+ Encoding is the inverse operation. It takes a Java 16-bit character (char) and + outputs its encoding in HTML, as a backslash code, as a percentage code or in UTF8. +

+ Normalization converts the following Unicode characters (Java 16-bit chars) + to ASCII-characters in the range 0x20-0x7F: +

    +
  • all ASCII control characters (0x00-0x1F) +
  • all Latin-1 characters (0x80-0xFF) to the closest transliteration +
  • all Latin Extended-A characters (0x100-0x17F) to the closest transliteration +
  • all Greek characters (0x374-0x3D6) to the closest transliteration as specified in Wikipedia +
  • all General-Punctuation characters (0x2016-0x2055) to the closest ASCII punctuation +
  • most mathematical symbols (in the range of 0x2000) to the common program code identifier or text +
  • all ligatures (0xFB00-0xFB06, the nasty things you get when you copy/paste from PDFs) to + the separate characters +
+

+

Usage

+

+ Decoding is done by methods that "eat" a code from the string. + They require as an additional parameter an integer array of length 1, + in which they store the length of the code that they chopped off.
+ Example: +

+     int[] eatLength=new int[1];
+     char c=eatPercentage("%2Cblah blah",eatLength);
+     -->  c=','
+          eatLength[0]=3  // the code was 3 characters long
+  
+ There is a static integer array Char.eatLength, which you can use for this purpose. + The methods store 0 in case the String does not start with the correct code. + They store -1 in case the String starts with a corrupted code. Of course, you can + use the eat... methods also to decode one single code. There are methods + decode... that decode the percentage code, the UTF8-codes, the backslash codes + or the Ampersand codes, respectively. + The method decode(String) decodes all codes of a String.
+ Example: +
+     decode("This String contains some codes: &amp; %2C \ u0041");
+     --> "This String contains some codes: & , A"
+  
+

+ Normalization is done by the method normalize(int c). It converts a Unicode + character (a 16-bit Java character char) + to a sequence of normal characters (i.e. characters in the range 0x20-0x7F). + The transliteration may consist of multiple chars (e.g. for umlauts) and also of no + chars at all (e.g. for Unicode Zero-Space-Characters).
+ Example: +

+    normalize('ä');
+    --> "ae"
+  
+ The method normalize(String) normalizes all characters in a String.
+ Example: +
+     normalize("This String contains the umlauts Ä, Ö and Ü");
+     -->  "This String contains the umlauts Ae, Oe and Ue"
+  
+ If the method cannot find a normalization, it calls defaultNormalizer.apply(char c). + Decoding and normalizing can be combined by the method decodeAndNormalize(String s). +

+ Encoding is done by methods called encode...(char). These methods take a character + and transform it to a UTF8 code, a percentage code, an ampersand code or a backslash code, + respectively. If the character is normal (i.e. in the range 0x20-0x7F), they simply return the input + character without any change.
+ Example: +

+     encodePercentage('Ä');
+     -->  "%C4"
+  
+ There are also methods that work on entire Strings
+ Example: +
+     encodePercentage("This String contains the umlauts Ä, Ö and Ü");
+     -->  "This String contains the umlauts %C4, %D6 and %DC;"
+  
+

+ Last, this class provides the character categorization for URIs, as given in + http://tools.ietf.org/html/rfc3986 . It also provides a method to encode only those + characters that are not valid path component characters
+ Example: +

+     isReserved(';');
+     -->  true
+     encodeURIPathComponent("a: b")
+     -->  "a:%20b"
+  
+ */ +public class Char { + + /** Defines just one function from an int to a String */ + public interface Char2StringFn { + + /** Function from a char to a String */ + String apply(char c); + } + + /** Called by normalize(int) in case the character cannot be normalized. + * The default implementation returns UNKNOWN. + * Feel free to create a new Char2StringFn and assign it to defaultNormalizer. */ + public static Char2StringFn defaultNormalizer = new Char2StringFn() { + + public String apply(char c) { + return (UNKNOWN); + } + }; + + /** String returned by the default implementation of defaultNormalizer, "[?]"*/ + public static String UNKNOWN = "[?]"; + + /** Maps a special character to a HTML ampersand sequence */ + public static Map charToAmpersand = new FinalMap('&', "&", '\'', "'", '<', "<", '>', ">", '"', """); + + /** Maps HTML ampersand sequences to strings */ + public static Map ampersandMap = new FinalMap("nbsp", (char) 160, "iexcl", (char) 161, "cent", (char) 162, "pound", (char) 163, "curren", (char) 164, "yen", (char) 165, "brvbar", (char) 166, "sect", + (char) 167, "uml", (char) 168, "copy", (char) 169, "ordf", (char) 170, "laquo", (char) 171, "not", (char) 172, "shy", (char) 173, "reg", (char) 174, "macr", (char) 175, "deg", (char) 176, "plusmn", (char) 177, "sup2", (char) 178, "sup3", + (char) 179, "acute", (char) 180, "micro", (char) 181, "para", (char) 182, "middot", (char) 183, "cedil", (char) 184, "sup1", (char) 185, "ordm", (char) 186, "raquo", (char) 187, "frac14", (char) 188, "frac12", (char) 189, "frac34", (char) 190, + "iquest", (char) 191, "Agrave", (char) 192, "Aacute", (char) 193, "Acirc", (char) 194, "Atilde", (char) 195, "Auml", (char) 196, "Aring", (char) 197, "AElig", (char) 198, "Ccedil", (char) 199, "Egrave", (char) 200, "Eacute", (char) 201, + "Ecirc", (char) 202, "Euml", (char) 203, "Igrave", (char) 204, "Iacute", (char) 205, "Icirc", (char) 206, "Iuml", (char) 207, "ETH", (char) 208, "Ntilde", (char) 209, "Ograve", (char) 210, 
"Oacute", (char) 211, "Ocirc", (char) 212, "Otilde", + (char) 213, "Ouml", (char) 214, "times", (char) 215, "Oslash", (char) 216, "Ugrave", (char) 217, "Uacute", (char) 218, "Ucirc", (char) 219, "Uuml", (char) 220, "Yacute", (char) 221, "THORN", (char) 222, "szlig", (char) 223, "agrave", + (char) 224, "aacute", (char) 225, "acirc", (char) 226, "atilde", (char) 227, "auml", (char) 228, "aring", (char) 229, "aelig", (char) 230, "ccedil", (char) 231, "egrave", (char) 232, "eacute", (char) 233, "ecirc", (char) 234, "euml", + (char) 235, "igrave", (char) 236, "iacute", (char) 237, "icirc", (char) 238, "iuml", (char) 239, "eth", (char) 240, "ntilde", (char) 241, "ograve", (char) 242, "oacute", (char) 243, "ocirc", (char) 244, "otilde", (char) 245, "ouml", + (char) 246, "divide", (char) 247, "oslash", (char) 248, "ugrave", (char) 249, "uacute", (char) 250, "ucirc", (char) 251, "uuml", (char) 252, "yacute", (char) 253, "thorn", (char) 254, "yuml", (char) 255, "fnof", (char) 402, "Alpha", + (char) 913, "Beta", (char) 914, "Gamma", (char) 915, "Delta", (char) 916, "Epsilon", (char) 917, "Zeta", (char) 918, "Eta", (char) 919, "Theta", (char) 920, "Iota", (char) 921, "Kappa", (char) 922, "Lambda", (char) 923, "Mu", (char) 924, "Nu", + (char) 925, "Xi", (char) 926, "Omicron", (char) 927, "Pi", (char) 928, "Rho", (char) 929, "Sigma", (char) 931, "Tau", (char) 932, "Upsilon", (char) 933, "Phi", (char) 934, "Chi", (char) 935, "Psi", (char) 936, "Omega", (char) 937, "alpha", + (char) 945, "beta", (char) 946, "gamma", (char) 947, "delta", (char) 948, "epsilon", (char) 949, "zeta", (char) 950, "eta", (char) 951, "theta", (char) 952, "iota", (char) 953, "kappa", (char) 954, "lambda", (char) 955, "mu", (char) 956, "nu", + (char) 957, "xi", (char) 958, "omicron", (char) 959, "pi", (char) 960, "rho", (char) 961, "sigmaf", (char) 962, "sigma", (char) 963, "tau", (char) 964, "upsilon", (char) 965, "phi", (char) 966, "chi", (char) 967, "psi", (char) 968, "omega", + (char) 969, "thetasym", 
(char) 977, "upsih", (char) 978, "piv", (char) 982, "bull", (char) 8226, "hellip", (char) 8230, "prime", (char) 8242, "Prime", (char) 8243, "oline", (char) 8254, "frasl", (char) 8260, "weierp", (char) 8472, "image", + (char) 8465, "real", (char) 8476, "trade", (char) 8482, "alefsym", (char) 8501, "larr", (char) 8592, "uarr", (char) 8593, "rarr", (char) 8594, "darr", (char) 8595, "harr", (char) 8596, "crarr", (char) 8629, "lArr", (char) 8656, "uArr", + (char) 8657, "rArr", (char) 8658, "dArr", (char) 8659, "hArr", (char) 8660, "forall", (char) 8704, "part", (char) 8706, "exist", (char) 8707, "empty", (char) 8709, "nabla", (char) 8711, "isin", (char) 8712, "notin", (char) 8713, "ni", + (char) 8715, "prod", (char) 8719, "sum", (char) 8721, "minus", (char) 8722, "lowast", (char) 8727, "radic", (char) 8730, "prop", (char) 8733, "infin", (char) 8734, "ang", (char) 8736, "and", (char) 8743, "or", (char) 8744, "cap", (char) 8745, + "cup", (char) 8746, "int", (char) 8747, "there4", (char) 8756, "sim", (char) 8764, "cong", (char) 8773, "asymp", (char) 8776, "ne", (char) 8800, "equiv", (char) 8801, "le", (char) 8804, "ge", (char) 8805, "sub", (char) 8834, "sup", + (char) 8835, "nsub", (char) 8836, "sube", (char) 8838, "supe", (char) 8839, "oplus", (char) 8853, "otimes", (char) 8855, "perp", (char) 8869, "sdot", (char) 8901, "lceil", (char) 8968, "rceil", (char) 8969, "lfloor", (char) 8970, "rfloor", + (char) 8971, "lang", (char) 9001, "rang", (char) 9002, "loz", (char) 9674, "spades", (char) 9824, "clubs", (char) 9827, "hearts", (char) 9829, "diams", (char) 9830, "quot", (char) 34, "amp", (char) 38, "lt", (char) 60, "gt", (char) 62, + "OElig", (char) 338, "oelig", (char) 339, "Scaron", (char) 352, "scaron", (char) 353, "Yuml", (char) 376, "circ", (char) 710, "tilde", (char) 732, "ensp", (char) 8194, "emsp", (char) 8195, "thinsp", (char) 8201, "zwnj", (char) 8204, "zwj", + (char) 8205, "lrm", (char) 8206, "rlm", (char) 8207, "ndash", + (char) 8211, //0x2013 + "mdash", (char) 
8212, "lsquo", (char) 8216, "rsquo", (char) 8217, "sbquo", (char) 8218, "ldquo", (char) 8220, "rdquo", (char) 8221, "bdquo", (char) 8222, "dagger", (char) 8224, "Dagger", (char) 8225, "permil", (char) 8240, "lsaquo", + (char) 8249, "rsaquo", (char) 8250, "euro", (char) 8364, "apos", '\''); + + /** Maps characters to normalizations */ + public static Map normalizeMap = new TreeMap(); + static { + Object[] o = new Object[] { + // ASCII + (char) 7, + "BEEP", + (char) 9, + " ", + (char) 10, + "\n", + + // Latin-1 + (char) 160, + " ", + (char) 161, + "!", + (char) 162, + "cent", + (char) 163, + "pound", + (char) 164, + "currency", + (char) 165, + "yen", + (char) 166, + "|", + (char) 167, + "/", + (char) 169, + "(c)", + (char) 170, + "^a", + (char) 171, + "\"", + (char) 172, + "~", + (char) 173, + "", + (char) 174, + "(R)", + (char) 176, + "degree", + (char) 177, + "+/-", + (char) 178, + "^2", + (char) 179, + "^3", + (char) 180, + "'", + (char) 181, + "mu", + (char) 182, + "P", + (char) 183, + ".", + (char) 184, + ",", + (char) 185, + "^1", + (char) 186, + "^o", + (char) 187, + "\"", + (char) 188, + "1/4", + (char) 189, + "1/2", + (char) 190, + "3/4", + (char) 191, + "?", + (char) 0xC4, + "Ae", + (char) 0xD6, + "Oe", + (char) 0xDC, + "Ue", + (char) 0xDF, + "ss", + (char) 0xC6, + "Ae", + (char) 0xC7, + "C", + (char) 0xD0, + "D", + (char) 0xD1, + "N", + (char) 0xD7, + "x", + (char) 0xDD, + "Y", + (char) 0xDE, + "b", + (char) 0xF7, + "/", + (char) 0xFF, + "y", + + // Latin Extended-A + (char) 0x132, + "IJ", + (char) 0x134, + "J", + (char) 0x170, + "Ue", + (char) 0x174, + "W", + (char) 0x17F, + "f", + + // Greek + (char) 0x374, + "'", + (char) 0x375, + ",", + (char) 0x37A, + ",", + (char) 0x37E, + ";", + (char) 0x384, + "'", + (char) 0x385, + "'", + (char) 0x386, + "A", + (char) 0x387, + ".", + (char) 0x388, + "E", + (char) 0x380, + "I", + (char) 0x38C, + "O", + (char) 0x38E, + "Y", + (char) 0x38F, + "O", + (char) 0x390, + "i", + (char) 215, + "*", + (char) 913, + "A", + 
(char) 914, + "B", + (char) 915, + "G", + (char) 916, + "D", + (char) 917, + "E", + (char) 918, + "Z", + (char) 919, + "E", + (char) 920, + "Th", + (char) 921, + "I", + (char) 922, + "K", + (char) 923, + "L", + (char) 924, + "M", + (char) 925, + "N", + (char) 926, + "X", + (char) 927, + "O", + (char) 928, + "P", + (char) 929, + "R", + (char) 931, + "S", + (char) 932, + "T", + (char) 933, + "Y", + (char) 934, + "Ph", + (char) 935, + "Ch", + (char) 936, + "Ps", + (char) 937, + "O", + (char) 977, + "th", + (char) 978, + "y", + (char) 982, + "pi", + + // General Punctuation + (char) 0x2013, + "-", + (char) 0x2016, + "||", + (char) 0x2017, + "_", + (char) 0x2020, + "+", + (char) 0x2021, + "++", + (char) 0x2022, + "*", + (char) 0x2023, + "*", + (char) 0x2024, + ".", + (char) 0x2025, + "..", + (char) 0x2026, + "...", + (char) 0x2027, + ".", + (char) 0x2028, + "\n", + (char) 0x2030, + "/1000", + (char) 0x2031, + "/10000", + (char) 0x2032, + "'", + (char) 0x2033, + "''", + (char) 0x2034, + "'''", + (char) 0x2035, + "'", + (char) 0x2036, + "''", + (char) 0x2037, + "'''", + (char) 0x2038, + "^", + (char) 0x2039, + "\"", + (char) 0x203A, + "\"", + (char) 0x203B, + "*", + (char) 0x203C, + "!!", + (char) 0x203D, + "?!", + (char) 0x2041, + ",", + (char) 0x2042, + "***", + (char) 0x2043, + "-", + (char) 0x2044, + "/", + (char) 0x2045, + "[", + (char) 0x2046, + "]", + (char) 0x2047, + "??", + (char) 0x2048, + "?!", + (char) 0x2049, + "!?", + (char) 0x204A, + "-", + (char) 0x204B, + "P", + (char) 0x204C, + "<", + (char) 0x204D, + ">", + (char) 0x204F, + ";", + (char) 0x2050, + "-", + (char) 0x2051, + "**", + (char) 0x2052, + "./.", + (char) 0x2053, + "~", + (char) 0x2054, + "_", + (char) 0x2055, + "_", + + // Mathematical symbols + (char) 8465, + "I", + (char) 8476, + "R", + (char) 8482, + "(TM)", + (char) 8501, + "a", + (char) 8592, + "<-", + (char) 8593, + "^", + (char) 8594, + "->", + (char) 8595, + "v", + (char) 8596, + "<->", + (char) 8629, + "<-'", + (char) 8656, + "<=", + 
(char) 8657, + "^", + (char) 8658, + "=>", + (char) 8659, + "v", + (char) 8660, + "<=>", + (char) 8704, + "FOR ALL", + (char) 8706, + "d", + (char) 8707, + "EXIST", + (char) 8709, + "{}", + (char) 8712, + "IN", + (char) 8713, + "NOT IN", + (char) 8715, + "CONTAINS", + (char) 8719, + "PRODUCT", + (char) 8721, + "SUM", + (char) 8722, + "-", + (char) 8727, + "*", + (char) 8730, + "SQRT", + (char) 8733, + "~", + (char) 8734, + "INF", + (char) 8736, + "angle", + (char) 8743, + "&", + (char) 8744, + "|", + (char) 8745, + "INTERSECTION", + (char) 8746, + "UNION", + (char) 8747, + "INTEGRAL", + (char) 8756, + "=>", + (char) 8764, + "~", + (char) 8773, + "~=", + (char) 8776, + "~=", + (char) 8800, + "!=", + (char) 8801, + "==", + (char) 8804, + "=<", + (char) 8805, + ">=", + (char) 8834, + "SUBSET OF", + (char) 8835, + "SUPERSET OF", + (char) 8836, + "NOT SUBSET OF", + (char) 8838, + "SUBSET OR EQUAL", + (char) 8839, + "SUPERSET OR EQUAL", + (char) 8853, + "(+)", + (char) 8855, + "(*)", + (char) 8869, + "_|_", + (char) 8901, + "*", + (char) 8364, + "EUR", + + // Ligatures + (char) 0xFB00, + "ff", + (char) 0xFB01, + "fi", + (char) 0xFB02, + "fl", + (char) 0xFB03, + "ffi", + (char) 0xFB04, + "ffl", + (char) 0xFB05, + "ft", + (char) 0xFB06, + "st" }; + for (int i = 0; i < o.length; i += 2) + normalizeMap.put((Character) o[i], (String) o[i + 1]); + } + + /** Normalizes a character to a String of characters in the range 0x20-0x7F. + * Returns a String, because some characters are + * normalized to multiple characters (e.g. umlauts) and + * some characters are normalized to zero characters (e.g. special Unicode space chars). 
+ * Returns null for the EndOfFile character -1 */ + public static String normalize(int c) { + // EOF + if (c == -1) return (null); + + // ASCII chars + if (c >= ' ' && c <= 128) return ("" + (char) c); + + // Upper case + boolean u = Character.isUpperCase(c); + char cu = (char) Character.toUpperCase(c); + + // Check map + if (normalizeMap.get(cu) != null) return (u ? normalizeMap.get(cu) : normalizeMap.get(cu).toLowerCase()); + + // ASCII + if (c < ' ') return (""); + + // Latin-1 + if (cu >= 0xC0 && cu <= 0xC5) return (u ? "A" : "a"); + if (cu >= 0xC8 && cu <= 0xCB) return (u ? "E" : "e"); + if (cu >= 0xCC && cu <= 0xCF) return (u ? "I" : "i"); + if (cu >= 0xD2 && cu <= 0xD8) return (u ? "O" : "o"); + if (cu >= 0x80 && cu <= 0xA0) return (" "); + + // Latin Extended-A + if (cu >= 0x100 && cu <= 0x105) return (u ? "A" : "a"); + if (cu >= 0x106 && cu <= 0x10D) return (u ? "C" : "c"); + if (cu >= 0x10E && cu <= 0x111) return (u ? "D" : "d"); + if (cu >= 0x112 && cu <= 0x11B) return (u ? "E" : "e"); + if (cu >= 0x11C && cu <= 0x123) return (u ? "G" : "g"); + if (cu >= 0x124 && cu <= 0x127) return (u ? "H" : "h"); + if (cu >= 0x128 && cu <= 0x131) return (u ? "I" : "i"); + if (cu >= 0x136 && cu <= 0x138) return (u ? "K" : "k"); + if (cu >= 0x139 && cu <= 0x142) return (u ? "L" : "l"); + if (cu >= 0x143 && cu <= 0x14B) return (u ? "N" : "n"); + if (cu >= 0x14C && cu <= 0x14F) return (u ? "O" : "o"); + if (cu >= 0x150 && cu <= 0x153) return (u ? "Oe" : "oe"); + if (cu >= 0x156 && cu <= 0x159) return (u ? "R" : "r"); + if (cu >= 0x15A && cu <= 0x161) return (u ? "S" : "s"); + if (cu >= 0x161 && cu <= 0x167) return (u ? "T" : "t"); + if (cu >= 0x176 && cu <= 0x178) return (u ? "Y" : "y"); + if (cu >= 0x179 && cu <= 0x17E) return (u ? 
"Z" : "z"); + + // General Punctuation + if (cu >= 0x2000 && cu <= 0x200A) return (" "); + if (cu >= 0x200B && cu <= 0x200F) return (""); + if (cu >= 0x2010 && cu <= 0x2015) return ("--"); + if (cu >= 0x2018 && cu <= 0x201B) return ("'"); + if (cu >= 0x201C && cu <= 0x201F) return ("\""); + if (cu >= 0x2029 && cu <= 0x202F) return (" "); + if (cu >= 0x203E && cu <= 0x2040) return ("-"); + if (cu >= 0x2056 && cu <= 0x205E) return ("."); + + return (defaultNormalizer.apply((char) c)); + } + + /** Eats a String of the form "%xx" from a string, where + * xx is a hexadecimal code. If xx is a UTF8 code start, + * tries to complete the UTF8-code and decodes it.*/ + public static char eatPercentage(String a, int[] n) { + // Length 0 + if (!a.startsWith("%") || a.length() < 3) { + n[0] = 0; + return ((char) 0); + } + char c; + // Try to parse first char + try { + c = (char) Integer.parseInt(a.substring(1, 3), 16); + } catch (Exception e) { + n[0] = -1; + return ((char) 0); + } + // For non-UTF8, return the char + int len = Utf8Length(c); + n[0] = 3; + if (len <= 1) return (c); + // Else collect the UTF8 + String dec = "" + c; + for (int i = 1; i < len; i++) { + try { + dec += (char) Integer.parseInt(a.substring(1 + i * 3, 3 + i * 3), 16); + } catch (Exception e) { + return (c); + } + } + // Try to decode the UTF8 + int[] eatLength = new int[1]; + char utf8 = eatUtf8(dec, eatLength); + if (eatLength[0] != len) return (c); + n[0] = len * 3; + return (utf8); + } + + /** Eats an HTML ampersand code from a String*/ + public static char eatAmpersand(String a, int[] n) { + n[0] = 0; + if (!a.startsWith("&")) return ((char) 0); + // Seek to ';' + // We also accept spaces and the end of the String as a delimiter + while (n[0] < a.length() && !Character.isSpaceChar(a.charAt(n[0])) && a.charAt(n[0]) != ';') + n[0]++; + if (n[0] <= 1) { + n[0] = -1; + return ((char) 0); + } + if (n[0] < a.length() && a.charAt(n[0]) == ';') { + a = a.substring(1, n[0]); + n[0]++; + } else { + a = 
a.substring(1, n[0]); + } + // Hexadecimal characters + if (a.startsWith("#x")) { + try { + return ((char) Integer.parseInt(a.substring(2), 16)); + } catch (Exception e) { + n[0] = -1; + return ((char) 0); + } + } + // Decimal characters + if (a.startsWith("#")) { + try { + return ((char) Integer.parseInt(a.substring(1))); + } catch (Exception e) { + n[0] = -1; + return ((char) 0); + } + } + // Others + if (ampersandMap.get(a) != null) return (ampersandMap.get(a)); + else if (ampersandMap.get(a.toLowerCase()) != null) return (ampersandMap.get(a.toLowerCase())); + n[0] = -1; + return ((char) 0); + } + + /** Tells from the first UTF-8 code character how long the code is. + * Returns -1 if the character is not an UTF-8 code start. + * Returns 1 if the character is ASCII<128*/ + public static int Utf8Length(char c) { + // 0xxx xxxx + if ((c & 0x80) == 0x00) return (1); + // 110x xxxx + if ((c & 0xE0) == 0xC0) return (2); + // 1110 xxxx + if ((c & 0xF0) == 0xE0) return (3); + // 1111 0xxx + if ((c & 0xF8) == 0xF0) return (4); + return (-1); + } + + /** Eats a UTF8 code from a String. There is also a built-in way in Java that converts + * UTF8 to characters and back, but it does not work with all characters. 
*/ + public static char eatUtf8(String a, int[] n) { + if (a.length() == 0) { + n[0] = 0; + return ((char) 0); + } + n[0] = Utf8Length(a.charAt(0)); + if (a.length() >= n[0]) { + switch (n[0]) { + case 1: + return (a.charAt(0)); + case 2: + if ((a.charAt(1) & 0xC0) != 0x80) break; + return ((char) (((a.charAt(0) & 0x1F) << 6) + (a.charAt(1) & 0x3F))); + case 3: + if ((a.charAt(1) & 0xC0) != 0x80 || (a.charAt(2) & 0xC0) != 0x80) break; + return ((char) (((a.charAt(0) & 0x0F) << 12) + ((a.charAt(1) & 0x3F) << 6) + ((a.charAt(2) & 0x3F)))); + case 4: + if ((a.charAt(1) & 0xC0) != 0x80 || (a.charAt(2) & 0xC0) != 0x80 || (a.charAt(3) & 0xC0) != 0x80) break; + return ((char) (((a.charAt(0) & 0x07) << 18) + ((a.charAt(1) & 0x3F) << 12) + ((a.charAt(2) & 0x3F) << 6) + ((a.charAt(3) & 0x3F)))); + } + } + n[0] = -1; + return ((char) 0); + } + + /** Decodes all UTF8 characters in the string*/ + public static String decodeUTF8(String s) { + StringBuilder result = new StringBuilder(); + int[] eatLength = new int[1]; + while (s.length() != 0) { + char c = eatUtf8(s, eatLength); + if (eatLength[0] != -1) { + result.append(c); + s = s.substring(eatLength[0]); + } else { + result.append(s.charAt(0)); + s = s.substring(1); + } + } + return (result.toString()); + } + + /** Decodes all percentage characters in the string*/ + public static String decodePercentage(String s) { + StringBuilder result = new StringBuilder(); + int[] eatLength = new int[1]; + while (s.length() != 0) { + char c = eatPercentage(s, eatLength); + if (eatLength[0] > 1) { + result.append(c); + s = s.substring(eatLength[0]); + } else { + result.append(s.charAt(0)); + s = s.substring(1); + } + } + return (result.toString()); + } + + /** Fabian: This method cannot decode numeric hexadecimal ampersand codes. What is its purpose? 
TODO*/ + public static String decodeAmpersand_UNKNOWN(String s) { + if (s == null) { + return null; + } + StringBuffer sb = new StringBuffer(s.length()); + while (s != null && s.length() != 0) { + int i = s.indexOf("&"); + if (i == -1) { + sb.append(s); + s = null; + } else { + boolean space = false; + boolean end = false; + sb.append(s.substring(0, i)); + s = s.substring(i); + int j1 = s.indexOf(";"); + int j2 = s.indexOf(" "); + int j = -1; + if (j1 == -1 || j2 == -1) { + if (j1 == -1 && j2 == -1) { + end = true; + j = s.length(); + } else if (j1 == -1) { + j = j2; + } else if (j2 == -1) { + j = j1; + } + } else if (j1 < j2) { + j = j1; + } else if (j1 > j2) { + j = j2; + space = true; + } + String a = s.substring(1, j); + if (ampersandMap.get(a) != null) { + sb.append(ampersandMap.get(a)); + if (space) { + sb.append(' '); + } + } else if (a.startsWith("#")) { + try { + sb.append(((char) Integer.parseInt(a.substring(1)))); + } catch (Exception e) { + sb.append(a); + } + if (space) { + sb.append(' '); + } + } else { + if (end) { + sb.append(s.substring(0, j)); + } else { + sb.append(s.substring(0, j + 1)); + } + } + if (end) { + s = s.substring(j); + } else { + s = s.substring(j + 1); + } + } + } + return sb.toString(); + } + + public static String decodeAmpersand(String s, PositionTracker posTracker) { + if (s == null) { + return null; + } + int pos = 0; + int difference; + StringBuffer sb = new StringBuffer(s.length()); + while (s != null && s.length() != 0) { + int i = s.indexOf("&"); + if (i == -1) { + sb.append(s); + s = null; + } else { + boolean space = false; + boolean end = false; + sb.append(s.substring(0, i)); + s = s.substring(i); + pos += i; + int j1 = s.indexOf(";"); + int j2 = s.indexOf(" "); + int j = -1; + if (j1 == -1 || j2 == -1) { + if (j1 == -1 && j2 == -1) { + end = true; + j = s.length(); + } else if (j1 == -1) { + j = j2; + } else if (j2 == -1) { + j = j1; + } + } else if (j1 < j2) { + j = j1; + } else if (j1 > j2) { + j = j2; + space = 
true; + } + pos += (j + 1); + String a = s.substring(1, j); + if (ampersandMap.get(a) != null) { + sb.append(ampersandMap.get(a)); + difference = 1 - (j + 1); + if (space) { + sb.append(' '); + difference++; + } + posTracker.addPositionChange(pos, difference); + } else { + if (end) { + sb.append(s.substring(0, j)); + } else { + sb.append(s.substring(0, j + 1)); + } + } + if (end) { + s = s.substring(j); + } else { + s = s.substring(j + 1); + } + } + } + posTracker.closeRun(); + return sb.toString(); + } + + /** Decodes all ampersand sequences in the string*/ + public static String decodeAmpersand(String s) { + StringBuilder result = new StringBuilder(); + int[] eatLength = new int[1];// add this in order to multithread safe + while (s.length() != 0) { + char c = eatAmpersand(s, eatLength); + if (eatLength[0] > 1) { + result.append(c); + s = s.substring(eatLength[0]); + } else { + result.append(s.charAt(0)); + s = s.substring(1); + } + } + return (result.toString()); + } + + /** Decodes all backslash characters in the string */ + public static String decodeBackslash(String s) { + StringBuilder result = new StringBuilder(); + int[] eatLength = new int[1]; + while (s.length() != 0) { + char c = eatBackslash(s, eatLength); + if (eatLength[0] > 1) { + result.append(c); + s = s.substring(eatLength[0]); + } else { + result.append(s.charAt(0)); + s = s.substring(1); + } + } + return (result.toString()); + } + + /** Eats a backslash sequence from a String */ + public static char eatBackslash(String a, int[] n) { + if (!a.startsWith("\\")) { + n[0] = 0; + return ((char) 0); + } + // Unicodes BS u XXXX + if (a.startsWith("\\u")) { + try { + n[0] = 6; + return ((char) Integer.parseInt(a.substring(2, 6), 16)); + } catch (Exception e) { + n[0] = -1; + return ((char) 0); + } + } + // Unicodes BS uu XXXX + if (a.startsWith("\\uu")) { + try { + n[0] = 7; + return ((char) Integer.parseInt(a.substring(3, 7), 16)); + } catch (Exception e) { + n[0] = -1; + return ((char) 0); + } + } + 
// Classical escape sequences + if (a.startsWith("\\b")) { + n[0] = 2; + return ((char) 8); + } + if (a.startsWith("\\t")) { + n[0] = 2; + return ((char) 9); + } + if (a.startsWith("\\n")) { + n[0] = 2; + return ((char) 10); + } + if (a.startsWith("\\f")) { + n[0] = 2; + return ((char) 12); + } + if (a.startsWith("\\r")) { + n[0] = 2; + return ((char) 13); + } + if (a.startsWith("\\\\")) { + n[0] = 2; + return ('\\'); + } + if (a.startsWith("\\\"")) { + n[0] = 2; + return ('"'); + } + if (a.startsWith("\\'")) { + n[0] = 2; + return ('\''); + } + // Octal codes + n[0] = 1; + while (n[0] < a.length() && a.charAt(n[0]) >= '0' && a.charAt(n[0]) <= '8') + n[0]++; + if (n[0] == 1) { + n[0] = 0; + return ((char) 0); + } + try { + return ((char) Integer.parseInt(a.substring(1, n[0]), 8)); + } catch (Exception e) { + } + n[0] = -1; + return ((char) 0); + } + + /** Replaces all codes in a String by the 16 bit Unicode characters */ + public static String decode(String s) { + StringBuilder b = new StringBuilder(); + int[] eatLength = new int[1]; + while (s.length() > 0) { + char c = eatPercentage(s, eatLength); + if (eatLength[0] <= 0) { + c = eatAmpersand(s, eatLength); + if (eatLength[0] <= 0) { + c = eatBackslash(s, eatLength); + if (eatLength[0] <= 0) { + c = eatUtf8(s, eatLength); + if (eatLength[0] <= 0) { + c = s.charAt(0); + eatLength[0] = 1; + } + } + } + } + b.append(c); + s = s.substring(eatLength[0]); + } + return (b.toString()); + } + + /** Encodes a character to UTF8 (if necessary)*/ + public static String encodeUTF8(int c) { + if (c <= 0x7F) return ("" + (char) c); + if (c <= 0x7FF) return ("" + (char) (0xC0 + ((c >> 6) & 0x1F)) + (char) (0x80 + (c & 0x3F))); + if (c <= 0xFFFF) return ("" + (char) (0xE0 + ((c >> 12) & 0x0F)) + (char) (0x80 + ((c >> 6) & 0x3F)) + (char) (0x80 + (c & 0x3F))); + return ("" + c); + } + + /** Encodes a character to a backslash code (if necessary)*/ + public static String encodeBackslash(char c) { + if (isAlphanumeric(c) || c == ' ') 
return ("" + c); + String hex = Integer.toHexString(c); + while (hex.length() < 4) + hex = "0" + hex; + return ("\\u" + hex); + } + + /** Encodes a character to a backslash code (if not alphanumeric)*/ + public static String encodeBackslashToAlphanumeric(char c) { + if (isAlphanumeric(c) || c == '_') return ("" + c); + String hex = Integer.toHexString(c); + while (hex.length() < 4) + hex = "0" + hex; + return ("\\u" + hex); + } + + /** Encodes a character to a backslash code (if not ASCII)*/ + public static String encodeBackslashToASCII(char c) { + if (c >= 32 && c < 128 && c != '\\' && c != '"') return ("" + c); + String hex = Integer.toHexString(c); + while (hex.length() < 4) + hex = "0" + hex; + return ("\\u" + hex); + } + + /** Encodes a character to an HTML-Ampersand code (if necessary)*/ + public static String encodeAmpersand(char c) { + String s; + if (null != (s = charToAmpersand.get(c))) return (s); + if (c < 128 && c >= 32) return ("" + c); + else return ("&#" + ((int) c) + ";"); + } + + /** Encodes a character to an HTML-Ampersand code (if necessary)*/ + public static String encodeAmpersandToAlphanumeric(char c) { + if (isAlphanumeric(c) || c == '_') return ("" + c); + return ("&#" + ((int) c) + ";"); + } + + /** Encodes a character to an Percentage code (if necessary). + * If the character is greater than 0x80, the character is converted to + * a UTF8-sequence and this sequence is encoded as percentage codes. */ + public static String encodePercentage(char c) { + if (isAlphanumeric(c)) return ("" + c); + if (c < 16) return ("%0" + Integer.toHexString(c).toUpperCase()); + if (c < 128) return ("%" + Integer.toHexString(c).toUpperCase()); + String s = encodeUTF8(c); + String result = ""; + for (int i = 0; i < s.length(); i++) { + result += "%" + Integer.toHexString(s.charAt(i)).toUpperCase(); + } + return (result); + } + + /** + * Encodes a String with reserved XML characters into a valid xml string for attributes. 
+ * @param str + * @return + */ + public static String encodeXmlAttribute(String str) { + if (str == null) return null; + int len = str.length(); + if (len == 0) return str; + StringBuffer encoded = new StringBuffer(); + for (int i = 0; i < len; i++) { + char c = str.charAt(i); + if (c == '<') encoded.append("<"); + else if (c == '\"') encoded.append("""); + else if (c == '>') encoded.append(">"); + else if (c == '\'') encoded.append("'"); + else if (c == '&') encoded.append("&"); + else encoded.append(c); + } + return encoded.toString(); + } + + /** Tells whether a char is in a range*/ + public static boolean in(char c, char a, char b) { + return (c >= a && c <= b); + } + + /** Tells whether a char is in a string*/ + public static boolean in(char c, String s) { + return (s.indexOf(c) != -1); + } + + /** Tells whether a char is alphanumeric in the sense of URIs*/ + public static boolean isAlphanumeric(char c) { + return (in(c, 'a', 'z') || in(c, 'A', 'Z') || in(c, '0', '9')); + } + + /** Tells whether a char is reserved in the sense of URIs*/ + public static boolean isReserved(char c) { + return (isSubDelim(c) || isGenDelim(c)); + } + + /** Tells whether a char is unreserved in the sense of URIs (not the same as !reserved)*/ + public static boolean isUnreserved(char c) { + return (isAlphanumeric(c) || in(c, "-._~")); + } + + /** Tells whether a string is escaped in the sense of URIs*/ + public static boolean isEscaped(String s) { + return (s.matches("%[0-9A-Fa-f]{2}")); + } + + /** Tells whether a char is a sub-delimiter in the sense of URIs*/ + public static boolean isSubDelim(char c) { + return (in(c, "!$&'()*+,=")); + } + + /** Tells whether a char is a general delimiter in the sense of URIs*/ + public static boolean isGenDelim(char c) { + return (in(c, ":/?#[]@")); + } + + /** Tells whether a char is a valid path component in the sense of URIs*/ + public static boolean isPchar(char c) { + return (isUnreserved(c) || isSubDelim(c) || in(c, "@")); + } + + /** 
Encodes a char to percentage code, if it is not a path character in the sense of URIs*/ + public static String encodeURIPathComponent(char c) { + if (isPchar(c)) return ("" + c); + else return (Char.encodePercentage(c)); + } + + /** Encodes a char to percentage code, if it is not a path character in the sense of URIs*/ + public static String encodeURIPathComponent(String s) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + result.append(Char.encodeURIPathComponent(s.charAt(i))); + } + return (result.toString()); + } + + /** Encodes a char to percentage code, if it is not a path character in the sense of XMLs*/ + public static String encodeURIPathComponentXML(String s) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + if (s.charAt(i) == '&') result.append(Char.encodePercentage(s.charAt(i))); + else if (s.charAt(i) == '"') result.append(Char.encodePercentage(s.charAt(i))); + else result.append(Char.encodeURIPathComponent(s.charAt(i))); + } + return (result.toString()); + } + + /** Decodes a URI path component*/ + public static String decodeURIPathComponent(String s) { + return (Char.decodePercentage(s)); + } + + /** Encodes a String to UTF8 */ + public static String encodeUTF8(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + r.append(encodeUTF8(c.charAt(i))); + } + return (r.toString()); + } + + /** Replaces non-normal characters in a String by Backslash codes */ + public static String encodeBackslash(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + r.append(encodeBackslash(c.charAt(i))); + } + return (r.toString()); + } + + /** Replaces non-normal characters in a String by Backslash codes (if not alphanumeric)*/ + public static String encodeBackslashToAlphanumeric(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + 
r.append(encodeBackslashToAlphanumeric(c.charAt(i))); + } + return (r.toString()); + } + + /** Replaces non-normal characters in a String by Backslash codes (if not ASCII)*/ + public static String encodeBackslashToASCII(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + r.append(encodeBackslashToASCII(c.charAt(i))); + } + return (r.toString()); + } + + /** Replaces non-normal characters in a String by HTML Ampersand codes */ + public static String encodeAmpersand(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + r.append(encodeAmpersand(c.charAt(i))); + } + return (r.toString()); + } + + /** Replaces non-normal characters in a String by HTML Ampersand codes */ + public static String encodeAmpersandToAlphanumeric(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + r.append(encodeAmpersandToAlphanumeric(c.charAt(i))); + } + return (r.toString()); + } + + /** Replaces non-normal characters in a String by Percentage codes. + * If a character is greater than 0x80, the character is converted to + * a UTF8-sequence and this sequence is encoded as percentage codes. */ + public static String encodePercentage(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + r.append(encodePercentage(c.charAt(i))); + } + return (r.toString()); + } + + /** Decodes all codes in a String and normalizes all chars */ + public static String decodeAndNormalize(String s) { + return (normalize(decode(s))); + } + + /** Normalizes all chars in a String to characters 0x20-0x7F */ + public static String normalize(String s) { + StringBuilder b = new StringBuilder(); + for (int i = 0; i < s.length(); i++) + b.append(normalize(s.charAt(i))); + return (b.toString()); + } + + /** Returns the last character of a String or 0*/ + public static char last(CharSequence s) { + return (s.length() == 0 ? 
(char) 0 : s.charAt(s.length() - 1)); + } + + /** Returns the String without the last character */ + public static String cutLast(String s) { + return (s.length() == 0 ? "" : s.substring(0, s.length() - 1)); + } + + /** Cuts the last character */ + public static StringBuilder cutLast(StringBuilder s) { + s.setLength(s.length() - 1); + return (s); + } + + /** Returns an HTML-String of the String */ + public static String toHTML(String s) { + return (Char.encodeAmpersand(s).replace(" ", "
")); + } + + /** Returns the chars of a String in hex */ + public static String hexAll(String s) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + result.append(Integer.toHexString(s.charAt(i)).toUpperCase()).append(' '); + } + return (result.toString()); + } + + /** Replaces special characters in the string by hex codes (cannot be undone)*/ + public static String encodeHex(String s) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (isAlphanumeric(c)) result.append(c); + else result.append(Integer.toHexString(s.charAt(i)).toUpperCase()); + } + return (result.toString()); + } + + /** Upcases the first character in a String*/ + public static String upCaseFirst(String s) { + if (s == null || s.length() == 0) return (s); + return (Character.toUpperCase(s.charAt(0)) + s.substring(1)); + } + + /** Lowcases the first character in a String*/ + public static String lowCaseFirst(String s) { + if (s == null || s.length() == 0) return (s); + return (Character.toLowerCase(s.charAt(0)) + s.substring(1)); + } + + /** Returns a string of the given length, fills with spaces if necessary */ + public static CharSequence truncate(CharSequence s, int len) { + if (s.length() == len) return (s); + if (s.length() > len) return (s.subSequence(0, len)); + StringBuilder result = new StringBuilder(s); + while (result.length() < len) + result.append(' '); + return (result); + } + + /** Capitalizes words and lowercases the rest*/ + public static String capitalize(String s) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (i == 0 || i > 0 && !Character.isLetterOrDigit(s.charAt(i - 1))) c = Character.toUpperCase(c); + else c = Character.toLowerCase(c); + result.append(c); + } + return (result.toString()); + } + + /** TRUE if the Charsequence ends with the string */ + public static boolean endsWith(CharSequence s, String 
end) { + return (s.length() >= end.length() && s.subSequence(s.length() - end.length(), s.length()).equals(end)); + } + + /** Test routine */ + public static void main(String argv[]) throws Exception { + System.out.println("Enter a string with HTML ampersand codes, umlauts and/or UTF-8 codes and hit ENTER."); + System.out.println("Press CTRL+C to abort"); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + while (true) { + String s = in.readLine(); + System.out.println("Decoded: " + (s = decode(s))); + System.out.println("Normalized: " + normalize(s)); + System.out.println("As UTF8: " + encodeUTF8(s)); + System.out.println("As percentage: " + encodePercentage(s)); + System.out.println("As backslash: " + encodeBackslash(s)); + System.out.println("As ampersand: " + encodeAmpersand(s)); + System.out.println("As URI component: " + encodeURIPathComponent(s)); + } + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/NounGroup.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/NounGroup.java new file mode 100644 index 0000000..1596ec8 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/NounGroup.java @@ -0,0 +1,263 @@ +package org.yago.javatools.parsers; +import java.util.Arrays; +import java.util.List; +import java.util.Set; + +import org.yago.javatools.administrative.D; +import org.yago.javatools.datatypes.FinalSet; + +/** +This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). +It is licensed under the Creative Commons Attribution License +(see http://creativecommons.org/licenses/by/3.0) by +the YAGO-NAGA team (see http://mpii.de/yago-naga). + + + + + +The class NounGroup splits a noun group (given by a String) into its +modifiers and its head.
+Example: +
+     System.out.println(new NounGroup("the United States of America").description());
+     ->
+      NounGroup:
+        Original: the_United_States_of_America
+        Determiner: the
+        Head: State
+        Plural: true
+        preModifiers: United
+        Adjective: 
+        Preposition: of
+        postModifier:
+          NounGroup:
+            Original: America
+            Determiner:
+            Head: America
+            Plural: false
+            preModifiers:
+            Preposition:
+            postModifier:
+
+*/ +public class NounGroup { + + /** Defines just one function from a String to a boolean */ + public interface String2Boolean { + /** Function from a String to a boolean */ + boolean apply(String s); + } + + /** Tells whether a word is an adjective (currently by a simple heuristics */ + public static String2Boolean isAdjective=new String2Boolean() { + public boolean apply(String s) { + return(s.length()>0 && Character.isLowerCase(s.charAt(0)) && + (s.endsWith("al") || s.endsWith("ed") || s.endsWith("ing"))); + } + }; + + /** Contains determiners*/ + public static final Set determiners=new FinalSet( + "the", + "a", + "an", + "this", + "these", + "those" + ); + + /** Holds prepositions (like "of" etc.) */ + public static final FinalSet prepositions=new FinalSet( + ",", + "at", + "about", + "and", + "by", + "for", + "from", + "in", + "of", + "on", + "to", + "with", + "who", + "-", + "\u2248", + "under" + ); + + /** Holds the original noun group */ + protected String original; + + /** Holds the adjective */ + protected String adjective; + + /** Holds the preposition */ + protected String preposition; + + /** Holds the noun group after the preposition */ + protected NounGroup postModifier; + + /** Holds the head of the noun group */ + protected String head; + + /** Holds the modifiers before the head */ + protected String preModifier; + + /** Holds the determiner (if any) */ + protected String determiner; + + /** Returns the adjective. */ + public String adjective() { + return adjective; + } + + /**Returns the determiner. */ + public String determiner() { + return determiner; + } + + /** Returns the head (lowercased singular). */ + public String head() { + return head; + } + + /**Returns the original. */ + public String original() { + return original; + } + + /** Returns the postModifier. */ + public NounGroup postModifier() { + return postModifier; + } + + /** Returns the preModifier. 
*/ + public String preModifier() { + return preModifier; + } + + /** Returns the preposition.*/ + public String preposition() { + return preposition; + } + + /** Returns the full name with the head word stemmed */ + public String stemmed() { + StringBuilder full=new StringBuilder(); + if(preModifier!=null) full.append(preModifier).append(' '); + full.append(PlingStemmer.stem(head.toLowerCase())); + if(adjective!=null) full.append(' ').append(adjective); + if(preposition!=null) full.append(' ').append(preposition); + if(postModifier!=null) full.append(' ').append(postModifier.original()); + return(full.toString()); + } + + /** Stems the head. TRUE if this had any effect */ + public boolean stemHead() { + String stemmed=PlingStemmer.stem(head); + boolean result=!stemmed.equals(head); + head=stemmed; + return(result); + } + /** Constructs a noun group from a String */ + public NounGroup(String s) { + this(Arrays.asList(s.split("[\\s_]+"))); + } + + /** Constructs a noun group from a list of words */ + public NounGroup(List words) { + // Assemble the original + original=words.toString().replace(", ", " "); + original=original.substring(1,original.length()-1); + + // Cut away preceding determiners + if(words.size()>0 && determiners.contains(words.get(0).toLowerCase())) { + determiner=words.get(0).toLowerCase(); + words=words.subList(1, words.size()); + } + + // Locate prepositions (but not in first or last position) + int prepPos; + for(prepPos=1;prepPos1 && isAdjective.apply(words.get(prepPos-1))) { + adjective=words.get(prepPos-1); + words=words.subList(0, prepPos-1); + } else { + words=words.subList(0, prepPos); + } + } + + if(words.size()==0) return; + + head=words.get(words.size()-1); + if(words.size()>1) { + preModifier=words.subList(0, words.size()-1).toString().replace(", ", "_"); + preModifier=preModifier.substring(1, preModifier.length()-1); + } + } + + + /** Checks if the originals match */ + public boolean equals(Object o) { + return(o instanceof NounGroup 
&& ((NounGroup)o).original.equals(original)); + } + + /** Returns the original */ + public String toString() { + return(original); + } + + /** Returns all fields in a String */ + public String description() { + return("NounGroup:\n"+ + " Original: "+original+"\n"+ + " Stemmed: "+stemmed()+"\n"+ + " Determiner: "+determiner+"\n"+ + " preModifiers: "+preModifier+"\n"+ + " Head: "+head+"\n"+ + " Adjective: "+adjective+"\n"+ + " Preposition: "+preposition+"\n"+ + " postModifier: \n"+(postModifier==null?"":postModifier.description())); + } + + /** Test method */ + public static void main(String[] args) throws Exception { + D.p("Enter a noun group and press ENTER. Press CTRL+C to abort"); +// while(true) { +// D.p(new NounGroup(D.r()).description()); +// } + + D.p(new NounGroup("Star_Trek_characters").description()); + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PlingStemmer.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PlingStemmer.java new file mode 100644 index 0000000..277efc0 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PlingStemmer.java @@ -0,0 +1,923 @@ +package org.yago.javatools.parsers; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.Map; +import java.util.Set; + +import org.yago.javatools.datatypes.FinalMap; +import org.yago.javatools.datatypes.FinalSet; + +/** +This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). +It is licensed under the Creative Commons Attribution License +(see http://creativecommons.org/licenses/by/3.0) by +the YAGO-NAGA team (see http://mpii.de/yago-naga). + + + + + + The PlingStemmer stems an English noun (plural or singular) to its singular + form. It deals with "firemen"->"fireman", it knows Greek stuff like + "appendices"->"appendix" and yes, it was a lot of work to compile these exceptions. + Examples: +
+      System.out.println(PlingStemmer.stem("boy"));
+      ----> boy
+      System.out.println(PlingStemmer.stem("boys"));
+      ----> boy
+      System.out.println(PlingStemmer.stem("biophysics"));
+      ---->  biophysics
+      System.out.println(PlingStemmer.stem("automata"));
+      ----> automaton
+      System.out.println(PlingStemmer.stem("genus"));
+      ----> genus
+      System.out.println(PlingStemmer.stem("emus"));
+      ----> emu
+  

+ + There are a number of word forms that can either be plural or singular. + Examples include "physics" (the science or the plural of "physic" (the + medicine)), "quarters" (the housing or the plural of "quarter" (1/4)) + or "people" (the singular of "peoples" or the plural of "person"). In + these cases, the stemmer assumes the word is a plural form and returns + the singular form. The methods isPlural, isSingular and isPluralAndSingular + can be used to differentiate the cases.

+ + It cannot be guaranteed that the stemmer correctly stems a plural word + or correctly ignores a singular word -- let alone that it treats an + ambiguous word form in the way expected by the user.

+ + The PlingStemmer uses material from WordNet.

+ It requires the class FinalSet from the + Java Tools. +*/ +public class PlingStemmer { + + /** Tells whether a word form is plural. This method just checks whether the + * stem method alters the word */ + public static boolean isPlural(String s) { + return(!s.equals(stem(s))); + } + + /** Tells whether a word form is singular. Note that a word can be both plural and singular */ + public static boolean isSingular(String s) { + return(singAndPlur.contains(s.toLowerCase()) || !isPlural(s)); + } + + /** Tells whether a word form is the singular form of one word and at + * the same time the plural form of another.*/ + public static boolean isSingularAndPlural(String s) { + return(singAndPlur.contains(s.toLowerCase())); + } + + /** Cuts a suffix from a string (that is the number of chars given by the suffix) */ + public static String cut(String s, String suffix) { + return(s.substring(0,s.length()-suffix.length())); + } + + /** Returns true if a word is probably not Latin */ + public static boolean noLatin(String s) { + return(s.indexOf('h')>0 || s.indexOf('j')>0 || s.indexOf('k')>0 || + s.indexOf('w')>0 || s.indexOf('y')>0 || s.indexOf('z')>0 || + s.indexOf("ou")>0 || s.indexOf("sh")>0 || s.indexOf("ch")>0 || + s.endsWith("aus")); + } + + /** Returns true if a word is probably Greek */ + private static boolean greek(String s) { + return(s.indexOf("ph")>0 || s.indexOf('y')>0 && s.endsWith("nges")); + } + + /** Stems an English noun */ + public static String stem(String s) { + String stem = s; + + // Handle irregular ones + String irreg=irregular.get(s); + if(irreg!=null) return(stem=irreg); + + // -on to -a + if(categoryON_A.contains(s)) return(stem=cut(s,"a")+"on"); + + // -um to -a + if(categoryUM_A.contains(s)) return(stem=cut(s,"a")+"um"); + + // -x to -ices + if(categoryIX_ICES.contains(s)) return(stem=cut(s,"ices")+"ix"); + + // -o to -i + if(categoryO_I.contains(s)) return(stem=cut(s,"i")+"o"); + + // -se to ses + if(categorySE_SES.contains(s)) 
return(stem=cut(s,"s")); + + // -is to -es + if(categoryIS_ES.contains(s) || s.endsWith("theses")) return(stem=cut(s,"es")+"is"); + + // -us to -i + if(categoryUS_I.contains(s)) return(stem=cut(s,"i")+"us"); + //Wrong plural + if(s.endsWith("uses") && (categoryUS_I.contains(cut(s,"uses")+"i") || + s.equals("genuses") || s.equals("corpuses"))) return(stem=cut(s,"es")); + + // -ex to -ices + if(categoryEX_ICES.contains(s)) return(stem=cut(s,"ices")+"ex"); + + // Words that do not inflect in the plural + if(s.endsWith("ois") || s.endsWith("itis") || category00.contains(s) || categoryICS.contains(s)) return(stem=s); + + // -en to -ina + // No other common words end in -ina + if(s.endsWith("ina")) return(stem=cut(s,"en")); + + // -a to -ae + // No other common words end in -ae + if(s.endsWith("ae")) return(stem=cut(s,"e")); + + // -a to -ata + // No other common words end in -ata + if(s.endsWith("ata")) return(stem=cut(s,"ta")); + + // trix to -trices + // No common word ends with -trice(s) + if(s.endsWith("trices")) return(stem=cut(s,"trices")+"trix"); + + // -us to -us + //No other common word ends in -us, except for false plurals of French words + //Catch words that are not latin or known to end in -u + if(s.endsWith("us") && !s.endsWith("eaus") && !s.endsWith("ieus") && !noLatin(s) + && !categoryU_US.contains(s)) return(stem=s); + + // -tooth to -teeth + // -goose to -geese + // -foot to -feet + // -zoon to -zoa + //No other common words end with the indicated suffixes + if(s.endsWith("teeth")) return(stem=cut(s,"teeth")+"tooth"); + if(s.endsWith("geese")) return(stem=cut(s,"geese")+"goose"); + if(s.endsWith("feet")) return(stem=cut(s,"feet")+"foot"); + if(s.endsWith("zoa")) return(stem=cut(s,"zoa")+"zoon"); + + // -eau to -eaux + //No other common words end in eaux + if(s.endsWith("eaux")) return(stem=cut(s,"x")); + + // -ieu to -ieux + //No other common words end in ieux + if(s.endsWith("ieux")) return(stem=cut(s,"x")); + + // -nx to -nges + // Pay attention not 
to kill words ending in -nge with plural -nges + // Take only Greek words (works fine, only a handfull of exceptions) + if(s.endsWith("nges") && greek(s)) return(stem=cut(s,"nges")+"nx"); + + // -[sc]h to -[sc]hes + //No other common word ends with "shes", "ches" or "she(s)" + //Quite a lot end with "che(s)", filter them out + if(s.endsWith("shes") || s.endsWith("ches") && !categoryCHE_CHES.contains(s)) return(stem=cut(s,"es")); + + // -ss to -sses + // No other common singular word ends with "sses" + // Filter out those ending in "sse(s)" + if(s.endsWith("sses") && !categorySSE_SSES.contains(s) && !s.endsWith("mousses")) return(stem=cut(s,"es")); + + // -x to -xes + // No other common word ends with "xe(s)" except for "axe" + if(s.endsWith("xes") && !s.equals("axes")) return(stem=cut(s,"es")); + + // -[nlw]ife to -[nlw]ives + //No other common word ends with "[nlw]ive(s)" except for olive + if(s.endsWith("nives") || s.endsWith("lives") && !s.endsWith("olives") || + s.endsWith("wives")) return(stem=cut(s,"ves")+"fe"); + + // -[aeo]lf to -ves exceptions: valve, solve + // -[^d]eaf to -ves exceptions: heave, weave + // -arf to -ves no exception + if(s.endsWith("alves") && !s.endsWith("valves") || + s.endsWith("olves") && !s.endsWith("solves") || + s.endsWith("eaves") && !s.endsWith("heaves") && !s.endsWith("weaves") || + s.endsWith("arves") ) return(stem=cut(s,"ves")+"f"); + + // -y to -ies + // -ies is very uncommon as a singular suffix + // but -ie is quite common, filter them out + if(s.endsWith("ies") && !categoryIE_IES.contains(s)) return(stem=cut(s,"ies")+"y"); + + // -o to -oes + // Some words end with -oe, so don't kill the "e" + if(s.endsWith("oes") && !categoryOE_OES.contains(s)) return(stem=cut(s,"es")); + + // -s to -ses + // -z to -zes + // no words end with "-ses" or "-zes" in singular + if(s.endsWith("ses") || s.endsWith("zes") ) return(stem=cut(s,"es")); + + // - to -s + if(s.endsWith("s") && !s.endsWith("ss") && !s.endsWith("is")) 
return(stem=cut(s,"s")); + + return stem; + } + + /** Words that end in "-se" in their plural forms (like "nurse" etc.)*/ + public static Set categorySE_SES=new FinalSet( + "nurses", + "cruises", + "premises", + "houses" + ); + + /** Words that do not have a distinct plural form (like "atlas" etc.)*/ + public static Set category00=new FinalSet( + "alias", + "asbestos", + "atlas", + "barracks", + "bathos", + "bias", + "breeches", + "britches", + "canvas", + "chaos", + "clippers", + "contretemps", + "corps", + "cosmos", + "crossroads", + "diabetes", + "ethos", + "gallows", + "gas", + "graffiti", + "headquarters", + "herpes", + "high-jinks", + "innings", + "jackanapes", + "lens", + "means", + "measles", + "mews", + "mumps", + "news", + "pathos", + "pincers", + "pliers", + "proceedings", + "rabies", + "rhinoceros", + "sassafras", + "scissors", + "series", + "shears", + "species", + "tuna" + ); + + /** Words that change from "-um" to "-a" (like "curriculum" etc.), listed in their plural forms*/ + public static Set categoryUM_A=new FinalSet( + "addenda", + "agenda", + "aquaria", + "bacteria", + "candelabra", + "compendia", + "consortia", + "crania", + "curricula", + "data", + "desiderata", + "dicta", + "emporia", + "enconia", + "errata", + "extrema", + "gymnasia", + "honoraria", + "interregna", + "lustra", + "maxima", + "media", + "memoranda", + "millenia", + "minima", + "momenta", + "optima", + "ova", + "phyla", + "quanta", + "rostra", + "spectra", + "specula", + "stadia", + "strata", + "symposia", + "trapezia", + "ultimata", + "vacua", + "vela" + ); + + /** Words that change from "-on" to "-a" (like "phenomenon" etc.), listed in their plural forms*/ + public static Set categoryON_A=new FinalSet( + "aphelia", + "asyndeta", + "automata", + "criteria", + "hyperbata", + "noumena", + "organa", + "perihelia", + "phenomena", + "prolegomena" + ); + + /** Words that change from "-o" to "-i" (like "libretto" etc.), listed in their plural forms*/ + public static Set 
categoryO_I=new FinalSet( + "alti", + "bassi", + "canti", + "contralti", + "crescendi", + "libretti", + "soli", + "soprani", + "tempi", + "virtuosi" + ); + + /** Words that change from "-us" to "-i" (like "fungus" etc.), listed in their plural forms*/ + public static Set categoryUS_I=new FinalSet( + "alumni", + "bacilli", + "cacti", + "foci", + "fungi", + "genii", + "hippopotami", + "incubi", + "nimbi", + "nuclei", + "nucleoli", + "octopi", + "radii", + "stimuli", + "styli", + "succubi", + "syllabi", + "termini", + "tori", + "umbilici", + "uteri" + ); + + /** Words that change from "-ix" to "-ices" (like "appendix" etc.), listed in their plural forms*/ + public static Set categoryIX_ICES=new FinalSet( + "appendices", + "cervices" + ); + + /** Words that change from "-is" to "-es" (like "axis" etc.), listed in their plural forms*/ + public static Set categoryIS_ES=new FinalSet( + // plus everybody ending in theses + "analyses", + "axes", + "bases", + "crises", + "diagnoses", + "ellipses", + "emphases", + "neuroses", + "oases", + "paralyses", + "synopses" + ); + + /** Words that change from "-oe" to "-oes" (like "toe" etc.), listed in their plural forms*/ + public static Set categoryOE_OES=new FinalSet( + "aloes", + "backhoes", + "beroes", + "canoes", + "chigoes", + "cohoes", + "does", + "felloes", + "floes", + "foes", + "gumshoes", + "hammertoes", + "hoes", + "hoopoes", + "horseshoes", + "leucothoes", + "mahoes", + "mistletoes", + "oboes", + "overshoes", + "pahoehoes", + "pekoes", + "roes", + "shoes", + "sloes", + "snowshoes", + "throes", + "tic-tac-toes", + "tick-tack-toes", + "ticktacktoes", + "tiptoes", + "tit-tat-toes", + "toes", + "toetoes", + "tuckahoes", + "woes" + ); + + /** Words that change from "-ex" to "-ices" (like "index" etc.), listed in their plural forms*/ + public static Set categoryEX_ICES=new FinalSet( + "apices", + "codices", + "cortices", + "indices", + "latices", + "murices", + "pontifices", + "silices", + "simplices", + "vertices", + 
"vortices" + ); + + /** Words that change from "-u" to "-us" (like "emu" etc.), listed in their plural forms*/ + public static Set categoryU_US=new FinalSet( + "apercus", + "barbus", + "cornus", + "ecrus", + "emus", + "fondus", + "gnus", + "iglus", + "mus", + "nandus", + "napus", + "poilus", + "quipus", + "snafus", + "tabus", + "tamandus", + "tatus", + "timucus", + "tiramisus", + "tofus", + "tutus" + ); + + /** Words that change from "-sse" to "-sses" (like "finesse" etc.), listed in their plural forms*/ + public static Set categorySSE_SSES=new FinalSet( + //plus those ending in mousse + "bouillabaisses", + "coulisses", + "crevasses", + "crosses", + "cuisses", + "demitasses", + "ecrevisses", + "fesses", + "finesses", + "fosses", + "impasses", + "lacrosses", + "largesses", + "masses", + "noblesses", + "palliasses", + "pelisses", + "politesses", + "posses", + "tasses", + "wrasses" + ); + + /** Words that change from "-che" to "-ches" (like "brioche" etc.), listed in their plural forms*/ + public static Set categoryCHE_CHES=new FinalSet( + "adrenarches", + "attaches", + "avalanches", + "barouches", + "brioches", + "caches", + "caleches", + "caroches", + "cartouches", + "cliches", + "cloches", + "creches", + "demarches", + "douches", + "gouaches", + "guilloches", + "headaches", + "heartaches", + "huaraches", + "menarches", + "microfiches", + "moustaches", + "mustaches", + "niches", + "panaches", + "panoches", + "pastiches", + "penuches", + "pinches", + "postiches", + "psyches", + "quiches", + "schottisches", + "seiches", + "soutaches", + "synecdoches", + "thelarches", + "troches" + ); + + /** Words that end with "-ics" and do not exist as nouns without the 's' (like "aerobics" etc.)*/ + public static Set categoryICS=new FinalSet( + "aerobatics", + "aerobics", + "aerodynamics", + "aeromechanics", + "aeronautics", + "alphanumerics", + "animatronics", + "apologetics", + "architectonics", + "astrodynamics", + "astronautics", + "astrophysics", + "athletics", + 
"atmospherics", + "autogenics", + "avionics", + "ballistics", + "bibliotics", + "bioethics", + "biometrics", + "bionics", + "bionomics", + "biophysics", + "biosystematics", + "cacogenics", + "calisthenics", + "callisthenics", + "catoptrics", + "civics", + "cladistics", + "cryogenics", + "cryonics", + "cryptanalytics", + "cybernetics", + "cytoarchitectonics", + "cytogenetics", + "diagnostics", + "dietetics", + "dramatics", + "dysgenics", + "econometrics", + "economics", + "electromagnetics", + "electronics", + "electrostatics", + "endodontics", + "enterics", + "ergonomics", + "eugenics", + "eurhythmics", + "eurythmics", + "exodontics", + "fibreoptics", + "futuristics", + "genetics", + "genomics", + "geographics", + "geophysics", + "geopolitics", + "geriatrics", + "glyptics", + "graphics", + "gymnastics", + "hermeneutics", + "histrionics", + "homiletics", + "hydraulics", + "hydrodynamics", + "hydrokinetics", + "hydroponics", + "hydrostatics", + "hygienics", + "informatics", + "kinematics", + "kinesthetics", + "kinetics", + "lexicostatistics", + "linguistics", + "lithoglyptics", + "liturgics", + "logistics", + "macrobiotics", + "macroeconomics", + "magnetics", + "magnetohydrodynamics", + "mathematics", + "metamathematics", + "metaphysics", + "microeconomics", + "microelectronics", + "mnemonics", + "morphophonemics", + "neuroethics", + "neurolinguistics", + "nucleonics", + "numismatics", + "obstetrics", + "onomastics", + "orthodontics", + "orthopaedics", + "orthopedics", + "orthoptics", + "paediatrics", + "patristics", + "patristics", + "pedagogics", + "pediatrics", + "periodontics", + "pharmaceutics", + "pharmacogenetics", + "pharmacokinetics", + "phonemics", + "phonetics", + "phonics", + "photomechanics", + "physiatrics", + "pneumatics", + "poetics", + "politics", + "pragmatics", + "prosthetics", + "prosthodontics", + "proteomics", + "proxemics", + "psycholinguistics", + "psychometrics", + "psychonomics", + "psychophysics", + "psychotherapeutics", + "robotics", + 
"semantics", + "semiotics", + "semitropics", + "sociolinguistics", + "stemmatics", + "strategics", + "subtropics", + "systematics", + "tectonics", + "telerobotics", + "therapeutics", + "thermionics", + "thermodynamics", + "thermostatics" + ); + + /** Words that change from "-ie" to "-ies" (like "auntie" etc.), listed in their plural forms*/ + public static Set categoryIE_IES=new FinalSet( + "aeries", + "anomies", + "aunties", + "baddies", + "beanies", + "birdies", + "boccies", + "bogies", + "bolshies", + "bombies", + "bonhomies", + "bonxies", + "booboisies", + "boogies", + "boogie-woogies", + "bookies", + "booties", + "bosies", + "bourgeoisies", + "brasseries", + "brassies", + "brownies", + "budgies", + "byrnies", + "caddies", + "calories", + "camaraderies", + "capercaillies", + "capercailzies", + "cassies", + "catties", + "causeries", + "charcuteries", + "chinoiseries", + "collies", + "commies", + "cookies", + "coolies", + "coonties", + "cooties", + "corries", + "coteries", + "cowpies", + "cowries", + "cozies", + "crappies", + "crossties", + "curies", + "dachsies", + "darkies", + "dassies", + "dearies", + "dickies", + "dies", + "dixies", + "doggies", + "dogies", + "dominies", + "dovekies", + "eyries", + "faeries", + "falsies", + "floozies", + "folies", + "foodies", + "freebies", + "gaucheries", + "gendarmeries", + "genies", + "ghillies", + "gillies", + "goalies", + "goonies", + "grannies", + "grotesqueries", + "groupies", + "hankies", + "hippies", + "hoagies", + "honkies", + "hymies", + "indies", + "junkies", + "kelpies", + "kilocalories", + "knobkerries", + "koppies", + "kylies", + "laddies", + "lassies", + "lies", + "lingeries", + "magpies", + "magpies", + "marqueteries", + "mashies", + "mealies", + "meanies", + "menageries", + "millicuries", + "mollies", + "facts1", + "moxies", + "neckties", + "newbies", + "nighties", + "nookies", + "oldies", + "organdies", + "panties", + "parqueteries", + "passementeries", + "patisseries", + "pies", + "pinkies", + "pixies", + 
"porkpies", + "potpies", + "prairies", + "preemies", + "premies", + "punkies", + "pyxies", + "quickies", + "ramies", + "reveries", + "rookies", + "rotisseries", + "scrapies", + "sharpies", + "smoothies", + "softies", + "stoolies", + "stymies", + "swaggies", + "sweeties", + "talkies", + "techies", + "ties", + "tooshies", + "toughies", + "townies", + "veggies", + "walkie-talkies", + "wedgies", + "weenies", + "weirdies", + "yardies", + "yuppies", + "zombies" + ); + + /** Maps irregular Germanic English plural nouns to their singular form */ + public static Map irregular=new FinalMap( + "beefs","beef", + "beeves","beef", + "brethren","brother", + "busses","bus", + "cattle","cattlebeast", + "children","child", + "corpora","corpus", + "ephemerides","ephemeris", + "firemen","fireman", + "genera","genus", + "genies","genie", + "genii","genie", + "kine","cow", + "lice","louse", + "men","man", + "mice","mouse", + "mongooses","mongoose", + "monies","money", + "mythoi","mythos", + "octopodes","octopus", + "octopuses","octopus", + "oxen","ox", + "people","person", + "soliloquies","soliloquy", + "throes","throes", + "trilbys","trilby", + "women","woman" + ); + + /** Contains word forms that can either be plural or singular */ + public static Set singAndPlur=new FinalSet( + "acoustics", + "aestetics", + "aquatics", + "basics", + "ceramics", + "classics", + "cosmetics", + "dermatoglyphics", + "dialectics", + "dynamics", + "esthetics", + "ethics", + "harmonics", + "heroics", + "isometrics", + "mechanics", + "metrics", + "statistics", + "optic", + "people", + "physics", + "polemics", + "premises", + "propaedeutics", + "pyrotechnics", + "quadratics", + "quarters", + "statistics", + "tactics", + "tropics" + ); + + /** Test routine */ + public static void main(String[] argv) throws Exception { + System.out.println("Enter an English word in plural form and press ENTER"); + BufferedReader in=new BufferedReader(new InputStreamReader(System.in)); + while(true) { + String w=in.readLine(); + 
if(w.length()==0) break; + if(isPlural(w)) System.out.println("This word is plural"); + if(isSingular(w)) System.out.println("This word is singular"); + System.out.println("Stemmed to singular: "+stem(w)); + } + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PositionTracker.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PositionTracker.java new file mode 100644 index 0000000..4c07240 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PositionTracker.java @@ -0,0 +1,288 @@ +package org.yago.javatools.parsers; + +import java.util.Collection; +import java.util.Iterator; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; + +/** + * This class is part of the Java Tools (see + * http://mpii.de/yago-naga/javatools). It is licensed under the Creative + * Commons Attribution License (see http://creativecommons.org/licenses/by/3.0) + * by the YAGO-NAGA team (see http://mpii.de/yago-naga). + * + * This class implements position change trackers that keep track of position + * changes within a String, e.g. caused through normalization etc. + * This allows for instance, given a position int the normalized string + * to get the corresponding position in the original non-normalized string + * + * + * + * backward position tracker - + * tracking several replacement/text changes allowing to trace a position in the modified + * text back to the corresp. 
/**
 * Backward position tracker: records replacement/length changes made to a
 * text so that a position in the MODIFIED text can be traced back to the
 * corresponding position in the ORIGINAL text. For the other direction see
 * the nested ForwardPositionTracker.
 *
 * Usage: call addPositionChange(pos, modifier) for each length change of one
 * pass over the text, then closeRun() after the pass; repeat for further
 * passes; finally query translatePosition.
 *
 * NOTE(review): all generic type parameters in this file were stripped by the
 * extraction that produced the source (raw "SortedMappositionMap" etc.);
 * they are reconstructed here as Integer->Integer, which is what every use
 * site requires.
 *
 * @author smetzger
 */
public class PositionTracker {

  /** Closed runs: maps a position in the modified text to the offset to add
   * to reach the original text (prefix sums via headMap). */
  private SortedMap<Integer, Integer> positionMap;
  /** Changes of the currently open run, keyed by modified-text position. */
  private SortedMap<Integer, Integer> positionChanges;
  /** Changes of the currently open run, keyed by original-text position. */
  private SortedMap<Integer, Integer> old2NewMap;
  /** Sum of all modifiers of the currently open run. */
  private int accumulatedModifier = 0;

  public PositionTracker() {
    positionMap = new TreeMap<Integer, Integer>();
    positionChanges = new TreeMap<Integer, Integer>();
    old2NewMap = new TreeMap<Integer, Integer>();
  }

  /** Records a length change of {@code modifier} characters at position
   * {@code pos} (in the text as it was before this change). */
  public void addPositionChange(int pos, int modifier) {
    if (modifier != 0) {
      int oldModifier = 0;
      old2NewMap.put(pos, modifier);
      accumulatedModifier += modifier;
      // Key by the position in the NEW text; store the inverse modifier so
      // that translation maps new positions back to old ones.
      if (positionChanges.containsKey(pos + accumulatedModifier))
        oldModifier = positionChanges.get(pos + accumulatedModifier);
      positionChanges.put(pos + accumulatedModifier, modifier * -1 + oldModifier);
    }
  }

  /**
   * Closes the current changing run by merging the new position changes into
   * the existing position-change map. Call this once after every full pass
   * over the text, before submitting changes from a new pass.
   */
  public void closeRun() {
    if (positionChanges.isEmpty())
      return;

    SortedMap<Integer, Integer> temp = positionChanges;

    // Re-key previously closed changes to the coordinates of the new text.
    while (!positionMap.isEmpty()) {
      Integer key = positionMap.firstKey();
      Collection<Integer> modifiers = old2NewMap.headMap(key + 1).values();
      Integer newposition = key;
      for (Iterator<Integer> it = modifiers.iterator(); it.hasNext(); newposition += it.next()) {}
      Integer value = positionMap.get(key);
      if (positionChanges.containsKey(newposition))
        value += positionChanges.get(newposition);
      positionChanges.put(newposition, value);
      positionMap.remove(key);
    }

    // Swap: the merged map becomes the closed map; the (now empty) old map
    // is reused for the next run.
    positionChanges = positionMap;
    positionMap = temp;
    old2NewMap.clear();
    accumulatedModifier = 0;
  }

  /** Translates a position in the modified text to the corresponding
   * position in the original text (sum of all modifiers at or before it). */
  public Integer translatePosition(Integer pos) {
    SortedMap<Integer, Integer> headMap = positionMap.headMap(pos + 1);
    Integer modifier = 0;
    for (Iterator<Integer> it = headMap.values().iterator(); it.hasNext(); modifier += it.next()) {}
    return pos + modifier;
  }

  /**
   * Forward position tracker: traces a position in the ORIGINAL text along
   * the recorded modifications to the corresponding position in the MODIFIED
   * text. Call closeRun() after each pass, as for PositionTracker.
   * REMARK: not tested with more than one run -- may be erroneous with
   * multiple runs; use with care (works with a single run).
   *
   * @author smetzger
   */
  public static class ForwardPositionTracker {

    /** Closed runs, keyed by original-text position. */
    private SortedMap<Integer, Integer> positionMap;
    /** Changes of the currently open run. */
    private SortedMap<Integer, Integer> positionChanges;
    /** Inverse tracker used to re-key changes to original-text positions. */
    private PositionTracker new2OldTracker = null;
    private int accumulatedModifier = 0;

    public ForwardPositionTracker() {
      positionMap = new TreeMap<Integer, Integer>();
      positionChanges = new TreeMap<Integer, Integer>();
      new2OldTracker = new PositionTracker();
    }

    /** Records a length change of {@code modifier} characters at position
     * {@code pos}. */
    public void addPositionChange(int pos, int modifier) {
      if (modifier != 0) {
        positionChanges.put(pos, modifier);
        accumulatedModifier += modifier;
        new2OldTracker.addPositionChange(pos, modifier);
      }
    }

    /** Closes the current changing run by merging new position changes into
     * the existing position-change map. Call after every pass over the text. */
    public void closeRun() {
      if (positionChanges.isEmpty())
        return;

      for (Map.Entry<Integer, Integer> change : positionChanges.entrySet()) {
        Integer positionInOrigStream = new2OldTracker.translatePosition(change.getKey());
        if (positionMap.containsKey(positionInOrigStream))
          positionMap.put(positionInOrigStream, change.getValue() + positionMap.get(positionInOrigStream));
        else
          positionMap.put(positionInOrigStream, change.getValue());
      }

      positionChanges.clear();
      accumulatedModifier = 0;
      new2OldTracker.closeRun();
    }

    /**
     * Tells whether a position in the original stream has been cut away by
     * some change operation, such that translating it usually makes no sense.
     * @return true iff the given position has been cut away.
     * TODO: only works reliably with a single change run without overlapping
     * changes; with more, all following changes would need checking, not
     * just the next one.
     */
    public boolean hasBeenCutAway(Integer pos) {
      SortedMap<Integer, Integer> tailMap = positionMap.tailMap(pos + 1);
      if (tailMap.isEmpty())
        return false;
      Integer key = tailMap.firstKey();
      Integer modifier = tailMap.get(key);
      // A deletion spanning [key+modifier, key) swallowed this position.
      return modifier < 0 && key + modifier <= pos;
    }

    /** Translates a position in the original text to the corresponding
     * position in the modified text. */
    public Integer translatePosition(Integer pos) {
      SortedMap<Integer, Integer> headMap = positionMap.headMap(pos + 1);
      Integer modifier = 0;
      for (Iterator<Integer> it = headMap.values().iterator(); it.hasNext(); modifier += it.next()) {}
      return pos + modifier;
    }

    /**
     * Like translatePosition, but also maps positions inside cut-out text
     * parts (to the start of the cut).
     * TODO: same single-run / non-overlapping restriction as hasBeenCutAway.
     */
    public Integer translatePositionExactly(Integer pos) {
      SortedMap<Integer, Integer> tailMap = positionMap.tailMap(pos + 1);
      if (tailMap.isEmpty()) {
        return translatePosition(pos);
      } else {
        Integer key = tailMap.firstKey();
        Integer modifier = tailMap.get(key);
        return translatePosition(Math.min(pos, key + modifier));
      }
    }
  }
}
+ */ +public class AppTest + extends TestCase +{ + /** + * Create the test case + * + * @param testName name of the test case + */ + public AppTest( String testName ) + { + super( testName ); + } + + /** + * @return the suite of tests being tested + */ + public static Test suite() + { + return new TestSuite( AppTest.class ); + } + + /** + * Rigourous Test :-) + */ + public void testApp() + { + assertTrue( true ); + } +}