diff --git a/WikipediaCategoryProcessor/.gitignore b/WikipediaCategoryProcessor/.gitignore new file mode 100644 index 0000000..5168a9a --- /dev/null +++ b/WikipediaCategoryProcessor/.gitignore @@ -0,0 +1,4 @@ +/target/ +/nbactions.xml +/nbactions-release-profile.xml + diff --git a/WikipediaCategoryProcessor/pom.xml b/WikipediaCategoryProcessor/pom.xml new file mode 100644 index 0000000..f85ebb9 --- /dev/null +++ b/WikipediaCategoryProcessor/pom.xml @@ -0,0 +1,69 @@ + + + 4.0.0 + + org.karsha + WikipediaCategoryProcessor + 1.0-SNAPSHOT + jar + + WikipediaCategoryProcessor + http://maven.apache.org + + + UTF-8 + + + + + junit + junit + 3.8.1 + test + + + mysql + mysql-connector-java + 5.1.25 + + + org.apache.lucene + lucene-core + 4.3.1 + + + com.jayway.jsonpath + json-path + 0.8.1 + + + + org.apache.clerezza.ext + org.json.simple + 0.3-incubating + + + com.google.api-client + google-api-client + 1.16.0-rc + + + + org.apache.lucene + lucene-queries + 4.3.1 + + + org.apache.lucene + lucene-queryparser + 4.3.1 + + + org.apache.lucene + lucene-analyzers-common + 4.3.1 + + + + diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Category.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Category.java new file mode 100644 index 0000000..8d5442c --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Category.java @@ -0,0 +1,25 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + + + +/** + * + * Date Author Changes + * Jul 20, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.categoryprocessor; + + + +/** + * TODO- describe the purpose of the class + * + */ +public class Category { + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryDB.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryDB.java new file mode 100644 index 0000000..cf51917 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryDB.java @@ -0,0 +1,286 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * Date Author Changes Jul 6, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.categoryprocessor; + + +import java.io.*; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; + +/** + * TODO- describe the purpose of the class + * + */ +public class CategoryDB +{ + + + + public static int getCategoryPageCount( int threshold ) + { + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "SELECT COUNT(*) FROM `page_category` WHERE `cat_subcats`=0 AND `cat_pages`< ? 
"; + + + try + { + ps = connection.prepareStatement( query ); + ps.setInt( 1, threshold ); + + rs = ps.executeQuery(); + int nodeId = 0; + while ( rs.next() ) + { + nodeId = rs.getInt( 1 ); + } + return nodeId; + } catch ( SQLException e ) + { + e.printStackTrace(); + return 0; + } + + } + + public static void getCategoryByName(String line) throws IOException + { + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + FileWriter outFile; + + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + String temp = null; + + + + // System.out.println(line); + // System.out.println(temp); + + String query = "SELECT cat_id, cat_title,cat_pages,cat_subcats,cat_files,cat_hidden FROM `category` WHERE `cat_title` LIKE ? "; +//String query = "SELECT cat_id, cat_title,cat_pages,cat_subcats,cat_files,cat_hidden FROM `category` WHERE `cat_title` = ? "; +//String query = "SELECT cat_id, cat_title,cat_pages,cat_subcats,cat_files,cat_hidden FROM `category` WHERE `cat_title` ="+catTitle; + + + try + { + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + ps.setString( 1, line ); + rs = ps.executeQuery(); + int count = 0; + + if ( rs.next() ) + { + do + { + //outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\category_match_article_pages.txt", true ); + //outFile.append( rs.getString( "cat_id" ) + "\t" + rs.getString( "cat_title" ) + "\t" + rs.getString( "cat_pages" ) + "\t" + rs.getString( "cat_subcats" ) + "\t" + rs.getString( "cat_files" ) + "\t" + rs.getString( "cat_hidden" ) + "\n" ); + // outFile.close(); + insertCategory( rs.getInt( "cat_id"), rs.getString( "cat_title" ), rs.getInt( "cat_pages"), rs.getInt( "cat_subcats"), rs.getInt( "cat_files"), rs.getBoolean( "cat_hidden" ) ); + count++; + if(count>1){ + System.out.println( count+" count is over one " + line); + } + } while ( rs.next() ); + } else + { + + outFile = new FileWriter( "F:\\Blogs\\GSOC 
2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categories_not_found_in_category_table_2.txt", true ); + outFile.append( line+ "\n" ); + outFile.close(); + + //System.out.println( line ); + // No data + } + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + + + + } + + + public static void getCategoryDirectedByArticlePage(String line) throws IOException + { + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + + + String lineArr[]; + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + lineArr=line.split("\t"); + + // System.out.println(line); + // System.out.println(temp); + String query = "SELECT cl_from, cl_to, cl_type FROM `categorylinks` WHERE `cl_from` =" + lineArr[0].trim() ; + + + try + { + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + //ps.setString( 1, catTitle ); + rs = ps.executeQuery(); + int count = 0; + + if ( rs.next() ) + { + do + { + FileWriter outFile = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categorylinks_match_article_pages_v1.txt", true ); + outFile.append( rs.getInt( "cl_from" ) + "\t" + rs.getString( "cl_to" ) + "\t" + rs.getString( "cl_type" ) + "\n" ); + outFile.close(); + count++; + } while ( rs.next() ); + } else + { + + FileWriter outFileCatNotFound = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categorylinks_not_found_article_pages_v1.txt", true ); + outFileCatNotFound.append( line + "\n" ); + outFileCatNotFound.close(); + + //System.out.println( line +"\t no category found"); + // No data + } + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + //} + //} + + + + } + + public static void getCategoryLinkByCatName(String line) throws IOException + { + DB_connection con = new DB_connection(); + Connection 
connection = con.dbConnect(); + + + + // String lineArr[]; + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + // lineArr=line.split("\t"); + + // System.out.println(line); + // System.out.println(temp); + String query = "SELECT cl_from FROM `categorylinks` WHERE `cl_to` LIKE " + line.trim() ; + + + try + { + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + //ps.setString( 1, catTitle ); + rs = ps.executeQuery(); + int count = 0; + + if ( rs.next() ) + { + do + { + + //if caegory does not have + if(!PageDB.isArticlePage( rs.getInt("cl_from") )){ + + } +// FileWriter outFile = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categorylinks_match_article_pages_v1.txt", true ); +// outFile.append( rs.getInt( "cl_from" ) + "\t" + rs.getString( "cl_to" ) + "\t" + rs.getString( "cl_type" ) + "\n" ); +// outFile.close(); +// count++; + } while ( rs.next() ); + } + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + //} + //} + + + + } + + public static void insertCategory( int cat_id,String cat_title, int cat_pages,int cat_subcats,int cat_files,boolean cat_hidden) + { + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + /* + * `cat_id` int(10) unsigned NOT NULL AUTO_INCREMENT, + `cat_title` varbinary(255) NOT NULL DEFAULT '', + `cat_pages` int(11) NOT NULL DEFAULT '0', + `cat_subcats` int(11) NOT NULL DEFAULT '0', + `cat_files` int(11) NOT NULL DEFAULT '0', + `cat_hidden` tinyint(1) unsigned NOT NULL DEFAULT '0', + */ + + String query = "INSERT IGNORE INTO page_category(cat_id,cat_title,cat_pages,cat_subcats,cat_files,cat_hidden) VALUES (?,?,?,?,?,?)"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt(1, cat_id); + ps.setString( 2, cat_title); + ps.setInt(3, cat_pages); + ps.setInt( 4, 
cat_subcats); + ps.setInt( 5, cat_files); + ps.setBoolean( 6, cat_hidden); + updateQuery = ps.executeUpdate(); + + connection.close(); + + } + catch(SQLException e) + { + e.printStackTrace(); + // return null; + } + + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryLinksDB.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryLinksDB.java new file mode 100644 index 0000000..a7af666 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryLinksDB.java @@ -0,0 +1,363 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * + * Date Author Changes Aug 13, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.categoryprocessor; + + +import java.io.*; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import org.apache.lucene.queryparser.classic.ParseException; +import org.dbpedia.kasun.searcher.Search; + +/** + * TODO- describe the purpose of the class + * + */ +public class CategoryLinksDB +{ + + public static void getCategoryByPageID() throws IOException + { + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + FileWriter outFile; + FileWriter outFile1; + int pageID; + String leafcategory; + + + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + String temp = null; + + + + + // System.out.println(line); + // System.out.println(temp); + + // String query = "SELECT cl_to FROM `categorylinks` WHERE `cl_from` = ? 
"; + + // String query = "SELECT `cl_to` FROM `category_only_page` JOIN `categorylinks` ON `category_only_page`.`page_id` = `categorylinks`.`cl_from` WHERE `page_title` = '"+leafcategory+"'"; + + try + { + + + File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\page_id_page_title_leaf_categories_page_less_than_90.txt" ); + + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( catPagesFile ) ); + //FileWriter outFile; + // FileWriter outFileCatNotFound; + + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + String splitLine[] = line.split( "\t" ); + leafcategory = splitLine[1].trim(); + pageID = Integer.valueOf( splitLine[0] ); + + String query = "SELECT `cl_to` FROM `categorylinks` WHERE `cl_from` = " + splitLine[0].trim(); + + + + + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + // ps.setInt( 1, pageID ); + rs = ps.executeQuery(); + int count = 0; + + if ( rs.next() ) + { + NodeDB.insertNode( pageID, leafcategory ); + // int childID= NodeDB.getCategoryId( leafcategory ); + do + { + //outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\category_match_article_pages.txt", true ); + //outFile.append( rs.getString( "cat_id" ) + "\t" + rs.getString( "cat_title" ) + "\t" + rs.getString( "cat_pages" ) + "\t" + rs.getString( "cat_subcats" ) + "\t" + rs.getString( "cat_files" ) + "\t" + rs.getString( "cat_hidden" ) + "\n" ); + // outFile.close(); + //insertCategory( rs.getInt( "cat_id"), rs.getString( "cat_title" ), rs.getInt( "cat_pages"), rs.getInt( "cat_subcats"), rs.getInt( "cat_files"), rs.getBoolean( "cat_hidden" ) ); + int parentID = PageDB.getPageId( rs.getString( "cl_to" ).trim() ); + if ( parentID > 0 ) + { + NodeDB.insertNode( parentID, rs.getString( "cl_to" ).trim() ); + // int parentID= NodeDB.getCategoryId( rs.getString( "cl_to" 
) ); + + EdgeDB.insertEdge( parentID, pageID ); + } else + { + outFile1 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\Parent_child_not_inderted_to_node_table.txt", true ); + outFile1.append( rs.getString( "cl_to" ).trim() + "\n" ); + outFile1.close(); + } + count++; + + } while ( rs.next() ); + } else + { + + outFile = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categories_pages_not_found_in_page_table.txt", true ); + outFile.append( pageID + "\t" + leafcategory + "\n" ); + outFile.close(); + + //System.out.println( line ); + // No data + } + + System.out.println( count ); + } + } + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + + + + } + + public static void insertParentChild() throws IOException, ParseException + { + + + FileWriter outFile; + FileWriter outFile1; + FileWriter outFile2; + int pageID; + // int catID; + String leafcategory; + + + + int updateQuery = 0; + String temp = null; + + + + + try + { + + + File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\leaf_categories_page_less_than_90.txt" ); + + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( catPagesFile ) ); + //FileWriter outFile; + // FileWriter outFileCatNotFound; + + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + String splitLine[] = line.split( "\t" ); + leafcategory = splitLine[1].trim(); + // catID= ; + pageID = PageDB.getPageId( leafcategory ); + + if ( pageID > 0 ) + { + NodeDB.insertNode( pageID, leafcategory ); + + /* + * search index and get the cl_to by pageID + */ + + ArrayList listOfClTo = Search.SearchCatPageLinks( pageID ); + + for ( int i = 0; i < listOfClTo.size(); i++ ) + { + + int parentID = PageDB.getPageId( listOfClTo.get( i ) ); + if ( parentID > 0 
) + { + NodeDB.insertNode( parentID, listOfClTo.get( i ) ); + // int parentID= NodeDB.getCategoryId( rs.getString( "cl_to" ) ); + + EdgeDB.insertEdge( parentID, pageID ); + } else + { + outFile1 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\data_not_inserted_node_table\\Parent_child_not_inderted_to_node_table_V2.txt", true ); + outFile1.append( listOfClTo.get( i ) + "\n" ); + outFile1.close(); + } + // count++; + + } + } else + { + outFile2 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\data_not_inserted_node_table\\Child_nodes_not_inderted_to_node_table_V2.txt", true ); + outFile2.append( line + "\n" ); + outFile2.close(); + } + } + } + } catch ( Exception e ) + { + e.printStackTrace(); + // return 0; + } + + + + } + + public static void insertParentChildModified() throws IOException, ParseException + { + + + FileWriter outFile; + FileWriter outFile1; + FileWriter outFile2; + + // int catID; + String leafcategory; + + + + int updateQuery = 0; + String temp = null; + +int count=0; + + + try + { + + + File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\leaf_categories_page_less_than_90_edited_4.txt" ); + + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( catPagesFile ) ); + //FileWriter outFile; + // FileWriter outFileCatNotFound; + + // HashMap pageMap = PageDB.getAllPages(); + + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + String splitLine[] = line.split( "\t" ); + leafcategory = splitLine[1].trim(); + // catID= ; + // pageID = PageDB.getPageId( leafcategory ); + int pageID=0; + LinkedList pageIdList= Search.SearchCategoryPages( leafcategory ); + if(!pageIdList.isEmpty() ){ + pageID =pageIdList.get(0); + } + + if ( pageID > 0 ) + { + NodeDB.insertNode( pageID, leafcategory ); + + /* + * 
search index and get the cl_to by pageID + */ + + ArrayList listOfClTo = Search.SearchCatPageLinks( pageID ); + + for ( int i = 0; i < listOfClTo.size(); i++ ) + { + int parentID = 0; + // int parentID = PageDB.getPageId( listOfClTo.get( i ) ); + + LinkedList parentIdList= Search.SearchCategoryPages( listOfClTo.get( i ) ); + if(!parentIdList.isEmpty() ){ + parentID =parentIdList.get(0); + } + if ( parentID > 0 ) + { + NodeDB.insertNode( parentID, listOfClTo.get( i ) ); + // int parentID= NodeDB.getCategoryId( rs.getString( "cl_to" ) ); + + EdgeDB.insertEdge( parentID, pageID ); + } else + { + outFile1 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\data_not_inserted_node_table\\Parent_child_not_inderted_to_node_table_V2.txt", true ); + outFile1.append( listOfClTo.get( i ) + "\n" ); + outFile1.close(); + } + // count++; + + } + } else + { + outFile2 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\data_not_inserted_node_table\\Child_nodes_not_inderted_to_node_table_V2.txt", true ); + outFile2.append( line + "\n" ); + outFile2.close(); + } + } + count++; + System.out.println(count); + } + } catch ( Exception e ) + { + e.printStackTrace(); + // return 0; + } + + + + } + + public static ArrayList getPagesLinkedByCatName( String catName ) + { + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + ArrayList listOfPages= new ArrayList(); + + String query = "select cl_from from categorylinks where cl_to=?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setString( 1, catName); + + rs = ps.executeQuery(); + + while (rs.next()) + { + listOfPages.add(rs.getInt( "cl_from" ) ); + } + connection.close(); + return listOfPages; + } + catch(SQLException e) + { + e.printStackTrace(); + return null; + } + + } +} diff --git 
a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryProcesor.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryProcesor.java new file mode 100644 index 0000000..9db297b --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/CategoryProcesor.java @@ -0,0 +1,130 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +package org.dbpedia.kasun.categoryprocessor; + + +import java.io.*; +import java.util.Scanner; +import org.apache.lucene.queryparser.classic.ParseException; + +/** + * + * Date Author Changes Jun 28, 2013 Kasun Perera Created + * + */ +public class CategoryProcesor +{ + + /** + * @param args the command line arguments + */ + public static void main( String[] args ) throws IOException, ParseException + { + + Edges edge= new Edges(); + edge.findProminetNodes(); + // CategoryLinksDB.insertParentChildModified(); + // PageDB.getAllPages(); + + /* + // inser category_only_pages + + //File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_dir\\pages_page_namespace_14_new_complete_line.txt" ); + + + + File catPagesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\leaf_categories_page_less_than_90.txt" ); + + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( catPagesFile ) ); + //FileWriter outFile; + // FileWriter outFileCatNotFound; + FileWriter outFile = new FileWriter("F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\leaf_categories\\page_id_page_title_leaf_categories_page_less_than_90.txt", true); + + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + String splitLine[]= line.split("\t"); + int pageId= PageDB.getPageId( splitLine[1].trim() ); + 
outFile.append( pageId +"\t"+splitLine[1].trim()+"\n" ); + // CategoryLinksDB.getCategoryByPageID( ); + + + } + } + + outFile.close(); + + */ + + + // CategoryDB.getCategoryByName(); + /* + File uniqueCatNamesFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\categories_not_found_in_category_table_ca_replaced_part_3.txt" ); + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( uniqueCatNamesFile ) ); + //FileWriter outFile; + // FileWriter outFileCatNotFound; + + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + // CategoryDB.getCategoryDirectedByArticlePage(line); + CategoryDB.getCategoryByName(line); + } + } + + */ + + /* + System.out.println("Threshold \t" +"Page Count"); + // TODO code application logic here + + +for(int i=1; i<100000; i++){ + FileWriter outFile = new FileWriter("F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\page_threshold_values.txt", true); + + int pageCount= CategoryDB.getCategoryPageCount( i ); + outFile.append(i+"\t" +pageCount+"\n"); + // System.out.println(i+"\t" +pageCount); + + outFile.close(); +} + */ + /* + + Scanner fileScanner = null; + Scanner childFileScanner= null; + Scanner parentFileScanner= null; + try + { + + //fileScanner = new Scanner( new File( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\preview.txt" ) ).useDelimiter("\\>*.\\<*"); + // fileScanner = new Scanner( new File( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\preview.txt" ) ); + fileScanner = new Scanner( new File( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\dbpedia_categories\\article_categories_en.nt" ) ); + + DataProcesor.inserDataToDB( fileScanner ); + parentFileScanner = new Scanner( new File( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\program_outputs\\parents.txt" )); + childFileScanner = new Scanner( new 
File( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\program_outputs\\all_children.txt") ); + + Node.sortChildren( parentFileScanner, childFileScanner ); + + } catch ( FileNotFoundException e ) + { + e.printStackTrace(); + } + + */ + //read category file and insert data to the database + + + //read leaf node file and update the database + + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DB_connection.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DB_connection.java new file mode 100644 index 0000000..3add8d0 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DB_connection.java @@ -0,0 +1,30 @@ +/* + * DO NOT MODIFY THIS FILE (it is already completed and should not be changed). + */ + +package org.dbpedia.kasun.categoryprocessor; +import java.sql.*; + +public class DB_connection { + public DB_connection() {}; + // "jdbc:mysql://localhost:3306/TweetComparison","root","nbuser" + //public Connection dbConnect(String db_connect_string, String db_userid, String db_password) { + public Connection dbConnect() { + + Connection conn = null; + try { + Class.forName("com.mysql.jdbc.Driver").newInstance(); + //conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/kasun","kasun","kasun_perrera_kk"); + conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/wiki_categories","root","nbuser"); + } catch (InstantiationException e) { + e.printStackTrace(); + } catch (IllegalAccessException e) { + e.printStackTrace(); + } catch (ClassNotFoundException e) { + e.printStackTrace(); + } catch (SQLException e) { + e.printStackTrace(); + } + return conn; + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DataProcesor.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DataProcesor.java new file mode 100644 index 0000000..c1c10eb --- /dev/null +++ 
b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/DataProcesor.java @@ -0,0 +1,77 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + + + +/** + * + * Date Author Changes + * Jun 29, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.categoryprocessor; + + +import java.io.FileWriter; +import java.io.IOException; +import java.util.Scanner; + + + +/** + * TODO- describe the purpose of the class + * + */ +public class DataProcesor { + + public static void inserDataToDB(Scanner fileScanner) throws IOException{ + + FileWriter outFile1 ; + FileWriter outFile2 ; + String line; + while ( fileScanner.hasNextLine() ) + { + // System.out.println(fileScanner.nextLine()); + //split the line by space, will get triples separated + line=fileScanner.nextLine(); + String[] typle=line.split("\\ "); + int parentId; + int childId; + + if(!typle[0].trim().equals("#")&&typle.length>2){ + // begin index=28 + String parent= typle[0].substring( 29, typle[0].length()-1 ); + // + String child= typle[2].substring( 38, typle[2].length()-1 ); + // System.out.println( "Line: " +line); + // System.out.println( "Parent: "+parent+" "+"child: "+ child ); + outFile1 = new FileWriter( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\program_outputs\\parents.txt", true ); + outFile2 = new FileWriter( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\program_outputs\\all_children.txt", true ); + + + //insert parent and child to the node- duplicate enties are handle by the SQL + // NodeDB.insertNode( parent ); + + outFile1.append(parent+"\n"); + // NodeDB.insertNode( child); + outFile2.append(child+"\n"); + //get child and parent Ids + parentId=NodeDB.getCategoryId( parent ); + childId= NodeDB.getCategoryId( child); + + // + EdgeDB.insertEdge( parentId, childId ); + + outFile1.close(); + outFile2.close(); + } + + + } + + } + +} diff --git 
a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/EdgeDB.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/EdgeDB.java new file mode 100644 index 0000000..c706c59 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/EdgeDB.java @@ -0,0 +1,203 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + + + +/** + * + * Date Author Changes + * Jun 29, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.categoryprocessor; + + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; + + + +/** + * TODO- describe the purpose of the class + * + */ +public class EdgeDB { + + public static void insertEdge(int parentId, int chidId){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "INSERT IGNORE INTO edges(parent_id,child_id) VALUES (?, ?)"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt(1, parentId); + ps.setInt(2, chidId); + updateQuery = ps.executeUpdate(); + +// while (rs.next()) +// { +// } + + } + catch(SQLException e) + { + e.printStackTrace(); + // return null; + } + + } + + public static ArrayList getChildren(int parenId){ + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + //TO-DO rewrite the query + String query = "SELECT child_id FROM edges WHERE parent_id=?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt( 1, parenId); + + rs = ps.executeQuery(); + + + ArrayList childrenList= new ArrayList(); + + + while (rs.next()) + { + childrenList.add(rs.getInt("child_id")); + } + return childrenList; + } + catch(SQLException 
e) + { + e.printStackTrace(); + return null; + } + + + } + public static ArrayList getParent(int leafNode){ + + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + //TO-DO rewrite the query + String query = "SELECT parent_id FROM edges WHERE child_id =?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt( 1, leafNode); + + rs = ps.executeQuery(); + + + ArrayList parents= new ArrayList(); + + + while (rs.next()) + { + parents.add(rs.getInt(1)); + } + return parents; + } + catch(SQLException e) + { + e.printStackTrace(); + return null; + } + + + + + + } + + public static ArrayList getChilren(int parentId){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "select parent_id,child_id from edges where parent_id=?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt(1, parentId); + + rs = ps.executeQuery(); + ArrayList chidId= new ArrayList(); + while (rs.next()) + { + chidId.add(rs.getInt("child_id") ); + } + return chidId; + } + catch(SQLException e) + { + e.printStackTrace(); + return null; + } + + } + + public static ArrayList getDisinctleafNodes(){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "SELECT distinct `child_id` FROM edges WHERE `child_id` NOT IN (SELECT `parent_id` FROM edges )"; + + + try + { + ps = connection.prepareStatement(query); + // ps.setInt(1, parentId); + + rs = ps.executeQuery(); + ArrayList leafId= new ArrayList(); + while (rs.next()) + { + leafId.add(rs.getInt("child_id") ); + } + return leafId; + } + catch(SQLException e) + { + e.printStackTrace(); + return null; + } + + } + +} diff --git 
a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Edges.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Edges.java new file mode 100644 index 0000000..8542932 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Edges.java @@ -0,0 +1,122 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * Date Author Changes Jun 29, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.categoryprocessor;
+
+
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.*;
+
+/**
+ * One parent/child edge (two node ids) plus the routine that derives the set
+ * of "prominent" nodes from the leaf nodes returned by {@code EdgeDB}.
+ */
+public class Edges
+{
+
+    // Leaf node ids loaded by findProminetNodes(); element type restored to
+    // Integer, which is what the iteration over this list requires.
+    ArrayList<Integer> leafNodes = new ArrayList<Integer>();
+
+    private int parentId;
+
+    private int childId;
+
+    /** @return the id of the child end of this edge */
+    public int getChildId()
+    {
+        return this.childId;
+    }
+
+    /** @return the id of the parent end of this edge */
+    public int getParentId()
+    {
+        return this.parentId;
+    }
+
+    /** @param parentId the id of the parent end of this edge */
+    public void setParentId( int parentId )
+    {
+        this.parentId = parentId;
+    }
+
+    /** @param childId the id of the child end of this edge */
+    public void setChildId( int childId )
+    {
+        this.childId = childId;
+    }
+
+    /**
+     * Computes the prominent nodes and hands them to
+     * {@code NodeDB.updateProminetNode}.
+     *
+     * For every leaf node, each of its parents is examined: a parent whose
+     * children are all leaf nodes is recorded as prominent. A leaf for which
+     * no parent qualified is recorded as prominent itself.
+     *
+     * @throws IOException declared for callers; nothing in this method raises it directly
+     */
+    public void findProminetNodes( ) throws IOException
+    {
+        // a set, so a node reached through several leaves is stored once
+        HashSet<Integer> prominetNodeList = new HashSet<Integer>();
+
+        //get all leaf nodes
+        leafNodes = EdgeDB.getDisinctleafNodes();
+
+        // hash-based view of the leaves: membership tests in isLeaf() are
+        // constant-time instead of a scan of the whole list per child
+        HashSet<Integer> leafNodeSet = new HashSet<Integer>( leafNodes );
+
+        for ( Integer leaf : leafNodes )
+        {
+            //to check whether leaf becomes prominent node
+            boolean isLeafProminent = true;
+
+            //get parents of the selected leaf node (there could be one or more parents)
+            ArrayList<Integer> parents = EdgeDB.getParent( leaf );
+
+            for ( Integer parent : parents )
+            {
+                //get the children of the parent node and check all children are leaf nodes
+                ArrayList<Integer> childnodes = EdgeDB.getChildren( parent );
+
+                if ( isLeaf( childnodes, leafNodeSet ) )
+                {
+                    prominetNodeList.add( parent );
+                    isLeafProminent = false;
+                }
+            }
+
+            if ( isLeafProminent )
+            {
+                prominetNodeList.add( leaf );
+            }
+        }
+
+        //insert this in to the database
+        NodeDB.updateProminetNode( prominetNodeList );
+    }
+
+    /**
+     * Tells whether every node in {@code childnodes} is a leaf.
+     *
+     * @param childnodes node ids to test
+     * @param leafNodeSet the known leaf node ids
+     * @return true when all ids are contained in {@code leafNodeSet} (also for an empty list)
+     */
+    private boolean isLeaf( ArrayList<Integer> childnodes, Set<Integer> leafNodeSet )
+    {
+        for ( Integer child : childnodes )
+        {
+            if ( !leafNodeSet.contains( child ) )
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+}
diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Node.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Node.java new file mode 100644 index 0000000..595b8a5 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Node.java @@ -0,0 +1,133 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor.
+ */ +/** + * + * Date Author Changes Jun 29, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.categoryprocessor; + + +import java.io.FileWriter; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Scanner; + +/** + * TODO- describe the purpose of the class + * + */ +public class Node +{ + + private int nodeId; + + private String categoryName; + + private boolean isProminent; + + private boolean isLeaf; + + private double scoreInterLangu; + + private double scoreEditHisto; + + public void setNodeId( int nodeId ) + { + this.nodeId = nodeId; + } + + public void setCategoryName( String catName ) + { + this.categoryName = catName; + } + + public void setIsProminent( boolean value ) + { + this.isProminent = value; + } + + public void setIsLeaf( boolean value ) + { + this.isLeaf = value; + } + + public void setScoreInterlangu(double score){ + this.scoreInterLangu=score; + } + + public void setScoreEditHisto(double score){ + this.scoreEditHisto=score; + } + + public int getNodeId() + { + return this.nodeId; + } + + public String getCategoryName() + { + return this.categoryName; + } + + public boolean getIsProminent() + { + return this.isProminent; + } + + public boolean getIsLeaf() + { + return this.isLeaf; + } + + public double getScoreInterlangu(){ + return this.scoreInterLangu; + } + + public double getScoreEditHisto(){ + return this.scoreEditHisto; + } + + public static void sortChildren(Scanner parentFileScanner,Scanner childFileScanner) throws IOException{ + String line; + + //TO-DO use a HashSet for this + HashMap parentMap= new HashMap(); + HashMap childMap= new HashMap(); + while ( parentFileScanner.hasNextLine() ) + { + // System.out.println(fileScanner.nextLine()); + //split the line by space, will get triples separated + line=parentFileScanner.nextLine(); + parentMap.put( line, line ); + } + + while ( childFileScanner.hasNextLine() ) + { + // System.out.println(fileScanner.nextLine()); + //split the 
line by space, will get triples separated + line=childFileScanner.nextLine(); + childMap.put( line, line ); + } + + for(Map.Entry entry : parentMap.entrySet()){ + + if(childMap.containsKey( (String)entry.getKey() ) ){ + childMap.remove((String)entry.getKey()); + } + + } + + FileWriter outFile3 = new FileWriter( "F:\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\program_outputs\\leaf_nodes.txt", true ); + + for(Map.Entry entry : childMap.entrySet()){ + //TO_DO write this data to the database + outFile3.append((String)entry.getKey()+"\n"); + } + outFile3.close(); + + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/NodeDB.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/NodeDB.java new file mode 100644 index 0000000..ca72e04 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/NodeDB.java @@ -0,0 +1,268 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + + + +/** + * + * Date Author Changes + * Jun 29, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.categoryprocessor; + + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.HashSet; + + + +/** + * TODO- describe the purpose of the class + * + */ +public class NodeDB { + + public static void insertNode( int nodeID, String categoryName){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "INSERT IGNORE INTO node(node_id,category_name,is_leaf,is_prominent) VALUES (?,?,?,?)"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt( 1, nodeID); + ps.setString( 2, categoryName); + ps.setBoolean( 3, false); + ps.setBoolean( 4, false); + updateQuery = ps.executeUpdate(); + +// while (rs.next()) +// { +// } + + } + catch(SQLException e) + { + e.printStackTrace(); + // return null; + } + + } + + public static int getCategoryId(String cateName){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "select node_id,category_name from node where category_name=?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setString( 1, cateName); + + rs = ps.executeQuery(); + int nodeId=0; + while (rs.next()) + { + nodeId=rs.getInt("node_id"); + } + return nodeId; + } + catch(SQLException e) + { + e.printStackTrace(); + return 0; + } + + } + + + public static String getCategoryName(int categoryId){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "select category_name from node where node_id=?"; + + + try + { + ps = 
connection.prepareStatement(query); + ps.setInt( 1, categoryId); + + rs = ps.executeQuery(); + String nodeName = null; + while (rs.next()) + { + nodeName=rs.getString( "category_name"); + } + return nodeName; + } + catch(SQLException e) + { + e.printStackTrace(); + return null; + } + + } + + public static void updateNode(ArrayList categoryName){ + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "UPDATE node SET is_leaf=? WHERE category_name=?"; + + + try + { + for(int i=0; i prominentNodes){ + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "UPDATE node SET is_prominent=? WHERE node_id=?"; + + + try + { + for (Integer i : prominentNodes) { + ps = connection.prepareStatement(query); + ps.setBoolean( 1, true); + ps.setInt( 2, i ); + updateQuery = ps.executeUpdate(); + } +// while (rs.next()) +// { +// } + + } + catch(SQLException e) + { + e.printStackTrace(); + // return null; + } + + } + + public static ArrayList getCategoriesByHead( String head ) + { + ArrayList categoryList =new ArrayList(); + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "select node_id, category_name from node where head_of_name=?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setString( 1, head); + + rs = ps.executeQuery(); + + while (rs.next()) + { + categoryList.add( rs.getString( "category_name") ); + //nodeName=rs.getString( "category_name"); + } + connection.close(); + return categoryList; + } + catch(SQLException e) + { + e.printStackTrace(); + return null; + } + + + } + + static void updateProminetNode( Integer s ) + { + throw new UnsupportedOperationException( "Not yet 
implemented" ); + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Page.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Page.java new file mode 100644 index 0000000..e7e9cd0 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/Page.java @@ -0,0 +1,51 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * + * Date Author Changes Sep 17, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.categoryprocessor; + +/** + * TODO- describe the purpose of the class + * + */ +public class Page +{ + + private int pageId; + + private String pageName; + + private int pageNameSpace; + + public void setPageID( int pageID ) + { + this.pageId = pageID; + } + + public void setPageNameSapce( int pageNameSpace ) + { + this.pageNameSpace = pageNameSpace; + } + + public void setPageName( String pageName ) + { + this.pageName = pageName; + } + + public int getPageID(){ + return this.pageId; + } + + public int getPageNamespace(){ + return this.pageNameSpace; + } + + public String getPageName(){ + return this.pageName; + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/PageDB.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/PageDB.java new file mode 100644 index 0000000..e561ccc --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/PageDB.java @@ -0,0 +1,296 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + + + +/** + * Date Author Changes + * Aug 3, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.categoryprocessor; + + +import java.io.*; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.HashMap; +import java.util.Map; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.IntField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.NIOFSDirectory; +import org.apache.lucene.util.Version; + + + +/** + * TODO- describe the purpose of the class + * + */ +public class PageDB { + + public static boolean isArticlePage(int pageId){ + boolean state= false; + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + + + String lineArr[]; + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + + // System.out.println(line); + // System.out.println(temp); + String query = "SELECT page_namespace FROM `page` WHERE `page_id` = " + pageId ; + + + try + { + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + //ps.setString( 1, catTitle ); + rs = ps.executeQuery(); + int count = 0; + + while( rs.next() ) + { + if(rs.getInt( "page_namespace" )== 0 ){ + state=true; + break; + } + } + + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + + + return state; + }
+
+    /**
+     * Looks up the {@code page_id} of the row in {@code category_only_page}
+     * whose {@code page_title} equals the given title.
+     *
+     * The title is bound as a statement parameter rather than concatenated
+     * into the SQL text, so titles containing quotes are matched literally and
+     * cannot alter the query.
+     * NOTE(review): if any caller escapes quotes in the title itself before
+     * calling, that escaping must be removed - confirm against callers.
+     *
+     * @param catPageTitle the page title to look up
+     * @return the {@code page_id} of the last matching row, or 0 when no row
+     *         matches or the query fails (the failure is printed, not thrown)
+     */
+    public static int getPageId(String catPageTitle){
+        int resultId = 0;
+
+        DB_connection con = new DB_connection();
+        Connection connection = con.dbConnect();
+        PreparedStatement ps = null;
+        ResultSet rs = null;
+
+        String query = "SELECT page_id FROM `category_only_page` WHERE `page_title` = ?";
+
+        try
+        {
+            ps = connection.prepareStatement( query );
+            ps.setString( 1, catPageTitle );
+            rs = ps.executeQuery();
+
+            while( rs.next() )
+            {
+                resultId = rs.getInt( "page_id" );
+            }
+        } catch ( SQLException e )
+        {
+            e.printStackTrace();
+        } finally
+        {
+            // close on every path; previously a failed query left the connection open
+            if ( connection != null )
+            {
+                try
+                {
+                    connection.close();
+                } catch ( SQLException e )
+                {
+                    e.printStackTrace();
+                }
+            }
+        }
+
+        return resultId;
+    }
+ public static HashMap getAllPages() throws IOException{ + int resultId = 0; + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + HashMap pagesMap= new HashMap(); + + String lineArr[]; + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + + // System.out.println(line); + // System.out.println(temp); + + String query = "SELECT page_id, page_title FROM `category_only_page`" ; + + + try + { + + + String pathToIndex = "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\index\\categoty_page_candidate_index"; + int noOfDocs = 0; + + IndexWriter iW; + + NIOFSDirectory dir = new NIOFSDirectory( new File( pathToIndex ) ); + //dir = new RAMDirectory() ; + iW = new IndexWriter( dir, new IndexWriterConfig( Version.LUCENE_43, new WhitespaceAnalyzer( Version.LUCENE_43 ) ) ); + + ps = connection.prepareStatement( query ); + // ps.setString( 1, temp ); + //ps.setString( 1, catTitle ); + rs = ps.executeQuery(); + int count = 0; + + while( rs.next() ) + { + + + + Document doc = new Document(); + + + + + doc.add( new TextField( "page_title", rs.getString( "page_title" ), Field.Store.YES ) ); + doc.add( new IntField( "page_id", rs.getInt("page_id"), Field.Store.YES ) ); + + iW.addDocument( doc ); + + + + + // pagesMap.put( rs.getString( "page_title" ), rs.getInt("page_id") ); + // System.out.println(pagesMap.size()); + // resultId= rs.getInt("page_id"); + } + iW.close(); +
dir.close(); + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + + + return pagesMap; + } + + + public static void insertCategoryPage( String data){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + + String query = "INSERT IGNORE INTO category_only_page(page_id,page_namespace,page_title,page_restrictions, page_counter,page_is_redirect, page_is_new, page_random, page_touched,page_latest,page_len) VALUES ("+data+")"; + + + + + try + { + ps = connection.prepareStatement(query); + + updateQuery = ps.executeUpdate(); + +// while (rs.next()) +// { +// } + + } + catch(SQLException e) + { + + System.out.println(data); + // e.printStackTrace(); + // return null; + } + + } + + public static Page getPagebyID( int pageId ) + { + + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + + PreparedStatement ps = null; + ResultSet rs = null; + int updateQuery = 0; + Page page= new Page(); + + // System.out.println(line); + // System.out.println(temp); + + String query = "SELECT page_id,page_namespace,page_title FROM page WHERE page_id = ?" 
; + + + try + { + ps = connection.prepareStatement( query ); + ps.setInt( 1, pageId ); + //ps.setString( 1, catTitle ); + rs = ps.executeQuery(); + + while( rs.next() ) + { + + page.setPageID(rs.getInt("page_id")); + page.setPageNameSapce( rs.getInt("page_namespace")); + page.setPageName( rs.getString("page_title") ); + + } + + + + + connection.close(); + } catch ( SQLException e ) + { + e.printStackTrace(); + // return 0; + } + + + return page; + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/languageLinksDB.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/languageLinksDB.java new file mode 100644 index 0000000..d339297 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/categoryprocessor/languageLinksDB.java @@ -0,0 +1,64 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + + + +/** + * + * Date Author Changes + * Aug 31, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.categoryprocessor; + + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; + + + +/** + * Communications with the languagelinks table + * + */ +public class languageLinksDB { + + public static int getLanguageLinksCount(int pageId){ + DB_connection con = new DB_connection(); + Connection connection = con.dbConnect(); + PreparedStatement ps = null; + ResultSet rs = null; + // int updateQuery = 0; + + String query = "select count(*) from langlinks where ll_from=?"; + + + try + { + ps = connection.prepareStatement(query); + ps.setInt( 1, pageId); + + rs = ps.executeQuery(); + int nodeId=0; + while (rs.next()) + { + nodeId=rs.getInt(1); + } + return nodeId; + } + catch(SQLException e) + { + e.printStackTrace(); + return 0; + } + + } + + + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/extractdata/DataExtractor.java 
b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/extractdata/DataExtractor.java new file mode 100644 index 0000000..10c7e63 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/extractdata/DataExtractor.java @@ -0,0 +1,72 @@ + + + +/** + * + * + * Date Author Changes + * Jul 16, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.extractdata; + + +import java.io.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + + +/** + * Methods of this class extract data from the Wikipedia SQL dumps + * + */ +public class DataExtractor {
+
+
+    /**
+     * Splits the rows of an SQL dump file into one tuple per output line.
+     *
+     * The first {@code skipLines} lines of the dump are ignored. Every later
+     * line is split on the {@code "),("} separator between tuples; the text up
+     * to and including the first {@code "("} is dropped from the first tuple,
+     * and each tuple is appended to the output file followed by a newline.
+     * Lines that contain no {@code "("} are skipped; indexing the missing
+     * second half of such a line used to abort the run with an
+     * {@code ArrayIndexOutOfBoundsException}.
+     *
+     * @param args command-line arguments (not used; both paths are hard-coded)
+     * @throws FileNotFoundException if the dump file cannot be opened
+     * @throws IOException if reading the dump or writing the output fails
+     */
+    public static void main(String[] args ) throws FileNotFoundException, IOException{
+        /*
+         * enwiki-20130604-page.sql- data line start at line #49
+         * enwiki-20130604-categorylinks.sql data line start at line #43
+         * enwiki-20130604-category.sql data line start at line #42
+         * enwiki-20130604-langlinks.sql data line start at line #39
+         * change "skipLines" according to the data line for each SQL dump file
+         */
+        final int skipLines = 39;
+
+        File categoryLinksDumpFile = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\Wiki_Category_SQL_tables\\enwiki-20130604-langlinks.sql" );
+        File outCategoryLinksDumpFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-langlinks_typles.txt");
+
+        BufferedReader fileReader = new BufferedReader( new FileReader( categoryLinksDumpFile ) );
+        // opened on the first data line (as before) but only once, not once per line
+        FileWriter outFile2 = null;
+        try
+        {
+            String line;
+            int count = 0;
+            while ( ( line = fileReader.readLine() ) != null )
+            {
+                if ( count >= skipLines )
+                {
+                    if ( outFile2 == null )
+                    {
+                        outFile2 = new FileWriter( outCategoryLinksDumpFile, true );
+                    }
+
+                    String[] strArr = line.split( "\\)\\,\\(" );
+                    // text before the first "(" is the INSERT prefix, not tuple data
+                    String[] strArr2 = strArr[0].split( "\\(", 2 );
+                    if ( strArr2.length == 2 )
+                    {
+                        outFile2.append( strArr2[1] + "\n" );
+                        for ( int i = 1; i < strArr.length; i++ )
+                        {
+                            outFile2.append( strArr[i] + "\n" );
+                        }
+                    }
+                }
+                count++;
+            }
+        }
+        finally
+        {
+            // release both files even when reading or writing fails
+            try
+            {
+                if ( outFile2 != null )
+                {
+                    outFile2.close();
+                }
+            }
+            finally
+            {
+                fileReader.close();
+            }
+        }
+    }
+}
diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/extractdata/DoSearch.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/extractdata/DoSearch.java new file mode 100644 index 0000000..d8863a3 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/extractdata/DoSearch.java @@ -0,0 +1,98 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * + * Date Author Changes Jul 18, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.extractdata; + + +import java.io.*; +import org.dbpedia.kasun.indexer.Index; +import org.dbpedia.kasun.searcher.Search; + +/** + * TODO- describe the purpose of the class + * + */ +public class DoSearch +{ + + public static void main( String[] args ) throws IOException, Exception + { + //page + // String pathToIndex = "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index"; + /* + * categorylinks + * + */ + // String pathToIndex = "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\index\\category"; +// File categoryTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-category_typles.txt" ); + + /* + * languagelinks + */ + + /* + String pathToIndex = "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\index\\language_links"; + File tuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-langlinks_typles.txt" ); + Index.indexInterLanguageLinks( pathToIndex, tuplesFile ); + */ + /* + * category_page_links_view + */ + String pathToIndex = "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\index\\category_page_links_view"; + File tuplesFile =
new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_dir\\category_page_links_view\\page_id_cl_to.txt" ); + Index.indexCategoryPageLinksView( pathToIndex, tuplesFile ); + + + + + + //Index.indexCategory( pathToIndex, categoryTuplesFile ); + +/* + + String pathToIndex1 ="C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index1"; + String pathToIndex2= "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index2"; + String pathToIndex3 = "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index3"; + String pathToIndex4 ="C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index4"; + String pathToIndex5= "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index5"; + String pathToIndex6 = "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index6"; + + //page tuples + File pageTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-page_typles.txt" );File pageTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-page_typles.txt" ); + + + + // File ctLinksTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-categorylinks_typles.txt" ); + + + // Index.indexPage(pathToIndex1,pathToIndex2,pathToIndex3,pathToIndex4,pathToIndex5,pathToIndex6,pageTuplesFile); +//Index.indexPage2(pageTuplesFile); + + // Index.indexCategoryLinks( pathToIndex, ctLinksTuplesFile ); + + + // FileWriter outFile = new FileWriter("C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\pages_page_namespace_0.txt",true); + + /* + * String page_q="0"; String page_field="page_namespace"; Search.searchPage( new File(pathToIndex),page_q, + * page_field,25000000); + */ + + + + // String cateLinksField =
"cl_from"; + + + // Search.searchCategoryLinks( new File( pathToIndex ), cateLinksField, 200 ); + + String cateLinksField = "cat_title"; + //Search.searchCategory( new File( pathToIndex ), cateLinksField, 2 ); + + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/Mqlread.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/Mqlread.java new file mode 100644 index 0000000..1c3ffe6 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/Mqlread.java @@ -0,0 +1,74 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ + + + +/** + * Date Author Changes + * Sep 16, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.freebasequery; + + + +/** + * TODO- describe the purpose of the class + * + */ +import com.google.api.client.http.GenericUrl; +import com.google.api.client.http.HttpRequest; +import com.google.api.client.http.HttpRequestFactory; +import com.google.api.client.http.HttpResponse; +import com.google.api.client.http.HttpTransport; +import com.google.api.client.http.javanet.NetHttpTransport; +import com.jayway.jsonpath.JsonPath; +import java.io.FileInputStream; +import java.util.Properties; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; + +public class Mqlread { + public static Properties properties = new Properties();
+
+    /**
+     * Pages through the query results by passing the cursor returned from
+     * each {@code curcerQuery} call into the next call, starting with an
+     * empty cursor.
+     *
+     * The loop ends when no cursor comes back ({@code null}, which is what
+     * {@code curcerQuery} returns when the request or the parsing fails) or
+     * when the cursor equals the "FALSE" sentinel. The sentinel is compared
+     * with {@code equals}: the former {@code !=} compared object references,
+     * which is true for any string built at run time, so the loop never ended.
+     *
+     * @param args command-line arguments (not used)
+     */
+    public static void main(String[] args) {
+        String curcer = curcerQuery("");
+        while (curcer != null && !"FALSE".equals(curcer)) {
+            curcer = curcerQuery(curcer);
+        }
+    }
+
+ private static String curcerQuery(String curcer){ + + String newCurcer = null; + try { + // properties.load(new FileInputStream("freebase.properties")); + HttpTransport httpTransport = new NetHttpTransport(); + HttpRequestFactory requestFactory = httpTransport.createRequestFactory(); + JSONParser parser = new JSONParser(); + String query =
"[{\"id\":null,\"name\":null,\"type\":\"/people/person\",\"limit\":100}]"; + GenericUrl url = new GenericUrl("https://www.googleapis.com/freebase/v1/mqlread"); + url.put("query", query); + // url.put("key", properties.get("API_KEY")); + url.put("key","AIzaSyDcHfGTZlVm0KE4KKK9JAM61KBDaXtPiJc"); + url.put("cursor", curcer); + HttpRequest request = requestFactory.buildGetRequest(url); + HttpResponse httpResponse = request.execute(); + JSONObject response = (JSONObject)parser.parse(httpResponse.parseAsString()); + JSONArray results = (JSONArray)response.get("result"); + newCurcer=(String)response.get("cursor"); + + for (Object result : results) { + System.out.println(JsonPath.read(result,"$.name").toString()); + // System.out.println( newCurcer); + } + } catch (Exception ex) { + ex.printStackTrace(); + } + + return newCurcer; + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/QueryFB.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/QueryFB.java new file mode 100644 index 0000000..54b142d --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/freebasequery/QueryFB.java @@ -0,0 +1,71 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ + + + +/** + * + * Date Author Changes + * Sep 16, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.freebasequery; + +import com.google.api.client.http.GenericUrl; +import com.google.api.client.http.HttpRequest; +import com.google.api.client.http.HttpRequestFactory; +import com.google.api.client.http.HttpResponse; +import com.google.api.client.http.HttpTransport; +import com.google.api.client.http.javanet.NetHttpTransport; +import com.jayway.jsonpath.JsonPath; +import java.io.FileInputStream; +import java.util.Properties; +import org.json.simple.JSONArray; +import org.json.simple.JSONObject; +import org.json.simple.parser.JSONParser; + +/** + * TODO- describe the purpose of the class + * + */ +public class QueryFB { + + public static Properties properties = new Properties(); + public static void main(String[] args) { + int count = 0; + try { + // properties.load(new FileInputStream("freebase.properties")); + HttpTransport httpTransport = new NetHttpTransport(); + HttpRequestFactory requestFactory = httpTransport.createRequestFactory(); + JSONParser parser = new JSONParser(); + GenericUrl url = new GenericUrl("https://www.googleapis.com/freebase/v1/search"); + // url.put("query", "Cee Lo Green"); + //url.put("filter", "(all type:/music/artist created:\"The Lady Killer\")"); + // url.put("filter", "(all type:/people/person)"); + // url.put("filter", "(all type:/location/location)"); + url.put("filter", "(all type:/organization/organization)"); + + + url.put("cursor", "0"); + url.put("limit", "160"); + url.put("indent", "true"); + // url.put("key", properties.get("API_KEY")); + url.put("key","AIzaSyDcHfGTZlVm0KE4KKK9JAM61KBDaXtPiJc"); + HttpRequest request = requestFactory.buildGetRequest(url); + HttpResponse httpResponse = request.execute(); + JSONObject response = (JSONObject)parser.parse(httpResponse.parseAsString()); + JSONArray results = (JSONArray)response.get("result"); + for (Object result : results) { + count++; + 
System.out.println(JsonPath.read(result,"$.name").toString()); + } + System.out.println("total: "+ count); + } catch (Exception ex) { + ex.printStackTrace(); + } + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/indexer/Index.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/indexer/Index.java new file mode 100644 index 0000000..96d0f2f --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/indexer/Index.java @@ -0,0 +1,415 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * + * Date Author Changes Jul 17, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.indexer; + + +import java.io.*; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +//import org.apache.lucene.analysis.; +import org.apache.lucene.document.*; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.NIOFSDirectory; +import org.apache.lucene.util.Version; + +/** + * TODO- describe the purpose of the class + * + */ +public class Index +{
+
+    /**
+     * Reads the comma-separated tuples file line by line and appends the
+     * first three fields of every row whose second field, trimmed, equals
+     * "0" to a fixed output file as one tab-separated line. Rows with fewer
+     * than three fields are echoed to standard output instead.
+     *
+     * The field is compared with {@code equals}; the former {@code ==}
+     * compared object references and never matched a value read from the
+     * file, so no row was ever written.
+     * NOTE(review): rows are selected on "0" but the output file is named
+     * "..._namespace_14_new.txt" - confirm which value is intended.
+     *
+     * I/O failures are printed and not rethrown.
+     *
+     * @param pathToIndex index directory; currently unused, no index is written
+     * @param pageTuplesFile file of comma-separated tuples to read
+     * @throws IOException only if closing one of the files fails
+     */
+    public static void indexPage( String pathToIndex, File pageTuplesFile ) throws IOException
+    {
+        BufferedReader fileReader = null;
+        // opened on the first matching row (as before) but only once, not once per row
+        FileWriter outFile = null;
+
+        try
+        {
+            fileReader = new BufferedReader( new FileReader( pageTuplesFile ) );
+            String line;
+            while ( ( line = fileReader.readLine() ) != null )
+            {
+                String[] strArr = line.split( "\\," );
+                if ( strArr.length >= 3 )
+                {
+                    if ( "0".equals( strArr[1].trim() ) )
+                    {
+                        if ( outFile == null )
+                        {
+                            outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\pages_page_namespace_14_new.txt", true );
+                        }
+                        outFile.append( strArr[0] + "\t" + strArr[1] + "\t" + strArr[2] + "\n" );
+                    }
+                } else
+                {
+                    System.out.println( line + "\n" );
+                }
+            }
+        } catch ( IOException e )
+        {
+            e.printStackTrace();
+        } finally
+        {
+            // release both files on every path
+            try
+            {
+                if ( outFile != null )
+                {
+                    outFile.close();
+                }
+            } finally
+            {
+                if ( fileReader != null )
+                {
+                    fileReader.close();
+                }
+            }
+        }
+    }
+
+ public static void readPageTable( File pageTuplesFile ) throws IOException + { + int noOfDocs = 0; + + IndexWriter iW; + + try + { + + + + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( pageTuplesFile ) ); + int count = 0; + String line; + FileWriter outFile; + while ( ( line = fileReader.readLine() ) != null ) + { + + String[] strArr = line.split( "\\," ); + if ( strArr.length >= 3 ) + { + + + if ( strArr[1].trim().equals( "0" ) ) + { + outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\pages_page_namespace_0_new.txt", true ); + + + outFile.append( strArr[0] + "\t" + strArr[1] + "\t" + strArr[2] + "\n" ); + //System.out.println((i + 1) + ".
" + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + outFile.close(); + } + + + } else + { + System.out.println( line + "\n" ); + } + + count++; + } + + + + } catch ( CorruptIndexException e ) + { + e.printStackTrace(); + } catch ( IOException e ) + { + e.printStackTrace(); + } + } + + public static void indexCategoryLinks( String pathToIndex, File tuplesFile ) throws IOException + { + //String pathToIndex = "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\index\\page_index"; + int noOfDocs = 0; + + IndexWriter iW; + try + { + NIOFSDirectory dir = new NIOFSDirectory( new File( pathToIndex ) ); + //dir = new RAMDirectory() ; + iW = new IndexWriter( dir, new IndexWriterConfig( Version.LUCENE_43, new WhitespaceAnalyzer( Version.LUCENE_43 ) ) ); + + + // File tuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-langlinks_typles.txt" ); + + + + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( tuplesFile ) ); + int count = 0; + String line; + + while ( ( line = fileReader.readLine() ) != null ) + { + + String[] strArr = line.split( "\\," ); + //`cl_from` ,`cl_to`,`cl_sortkey`,`cl_timestamp`,`cl_sortkey_prefix`,`cl_collation`,`cl_type` enum('page','subcat','file') NOT NULL DEFAULT 'page', + + if ( strArr.length >= 7 ) + { + + Document doc = new Document(); + + + + + doc.add( new TextField( "cl_from", strArr[0], Field.Store.YES ) ); + doc.add( new TextField( "cl_to", strArr[1], Field.Store.YES ) ); + doc.add( new TextField( " cl_sortkey", strArr[2], Field.Store.YES ) ); + + doc.add( new TextField( "cl_type", strArr[6], Field.Store.YES ) ); + iW.addDocument( doc ); + } else + { + System.out.println( line + "\n" ); + } + } + + + iW.close(); + dir.close(); + } catch ( CorruptIndexException e ) + { + e.printStackTrace(); + } catch ( IOException e ) + { + e.printStackTrace(); + } + } + + public static void indexCategory( String 
pathToIndex, File tuplesFile ) throws IOException + { + //String pathToIndex = "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\page_index"; + int noOfDocs = 0; + + IndexWriter iW; + try + { + NIOFSDirectory dir = new NIOFSDirectory( new File( pathToIndex ) ); + //dir = new RAMDirectory() ; + iW = new IndexWriter( dir, new IndexWriterConfig( Version.LUCENE_43, new WhitespaceAnalyzer( Version.LUCENE_43 ) ) ); + + + //File pageTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-category_typles.txt" ); + + + + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( tuplesFile ) ); + int count = 0; + String line; + + while ( ( line = fileReader.readLine() ) != null ) + { + + String[] strArr = line.split( "\\," ); + //Data in following order`cat_id`,`cat_title`,`cat_pages`,`cat_subcats` + //we need 0,1,2,3 elements of the string + if ( strArr.length >= 2) + { + + // System.out.println(strArr[0]+"####"+strArr[1]+"####"+strArr[2]+"#####"+strArr[3]+"###"+strArr[4]); + Document doc = new Document(); + + + + + doc.add( new TextField( "cat_id", strArr[0], Field.Store.YES ) ); + doc.add( new TextField( "cat_title", strArr[1], Field.Store.YES ) ); + // doc.add( new IntField( "cat_pages", Integer.parseInt( strArr[2].trim() ), Field.Store.YES ) ); + // doc.add( new IntField( "cat_subcats", Integer.parseInt( strArr[3].trim() ), Field.Store.YES ) ); + // doc.add( new IntField( "cat_files", Integer.parseInt( strArr[4].trim() ), Field.Store.YES ) ); + // doc.add( new TextField( "cat_hidden", strArr[5].substring( 0,1), Field.Store.YES ) ); + + + + iW.addDocument( doc ); + } else + { + System.out.println( line + "\n" ); + } + } + + + iW.close(); + dir.close(); + } catch ( CorruptIndexException e ) + { + e.printStackTrace(); + } catch ( IOException e ) + { + e.printStackTrace(); + } + } + + + public static void indexCategoryPageLinksView( String pathToIndex, File tuplesFile ) 
throws IOException + { + //String pathToIndex = "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\language_links"; + int noOfDocs = 0; + + IndexWriter iW; + try + { + NIOFSDirectory dir = new NIOFSDirectory( new File( pathToIndex ) ); + //dir = new RAMDirectory() ; + iW = new IndexWriter( dir, new IndexWriterConfig( Version.LUCENE_43, new WhitespaceAnalyzer( Version.LUCENE_43 ) ) ); + + + //File pageTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-category_typles.txt" ); + + + + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( tuplesFile ) ); + int count = 0; + String line; + + while ( ( line = fileReader.readLine() ) != null ) + { + + String[] strArr = line.split( "\\t",2 ); + //Data in following order`cat_id`,`cat_title`,`cat_pages`,`cat_subcats` + //we need 0,1,2,3 elements of the string + if ( strArr.length >= 2) + { + + // System.out.println(strArr[0]+"####"+strArr[1]+"####"+strArr[2]+"#####"+strArr[3]+"###"+strArr[4]); + Document doc = new Document(); + + + + + doc.add( new TextField( "page_id", strArr[0].trim(), Field.Store.YES ) ); + doc.add( new TextField( "page_title", strArr[1], Field.Store.YES ) ); + + // doc.add( new IntField( "cat_subcats", Integer.parseInt( strArr[3].trim() ), Field.Store.YES ) ); + // doc.add( new IntField( "cat_files", Integer.parseInt( strArr[4].trim() ), Field.Store.YES ) ); + // doc.add( new TextField( "cat_hidden", strArr[5].substring( 0,1), Field.Store.YES ) ); + + + + iW.addDocument( doc ); + } else + { + System.out.println( line + "\n" ); + } + } + + + iW.close(); + dir.close(); + } catch ( CorruptIndexException e ) + { + e.printStackTrace(); + } catch ( IOException e ) + { + e.printStackTrace(); + } + } + + + + public static void indexInterLanguageLinks( String pathToIndex, File tuplesFile ) throws IOException + { + //String pathToIndex = 
"C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\index_dir\\language_links"; + int noOfDocs = 0; + + IndexWriter iW; + try + { + NIOFSDirectory dir = new NIOFSDirectory( new File( pathToIndex ) ); + //dir = new RAMDirectory() ; + iW = new IndexWriter( dir, new IndexWriterConfig( Version.LUCENE_43, new WhitespaceAnalyzer( Version.LUCENE_43 ) ) ); + + + //File pageTuplesFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\typles_out\\enwiki-20130604-category_typles.txt" ); + + + + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( tuplesFile ) ); + int count = 0; + String line; + + while ( ( line = fileReader.readLine() ) != null ) + { + + String[] strArr = line.split( "\\,",3 ); + //Data in following order`cat_id`,`cat_title`,`cat_pages`,`cat_subcats` + //we need 0,1,2,3 elements of the string + if ( strArr.length >= 3) + { + + // System.out.println(strArr[0]+"####"+strArr[1]+"####"+strArr[2]+"#####"+strArr[3]+"###"+strArr[4]); + Document doc = new Document(); + + + + + doc.add( new TextField( "ll_from", strArr[0].trim(), Field.Store.YES ) ); + doc.add( new TextField( "ll_lang", strArr[1], Field.Store.YES ) ); + doc.add( new TextField( "ll_title", strArr[2] , Field.Store.YES ) ); + // doc.add( new IntField( "cat_subcats", Integer.parseInt( strArr[3].trim() ), Field.Store.YES ) ); + // doc.add( new IntField( "cat_files", Integer.parseInt( strArr[4].trim() ), Field.Store.YES ) ); + // doc.add( new TextField( "cat_hidden", strArr[5].substring( 0,1), Field.Store.YES ) ); + + + + iW.addDocument( doc ); + } else + { + System.out.println( line + "\n" ); + } + } + + + iW.close(); + dir.close(); + } catch ( CorruptIndexException e ) + { + e.printStackTrace(); + } catch ( IOException e ) + { + e.printStackTrace(); + } + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/rdf/RdfGenarator.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/rdf/RdfGenarator.java new 
file mode 100644 index 0000000..cc56658 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/rdf/RdfGenarator.java @@ -0,0 +1,143 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * + * Date Author Changes Sep 17, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.rdf; + + +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.dbpedia.kasun.categoryprocessor.CategoryLinksDB; +import org.dbpedia.kasun.categoryprocessor.NodeDB; +import org.dbpedia.kasun.categoryprocessor.Page; +import org.dbpedia.kasun.categoryprocessor.PageDB; + +/** + * TODO- describe the purpose of the class + * + */ +public class RdfGenarator +{ + + private static String promintNodeName; + + public static void getCategoriesForHead( String head ) + { + + ArrayList categoriesForHead = NodeDB.getCategoriesByHead( head ); + +for(int j=0; j clFromPageID = CategoryLinksDB.getPagesLinkedByCatName( catName ); + FileWriter outfile; + + for ( int i = 0; i < clFromPageID.size(); i++ ) + { + + try + { + Page page = PageDB.getPagebyID( clFromPageID.get( i ) ); + if ( page.getPageNamespace() == 0 ) + { + //namespace==0 means it's a article page + outfile = new FileWriter( "/home/kasun/rdfresult/rdfoutput.txt", true ); + outfile.append( "<" + page.getPageName() + "> rdf:type <" + promintNodeName + "> \n" ); + outfile.close(); + } else + { + if ( page.getPageNamespace() == 14 ) + { + + //namespace==14 means it's a categorypage recurcive the categorypage + //recursion causes segmentation error go for only fist child + // getPagesForCategory( page.getPageName() ); + getPagesForCategoryFirstChild( page.getPageName() ); + } + } + } catch ( IOException ex ) + { + FileWriter errorfile; + try + { + errorfile = new FileWriter( "/home/kasun/rdfresult/error.txt", true ); + 
errorfile.append( ex.getMessage()+"\n" ); + errorfile.close(); + } catch ( IOException ex1 ) + { + Logger.getLogger( RdfGenarator.class.getName() ).log( Level.SEVERE, null, ex1 ); + } + + } + + } + + clFromPageID.clear(); + } + + public static void getPagesForCategoryFirstChild( String catName ) + { + ArrayList clFromPageID = CategoryLinksDB.getPagesLinkedByCatName( catName ); + FileWriter outfile; + + for ( int i = 0; i < clFromPageID.size(); i++ ) + { + + try + { + Page page = PageDB.getPagebyID( clFromPageID.get( i ) ); + if ( page.getPageNamespace() == 0 ) + { + //namespace==0 means it's a article page + outfile = new FileWriter( "/home/kasun/rdfresult/rdfoutput.txt", true ); + outfile.append( "<" + page.getPageName() + "> rdf:type <" + promintNodeName + "> \n" ); + outfile.close(); + } + /* + else + { + if ( page.getPageNamespace() == 14 ) + { + + //namespace==14 means it's a categorypage recurcive the categorypage + getPagesForCategory( page.getPageName() ); + } + } + * + */ + } catch ( IOException ex ) + { + FileWriter errorfile; + try + { + errorfile = new FileWriter( "/home/kasun/rdfresult/error.txt", true ); + errorfile.append( ex.getMessage()+"\n" ); + errorfile.close(); + } catch ( IOException ex1 ) + { + Logger.getLogger( RdfGenarator.class.getName() ).log( Level.SEVERE, null, ex1 ); + } + + } + + } + + clFromPageID.clear(); + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/searcher/Search.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/searcher/Search.java new file mode 100644 index 0000000..044b9d8 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/searcher/Search.java @@ -0,0 +1,267 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ +/** + * + * Date Author Changes Jul 17, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.searcher; + + +import java.io.*; +import java.util.ArrayList; +import java.util.Date; +import java.util.LinkedList; +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.queryparser.classic.ParseException; +import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopScoreDocCollector; + +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.NIOFSDirectory; +import org.apache.lucene.util.Version; +import org.apache.lucene.queryparser.classic.ParseException; + +/** + * TODO- describe the purpose of the class + * + */ +public class Search +{ + + public static void searchPage( File indexDir, String q, String filed, int hitsPerPage ) + throws Exception + { + + FileWriter outFile; + //= new FileWriter("C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\pages_page_namespace_0.txt",true); + + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer( Version.LUCENE_43 ); + NIOFSDirectory dir = new NIOFSDirectory( indexDir ); + Query query = new QueryParser( Version.LUCENE_43, filed, analyzer ).parse( q ); + + + + IndexReader reader = IndexReader.open( dir ); + IndexSearcher searcher = new IndexSearcher( reader ); + TopScoreDocCollector collector = TopScoreDocCollector.create( hitsPerPage, true ); + searcher.search( query, collector ); + ScoreDoc[] hits = collector.topDocs().scoreDocs; + + + System.out.println( "Found " + hits.length + " hits." 
); + + + for ( int i = 0; i < hits.length; ++i ) + { + outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\pages_page_namespace_0.txt", true ); + + int docId = hits[i].doc; + Document d = searcher.doc( docId ); + outFile.append( d.get( "page_id" ) + "\t" + d.get( "page_namespace" ) + "\t" + d.get( "page_title" ) + "\n" ); + //System.out.println((i + 1) + ". " + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + outFile.close(); + } + + } + + public static void searchCategoryLinks( File indexDir, String filed, int hitsPerPage ) + throws Exception + { + + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer( Version.LUCENE_43 ); + NIOFSDirectory dir = new NIOFSDirectory( indexDir ); + IndexReader reader = IndexReader.open( dir ); + IndexSearcher searcher = new IndexSearcher( reader ); + + + + + File pageNamespaceResultFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\pages_page_namespace_0_new.txt" ); + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( pageNamespaceResultFile ) ); + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + String[] strArr = line.split( "\\t" ); + FileWriter outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\categorylinks_match_article_pages.txt", true ); + + + TopScoreDocCollector collector = TopScoreDocCollector.create( hitsPerPage, true ); + Query query = new QueryParser( Version.LUCENE_43, filed, analyzer ).parse( strArr[0].trim() ); + searcher.search( query, collector ); + ScoreDoc[] hits = collector.topDocs().scoreDocs; + + + System.out.println( strArr[0] + "\t" + hits.length ); + + + for ( int i = 0; i < hits.length; ++i ) + { + int docId = hits[i].doc; + Document d = searcher.doc( docId ); + outFile.append( d.get( "cl_from" ) + "\t" + d.get( "cl_to" ) + "\t" + d.get( "cl_sortkey" ) 
+ d.get( "cl_type" ) + "\n" ); + //System.out.println((i + 1) + ". " + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + } + outFile.close(); + } + } + + + } + + public static void searchCategory( File indexDir, String filed, int hitsPerPage ) + throws Exception + { + + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer( Version.LUCENE_43 ); + NIOFSDirectory dir = new NIOFSDirectory( indexDir ); + IndexReader reader = IndexReader.open( dir ); + IndexSearcher searcher = new IndexSearcher( reader ); + + + + + File uniqueCatFile = new File( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\sorted_f2_categorylinks_match_article_pages.txt" ); + String line; + BufferedReader fileReader; + fileReader = new BufferedReader( new FileReader( uniqueCatFile ) ); + while ( ( line = fileReader.readLine() ) != null ) + { + if ( !line.isEmpty() ) + { + // String[] strArr = line.split( "\\t" ); + FileWriter outFile = new FileWriter( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\CategoryProcesor\\results_dir\\categories_match_article_pages.txt", true ); + + + TopScoreDocCollector collector = TopScoreDocCollector.create( hitsPerPage, true ); + Query query = new QueryParser( Version.LUCENE_43, filed, analyzer ).parse( "'" + line.trim() + "'" ); + searcher.search( query, collector ); + ScoreDoc[] hits = collector.topDocs().scoreDocs; + + if ( hits.length == 0 ) + { + System.out.println( line ); + } + + for ( int i = 0; i < hits.length; ++i ) + { + int docId = hits[i].doc; + Document d = searcher.doc( docId ); + outFile.append( d.get( "cat_id" ) + "\t" + d.get( "cat_title" ) + "\n" ); + //System.out.println((i + 1) + ". 
" + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + } + outFile.close(); + } + } + + + } + + public static ArrayList SearchCatPageLinks( int pageID ) throws IOException, ParseException + { + + File indexDir = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\index\\category_page_links_view" ); + String filed = "page_id"; + int hitsPerPage = 100; + + ArrayList clToResults = new ArrayList(); + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer( Version.LUCENE_43 ); + NIOFSDirectory dir = new NIOFSDirectory( indexDir ); + IndexReader reader = IndexReader.open( dir ); + IndexSearcher searcher = new IndexSearcher( reader ); + + + TopScoreDocCollector collector = TopScoreDocCollector.create( hitsPerPage, true ); + Query query = new QueryParser( Version.LUCENE_43, filed, analyzer ).parse( "" + pageID + "" ); + searcher.search( query, collector ); + ScoreDoc[] hits = collector.topDocs().scoreDocs; + + if ( hits.length == 0 ) + { + System.out.println( pageID ); + } else + { + // System.out.println( hits.length ); + for ( int i = 0; i < hits.length; ++i ) + { + int docId = hits[i].doc; + Document d = searcher.doc( docId ); + clToResults.add( d.get( "page_title" ) ); + // outFile.append( d.get( "cat_id" ) + "\t" + d.get( "cat_title" ) + "\n" ); + //System.out.println((i + 1) + ". 
" + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + } + } + reader.close(); + dir.close(); + return clToResults; + } + + public static LinkedList SearchCategoryPages( String pageTitle ) throws IOException, ParseException + { + + File indexDir = new File( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\index\\categoty_page_candidate_index" ); + String filed = "page_title"; + int hitsPerPage = 5; + + + + LinkedList clToResults = new LinkedList(); + WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer( Version.LUCENE_43 ); + NIOFSDirectory dir = new NIOFSDirectory( indexDir ); + IndexReader reader = IndexReader.open( dir ); + IndexSearcher searcher = new IndexSearcher( reader ); + FileWriter outFile2; + try{ + + TopScoreDocCollector collector = TopScoreDocCollector.create( hitsPerPage, true ); + Query query = new QueryParser( Version.LUCENE_43, filed, analyzer ).parse( pageTitle ); + searcher.search( query, collector ); + ScoreDoc[] hits = collector.topDocs().scoreDocs; + + if ( hits.length == 0 ) + { + System.out.println( pageTitle ); + } else + { + // System.out.println( hits.length ); + for ( int i = 0; i < hits.length; ++i ) + { + int docId = hits[i].doc; + Document d = searcher.doc( docId ); + clToResults.add( Integer.valueOf( d.get( "page_id" )) ); + // outFile.append( d.get( "cat_id" ) + "\t" + d.get( "cat_title" ) + "\n" ); + //System.out.println((i + 1) + ". 
" + d.get("page_id") + "\t" + d.get("page_namespace")+ "\t" + d.get("page_title")); + } + } + + return clToResults; + } + catch (ParseException e){ + outFile2 = new FileWriter( "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\data_not_inserted_node_table\\pages_can't_parse.txt", true ); + outFile2.append(pageTitle + "\n" ); + outFile2.close(); + + // System.out.println("Can't parse"+ pageTitle); + + return clToResults; + } + finally{ + + reader.close(); + dir.close(); + } + + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/ReadXMLFile.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/ReadXMLFile.java new file mode 100644 index 0000000..786b258 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/ReadXMLFile.java @@ -0,0 +1,134 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. + */ +/** + * + * Date Author Changes Sep 10, 2013 Kasun Perera Created + * + */ +package org.dbpedia.kasun.wikiquery; + + +/** + * TODO- describe the purpose of the class + * + */ +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.DocumentBuilder; +import org.w3c.dom.Document; +import org.w3c.dom.NodeList; +import org.w3c.dom.Node; +import org.w3c.dom.Element; +import java.io.File; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; + +public class ReadXMLFile +{ + + public static void ReadFile( String filename ) + { + //public static void ReadFile(File fXmlFile) { + try + { + + File fXmlFile = new File( filename ); + DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); + Document doc = dBuilder.parse( fXmlFile ); + + //optional, but recommended + //read this - http://stackoverflow.com/questions/13786607/normalization-in-dom-parsing-with-java-how-does-it-work + 
doc.getDocumentElement().normalize(); + + System.out.println( "Root element :" + doc.getDocumentElement().getNodeName() ); + + NodeList nList = doc.getElementsByTagName( "rev" ); + + System.out.println( "----------------------------" ); + + for ( int temp = 0; temp < nList.getLength(); temp++ ) + { + + Node nNode = nList.item( temp ); + + System.out.println( "Current Element :" + nNode.getNodeName() ); + + if ( nNode.getNodeType() == Node.ELEMENT_NODE ) + { + + + Element eElement = (Element) nNode; + + System.out.println( "Revision22222 id : " + eElement.getAttribute( "revid" ) ); +// System.out.println("First Name : " + eElement.getElementsByTagName("firstname").item(0).getTextContent()); +// System.out.println("Last Name : " + eElement.getElementsByTagName("lastname").item(0).getTextContent()); +// System.out.println("Nick Name : " + eElement.getElementsByTagName("nickname").item(0).getTextContent()); +// System.out.println("Salary : " + eElement.getElementsByTagName("salary").item(0).getTextContent()); + + } + } + } catch ( Exception e ) + { + e.printStackTrace(); + } + } + + public static int ReadFile( Document doc ,String urlParameters, String url) throws UnsupportedEncodingException + { + + int numberOfRevisions=0; + //public static void ReadFile(File fXmlFile) { + try + { + doc.getDocumentElement().normalize(); + + // System.out.println( "Root element :" + doc.getDocumentElement().getNodeName() ); + + NodeList continueNodeList = doc.getElementsByTagName( "revisions" ); + if ( continueNodeList.getLength() > 0 ) + { + Node continueNode = continueNodeList.item( 0 ); + + Element continueElement = (Element) continueNode; + // String urlParameters = "fName=" + URLEncoder.encode( "???", "UTF-8" ) + "&lName=" + URLEncoder.encode( "???", "UTF-8" ); + // String url = "http://en.wikipedia.org/w/api.php?action=query&format=xml&prop=revisions&titles=Mother&rvlimit=max&rvstart=20130604000000&rvcontinue="+continueElement.getAttribute( "rvcontinue" ); + + // 
System.out.println("Calling recursive function using rivision Id "+ continueElement.getAttribute( "rvcontinue" )); + numberOfRevisions=ReadFile(RevisionHistory.excutePost( url+ "&rvcontinue="+continueElement.getAttribute( "rvcontinue" ), urlParameters ),urlParameters, url ); + + // System.out.println( "Continue revision Id : " + continueElement.getAttribute( "rvcontinue" ) ); + } + + NodeList nList = doc.getElementsByTagName( "rev" ); + + // System.out.println( "number of nodes" + nList.getLength()); +/* + for ( int temp = 0; temp < nList.getLength(); temp++ ) + { + + Node nNode = nList.item( temp ); + + // System.out.println( "\nCurrent Element :" + nNode.getNodeName() + " count: " + temp ); + + if ( nNode.getNodeType() == Node.ELEMENT_NODE ) + { + + Element eElement = (Element) nNode; + + System.out.println( "Revision id : " + eElement.getAttribute( "revid" ) ); + + } + } + */ + + return numberOfRevisions+ nList.getLength(); + } catch ( Exception e ) + { + e.printStackTrace(); + return 0; + + } + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/RevisionHistory.java b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/RevisionHistory.java new file mode 100644 index 0000000..58bdca9 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/dbpedia/kasun/wikiquery/RevisionHistory.java @@ -0,0 +1,110 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
/**
 * Sends an HTTP request and parses the XML response into a DOM document.
 */
public class RevisionHistory
{

    /** Connect and read timeout, so an unresponsive server cannot block the caller forever. */
    private static final int TIMEOUT_MILLIS = 30000;

    /**
     * Posts {@code urlParameters} to {@code targetURL} and returns the response
     * body parsed as XML.
     *
     * <p>The request method is now declared as POST. The previous code declared
     * GET but still wrote a request body; the reference JDK implementation
     * switches such a request to POST, so the request on the wire should be
     * unchanged - NOTE(review): confirm on the target runtime.
     *
     * <p>Other changes: both streams are closed on every path, a {@code null}
     * body is sent as an empty body instead of failing, and the XML parser no
     * longer resolves external entities or external DTDs.
     *
     * @param targetURL absolute HTTP URL, including any query string
     * @param urlParameters request body; may be {@code null} or empty
     * @return the parsed response, or {@code null} if anything fails (the cause
     *         is printed to standard error)
     */
    public static Document excutePost( String targetURL, String urlParameters )
    {
        HttpURLConnection connection = null;
        try
        {
            connection = (HttpURLConnection) new URL( targetURL ).openConnection();
            connection.setRequestMethod( "POST" );
            connection.setRequestProperty( "Accept", "application/xml" );
            connection.setConnectTimeout( TIMEOUT_MILLIS );
            connection.setReadTimeout( TIMEOUT_MILLIS );
            connection.setUseCaches( false );
            connection.setDoInput( true );
            connection.setDoOutput( true );

            // Send the request body.
            try ( OutputStream body = connection.getOutputStream() )
            {
                if ( urlParameters != null )
                {
                    body.write( urlParameters.getBytes( "UTF-8" ) );
                }
            }

            // Read and parse the response.
            try ( InputStream response = connection.getInputStream() )
            {
                return newDocumentBuilder().parse( response );
            }

        } catch ( Exception e )
        {
            e.printStackTrace();
            return null;

        } finally
        {
            if ( connection != null )
            {
                connection.disconnect();
            }
        }
    }

    /**
     * Creates a DOM parser that does not fetch external entities or external
     * DTDs, because the parsed document comes from the network.
     */
    private static DocumentBuilder newDocumentBuilder() throws javax.xml.parsers.ParserConfigurationException
    {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setFeature( "http://xml.org/sax/features/external-general-entities", false );
        factory.setFeature( "http://xml.org/sax/features/external-parameter-entities", false );
        factory.setFeature( "http://apache.org/xml/features/nonvalidating/load-external-dtd", false );
        factory.setXIncludeAware( false );
        factory.setExpandEntityReferences( false );
        return factory.newDocumentBuilder();
    }
}
+ */ + + + +/** + * KarshaAnnotate- Annotation tool for financial documents + * + * + * Date Author Changes + * Sep 10, 2013 Kasun Perera Created + * + */ + +package org.dbpedia.kasun.wikiquery; + + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; + + + +/** + * TODO- describe the purpose of the class + * + */ +public class WikiQuery { + + public static void main(String[] args ) throws UnsupportedEncodingException { + + int pageId=83430; + + String urlParameters = "fName=" + URLEncoder.encode("???", "UTF-8") + "&lName=" + URLEncoder.encode("???", "UTF-8"); + //timestamp June 4th, 2013 00:00:00 UTC=20130604000000 + // String url="http://en.wikipedia.org/w/api.php?action=query&format=xml&prop=revisions&titles=Mother&rvlimit=max&rvstart=20130604000000"; + String url="http://en.wikipedia.org/w/api.php?action=query&format=xml&prop=revisions&pageids="+pageId+"&rvlimit=max&rvstart=20130604000000"; + + //pageid + // RevisionHistory.excutePost( url, urlParameters ); + // ReadXMLFile.ReadFile( "C:\\Users\\lsf\\Documents\\NetBeansProjects\\WikipediaCategoryProcessor\\api.xml"); + int totalRevisions= ReadXMLFile.ReadFile(RevisionHistory.excutePost( url, urlParameters ),urlParameters,url); + System.out.println("totalRevisions "+ totalRevisions); + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/D.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/D.java new file mode 100644 index 0000000..d59d5b5 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/D.java @@ -0,0 +1,417 @@ +package org.yago.javatools.administrative; + +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStream; +import java.io.Writer; +import java.util.ArrayList; +import java.util.Collection; +import 
java.util.Collections; +import java.util.Comparator; +import java.util.EnumSet; +import java.util.List; +import java.util.Map; + +import org.yago.javatools.parsers.Char; + +/** + This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). + It is licensed under the Creative Commons Attribution License + (see http://creativecommons.org/licenses/by/3.0) by + the YAGO-NAGA team (see http://mpii.de/yago-naga). + + + + + + This class provides convenience methods for Input/Output. + Allows to do basic I/O with easy procedure calls + -- nearly like in normal programming languages. + Furthermore, the class provides basic set operations for EnumSets, NULL-safe + comparisons and adding to maps.
+ Example: +
+ D.p("This is an easy way to write a string");
+ // And this is an easy way to read one:
+ String s=D.r();
+ 
+ // Here is a cool way to print something inline
+ computeProduct(factor1,(Integer)D.p(factor2));
+ 
+ // Here are some tricks with enums
+ enum T {a,b,c};
+ EnumSet<T> i=D.intersection(EnumSet.of(T.a,T.b),EnumSet.of(T.b,T.c));
+ EnumSet<T> u=D.union(EnumSet.of(T.a,T.b),EnumSet.of(T.b,T.c));
+ 
+ // Here is how to compare things, even if they are NULL
+ D.compare(object1, object2);
+ 
+ // Here is how to add something to maps that contain lists
+ Map<String,List<String>> string2list=new TreeMap<String,List<String>>();
+ D.addKeyValue(string2list,"key","new list element",ArrayList.class); 
+ // now, the map contains "key" -> [ "new list element" ]
+ D.addKeyValue(string2list,"key","again a new list element",ArrayList.class);
+ // now, the map contains "key" -> [ "new list element", "again a new list element" ]  
+
+ // Here is how to add something to maps that contain integers
+ Map<String,Integer> string2int=new TreeMap<String,Integer>();
+ D.addKeyValue(string2int,"key",7); // map now contains "key" -> 7
+ D.addKeyValue(string2int,"key",3); // map now contains "key" -> 10
+
+ 
+ */ +public class D { + + /** Indentation margin. All methods indent their output by indent spaces */ + public static int indent = 0; + + /** Prints spaces */ + protected static void i() { + for (int i = 0; i < indent; i++) + System.out.print(" "); + } + + /** Prints some Objects, returns them */ + public static Object p(Object... a) { + pl(a); + if (a == null || a.length == 0) return (null); + if (a.length == 1) return (a[0]); + return (a); + } + + /** Prints some Objects */ + public static Object println(Object... a) { + return (p(a)); + } + + + + /** Prints some Objects on one line */ + public static void pl(Object... a) { + //System.out.print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@"); + System.out.print(toString(a)); + } + + /** Prints an array of integers*/ + public static int[] p(int[] a) { + i(); + if (a == null) System.out.print("null-array"); + else for (int i = 0; i < a.length; i++) + System.out.print(a[i] + ", "); + System.out.println(""); + return (a); + } + + /** Prints an array of doubles*/ + public static double[] p(double[] a) { + i(); + if (a == null) System.out.print("null-array"); + else for (int i = 0; i < a.length; i++) + System.out.print(a[i] + ", "); + System.out.println(""); + return (a); + } + + /** Reads a line from the keyboard */ + public static String r() { + String s = ""; + i(); + try { + s = new BufferedReader(new InputStreamReader(System.in)).readLine(); + } catch (Exception whocares) { + } + return (s); + } + + /** Reads a line from the keyboard */ + public static String read() { + return (r()); + } + + /** Reads a long from the keyboard */ + public static String read(String question) { + System.out.print(question+" "); + return (D.read()); + } + + /** Reads a long from the keyboard */ + public static boolean readBoolean(String question) { + System.out.print(question+" "); + return (D.read().startsWith("y")); + } + + /** Reads a long from the keyboard */ + public static long readLong(String question) { + System.out.print(question); + 
return (Long.parseLong(D.r())); + } + + /** Reads a double from the keyboard */ + public static double readDouble(String question) { + System.out.print(question); + return (Double.parseDouble(D.r())); + } + + /** Waits for a number of milliseconds */ + public static void waitMS(long milliseconds) { + try { + Thread.sleep(milliseconds); + } catch (InterruptedException ex) { + } + } + + /** Returns the intersection of two enumsets */ + public static > EnumSet intersection(EnumSet s1, EnumSet s2) { + // We have to clone, since retainAll modifies the set + EnumSet s = s1.clone(); + s.retainAll(s2); + // I tried coding this for arbitrary sets, but it failed because + // the interface Cloneable does not make sure that the clone-method + // is visible (!) + return (s); + } + + /** Returns the union of two enumsets */ + public static > EnumSet union(EnumSet s1, EnumSet s2) { + EnumSet s = s1.clone(); + s.addAll(s2); + return (s); + } + + /** Tells whether the intersection is non-empty */ + public static > boolean containsOneOf(EnumSet s1, EnumSet s2) { + return (!intersection(s1, s2).isEmpty()); + } + + /** Exits with error code 0 */ + public static void exit() { + System.exit(0); + } + + /** Writes a line to a writer. Yes, this is possible */ + public static void writeln(Writer out, Object s) throws IOException { + out.write(s.toString()); + out.write("\n"); + } + + /** Writes a line to a writer. Yes, this is possible */ + public static void writeln(OutputStream out, Object s) throws IOException { + String string = Char.encodeUTF8(s.toString()); + for (int i = 0; i < string.length(); i++) + out.write(string.charAt(i)); + out.write('\n'); + } + + /** Writes a line silently to a writer. 
*/ + public static void silentWriteln(Writer out, Object s) { + try { + out.write(s.toString()); + out.write("\n"); + } catch (Exception e) { + } + } + + /** Executes a command */ + public static void execute(String cmd, File folder) throws Exception { + Process p = Runtime.getRuntime().exec(cmd, null, folder); + BufferedReader bri = new BufferedReader(new InputStreamReader(p.getInputStream())); + BufferedReader bre = new BufferedReader(new InputStreamReader(p.getErrorStream())); + String s1, s2 = null; + while (null != (s1 = bri.readLine()) || null != (s2 = bre.readLine())) { + if (s1 != null) System.out.println(s1); + if (s2 != null) System.err.println(s2); + } + p.waitFor(); + } + + /** Given a map that maps to collections, adds a new key/value pair or introduces the key*/ + @SuppressWarnings({ "unchecked", "rawtypes" }) + public static , L extends Collection> void addKeyValue(Map map, K key, V value, Class collectionType) { + C coll = map.get(key); + if (coll == null) { + try { + map.put(key, coll = (C) collectionType.newInstance()); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + coll.add(value); + } + + /** Given a map that maps to collections, adds a new key/value pair or introduces the key*/ + @SuppressWarnings({ "rawtypes" }) + public static , L extends Collection> void addKeyValues(Map map, K key, C values, Class collectionType) { + for(V val : values) addKeyValue(map,key,val,collectionType); + } + + /** Given a map that maps to integers, adds a new key/value pair or increases the counter*/ + public static void addKeyValue(Map map, K key, int value) { + Integer coll = map.get(key); + if (coll == null) { + map.put(key, value); + return; + + } + map.put(key, coll + value); + } + + /** Given a map that maps to floats, adds a new key/value pair or increases the counter*/ + public static void addKeyValueFlt(Map map, K key, float value) { + Float coll = map.get(key); + if (coll == null) { + map.put(key, value); + return; + } + map.put(key, 
coll + value); + } + + /** Given a map that maps to doubles, adds a new key/value pair or increases the counter*/ + public static void addKeyValueDbl(Map map, K key, double value) { + Double coll = map.get(key); + if (coll == null) { + map.put(key, value); + return; + + } + map.put(key, coll + value); + } + + /** Given a map that maps to comparable objects, sets a key to a given value iff the current value is null or smaller than the given value*/ + public static > void setKeyValueIfGreaterThanCurrent(Map map, K key, V value) { + V coll = map.get(key); + if (coll == null) { + map.put(key, value); + return; + } + if(coll.compareTo(value)<0) + map.put(key, value); + } + + + /** Returns the element of a map or 0*/ + public static int getOrZero(Map map, K key) { + Integer i = map.get(key); + if (i == null) return (0); + return (i); + } + + /** Returns the element of a map or 0*/ + public static double getOrZeroDouble(Map map, K key) { + Double i = map.get(key); + if (i == null) return (0); + return (i); + } + + /** Returns the element of a map or a default value*/ + public static V getOr(Map map, K key, V defValue) { + V i = map.get(key); + if (i == null) return defValue; + return (i); + } + + /** Returns a sorted list of the items*/ + public static List sorted(final Map map) { + List list=new ArrayList(map.keySet()); + Collections.sort(list,new Comparator(){ + + @Override + public int compare(T arg0, T arg1) { + return (map.get(arg1).compareTo(map.get(arg0))); + }}); + return(list); + } + + /** Returns a sorted list of the items*/ + public static List sortedDouble(final Map map) { + List list=new ArrayList(map.keySet()); + Collections.sort(list,new Comparator(){ + + @Override + public int compare(T arg0, T arg1) { + return (map.get(arg1).compareTo(map.get(arg0))); + }}); + return(list); + } + + /** Returns true if two things are equal, including NULL */ + public static boolean equal(E s1, E s2) { + if (s1 == s2) return (true); + if (s1 == null) return (false); + if 
(s2 == null) return (false); + return (s1.equals(s2)); + } + + /** Compares two things, including NULL */ + public static > int compare(E s1, E s2) { + if (s1 == s2) return (0); + if (s1 == null) return (-1); + if (s2 == null) return (1); + return (s1.compareTo(s2)); + } + + /** Compares pairs of comparable things (a1,a2,b1,b2,...), including NULL */ + @SuppressWarnings("unchecked") + public static int comparePairs(Object... o) { + for (int i = 0; i < o.length; i += 2) { + int c = compare((Comparable) o[i], (Comparable) o[i + 1]); + if (c != 0) return (c); + } + return (0); + } + + /** Compares pairs of comparable things (a1,a2,b1,b2,...) for equality, including NULL */ + public static boolean equalPairs(Object... o) { + for (int i = 0; i < o.length; i += 2) { + if (!equal(o[i], o[i + 1])) return (false); + } + return (true); + } + + /** Returns the index of a thing in an array or -1*/ + public static int indexOf(Object o, Object... os) { + for (int i = 0; i < os.length; i++) { + if (D.equal(os[i], o)) return (i); + } + return (-1); + } + + /** TRUE if the first enum is before the second*/ + public static > boolean smaller(Enum e1, Enum e2) { + return (e1.ordinal() < e2.ordinal()); + } + + /** Returns a reasonable String representation of a sequence of things. Handles arrays, deep arrays and NULL.*/ + public static String toString(Object... 
o) { + if (o == null) { + return ("null"); + } + StringBuilder b = new StringBuilder(); + for (int i = 0; i < o.length; i++) { + if (o[i] == null) { + b.append("null"); + continue; + } + if (o[i].getClass().isArray()) { + b.append("["); + if (((Object[]) o[i]).length != 0) { + for (Object obj : (Object[]) o[i]) { + b.append(toString(obj)).append(", "); + } + } + b.append("]"); + } else { + b.append(o[i].toString()); + } + if (i != o.length - 1) b.append(" "); + } + return (b.toString()); + } + + /** Picks one element from a set or NULL*/ + public static T pick(Collection set) { + if(set.isEmpty()) return(null); + return(set.iterator().next()); + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/Elements.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/Elements.java new file mode 100644 index 0000000..79947de --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/administrative/Elements.java @@ -0,0 +1,64 @@ +/* + * To change this template, choose Tools | Templates + * and open the template in the editor. 
+ */ +/** + * + * Date Author Changes Sep 4, 2013 Kasun Perera Created + * + */ +package org.yago.javatools.administrative; + + +import org.yago.javatools.parsers.NounGroup; + +/** + * TODO- describe the purpose of the class + * + */ +public class Elements +{ + + public static void main( String[] args ) throws Exception + { + + System.out.println( getHead( "booooooooo" ) ); + + } + + public static String getHead( String category ) + { + + String elementList[] = splitObject( new NounGroup( category ).description() ); + if ( elementList == null || elementList.length == 0 ) + { + return ( null ); + } + /* + * lelemnts of the elementList + * [0]"NounGroup: + * [1]Original: "+original+" + * [2]Stemmed: "+stemmed()+" + * [3]Determiner: "+determiner+" + * [4]preModifiers: "+preModifier+" + * [5]Head: "+head+" + * [6]Adjective:"+adjective+" + * [7]Preposition: "+preposition+" + * [8]postModifier:\n"+(postModifier==null?"":postModifier.description())); + * + */ + String head[] = elementList[5].split( ":" ); + if(head.length<1){ + return (null); + } + + return (head[1].trim()); + } + + public static String[] splitObject( Object... a ) + { + String objectlist[] = D.toString( a ).split( "\\n" ); + + return objectlist; + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalMap.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalMap.java new file mode 100644 index 0000000..c13c10e --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalMap.java @@ -0,0 +1,41 @@ +package org.yago.javatools.datatypes; +import java.util.TreeMap; + +/** +This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). +It is licensed under the Creative Commons Attribution License +(see http://creativecommons.org/licenses/by/3.0) by +the YAGO-NAGA team (see http://mpii.de/yago-naga). + + + + + +Provides a nicer constructor for a TreeMap. +Example: +
+   FinalMap f=new FinalMap(
+     "a",1,
+     "b",2,
+     "c",3);
+   System.out.println(f.get("b"));
+   --> 2
+
+*/ +public class FinalMap extends TreeMap{ + /** Constructs a FinalMap from an array that contains key/value sequences */ + @SuppressWarnings("unchecked") + public FinalMap(Object... a) { + super(); + for(int i=0;i f=new FinalMap("a",1,"b",2); + System.out.println(f.get("b")); + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalSet.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalSet.java new file mode 100644 index 0000000..0a27044 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/datatypes/FinalSet.java @@ -0,0 +1,73 @@ +package org.yago.javatools.datatypes; +import java.util.AbstractList; +import java.util.Arrays; +import java.util.Set; + +import org.yago.javatools.administrative.D; +/** +This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). +It is licensed under the Creative Commons Attribution License +(see http://creativecommons.org/licenses/by/3.0) by +the YAGO-NAGA team (see http://mpii.de/yago-naga). + + + + + +This class provides a very simple container implementation with zero overhead. +A FinalSet bases on a sorted, unmodifiable array. The constructor +can either be called with a sorted unmodifiable array (default constructor) +or with an array that can be cloned and sorted beforehand if desired. +Example: +
+   FinalSet f=new FinalSet("a","b","c");
+   // equivalently: 
+   //   FinalSet f=new FinalSet(new String[]{"a","b","c"});
+   //   FinalSet f=new FinalSet(SHALLNOTBECLONED,ISSORTED,"a","b","c");
+   System.out.println(f.get(1));
+   --> b
+
+*/ +public class FinalSet extends AbstractList implements Set{ + /** Holds the data, must be sorted */ + public T[] data; + /** Constructs a FinalSet from an array, clones and sorts the array if indicated. */ + @SuppressWarnings("unchecked") + public FinalSet(boolean clone,T... a) { + if(clone) { + Comparable[] b=new Comparable[a.length]; + System.arraycopy(a,0,b,0,a.length); + a=(T[])b; + } + Arrays.sort(a); + data=a; + } + /** Constructs a FinalSet from an array that does not need to be cloned */ + public FinalSet(T... a) { + this(false,a); + } + /** Tells whether x is in the container */ + public boolean contains(T x) { + return(Arrays.binarySearch(data,x)>=0); + } + /** Returns the position in the array or -1 */ + public int indexOf(T x) { + int r=Arrays.binarySearch(data,x); + return(r>=0?r:-1); + } + /** Returns the element at position i*/ + public T get(int i) { + return(data[i]); + } + + /** Returns the number of elements in this FinalSet */ + public int size() { + return(data.length); + } + + /** Test routine */ + public static void main(String[] args) { + FinalSet f=new FinalSet("b","a","c"); + D.p(f.get(1)); + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/Char.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/Char.java new file mode 100644 index 0000000..0abf63a --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/Char.java @@ -0,0 +1,1404 @@ +package org.yago.javatools.parsers; + +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.Map; +import java.util.TreeMap; + +import org.yago.javatools.datatypes.FinalMap; + +/** +This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). +It is licensed under the Creative Commons Attribution License +(see http://creativecommons.org/licenses/by/3.0) by +the YAGO-NAGA team (see http://mpii.de/yago-naga). 
+ + + + + This class provides static methods to decode, encode and normalize Strings.
+ Decoding converts the following codes to Java 16-bit characters (char): +
    +
  • all HTML ampersand codes (like &nbsp;) as specified by the W3C +
  • all backslash codes (like \ b) as specified by the Java language specification +
  • all percentage codes (like %2C) as used in URLs and E-Mails +
  • all UTF-8 codes (like Ä«) as specified in Wikipedia +
+

+ Encoding is the inverse operation. It takes a Java 16-bit character (char) and + outputs its encoding in HTML, as a backslash code, as a percentage code or in UTF8. +

+ Normalization converts the following Unicode characters (Java 16-bit chars) + to ASCII-characters in the range 0x20-0x7F: +

    +
  • all ASCII control characters (0x00-0x1F) +
  • all Latin-1 characters (0x80-0xFF) to the closest transliteration +
  • all Latin Extended-A characters (0x100-0x17F) to the closest transliteration +
  • all Greek characters (0x374-0x3D6) to the closest transliteration as specified in Wikipedia +
  • all General-Punctuation characters (0x2016-0x2055) to the closest ASCII punctuation +
  • most mathematical symbols (in the range of 0x2000) to the common program code identifier or text +
  • all ligatures (0xFB00-0xFB06, the nasty things you get when you copy/paste from PDFs) to + the separate characters +
+

+

Usage

+

+ Decoding is done by methods that "eat" a code from the string. + They require as an additional parameter an integer array of length 1, + in which they store the length of the code that they chopped off.
+ Example: +

+     int[] eatLength=new int[1];
+     char c=eatPercentage("%2Cblah blah",eatLength);
+     -->  c=','
+          eatLength[0]=3  // the code was 3 characters long
+  
+ There is a static integer array Char.eatLength, which you can use for this purpose. + The methods store 0 in case the String does not start with the correct code. + They store -1 in case the String starts with a corrupted code. Of course, you can + use the eat... methods also to decode one single code. There are methods + decode... that decode the percentage code, the UTF8-codes, the backslash codes + or the Ampersand codes, respectively. + The method decode(String) decodes all codes of a String.
+ Example: +
+     decode("This String contains some codes: &amp; %2C \ u0041");
+     --> "This String contains some codes: & , A"
+  
+

+ Normalization is done by the method normalize(int c). It converts a Unicode + character (a 16-bit Java character char) + to a sequence of normal characters (i.e. characters in the range 0x20-0x7F). + The transliteration may consist of multiple chars (e.g. for umlauts) and also of no + chars at all (e.g. for Unicode Zero-Space-Characters).
+ Example: +

+    normalize('ä');
+    --> "ae"
+  
+ The method normalize(String) normalizes all characters in a String.
+ Example: +
+     normalize("This String contains the umlauts Ä, Ö and Ü");
+     -->  "This String contains the umlauts Ae, Oe and Ue"
+  
+ If the method cannot find a normalization, it calls defaultNormalizer.apply(char c). + Decoding and normalizing can be combined by the method decodeAndNormalize(String s). +

+ Encoding is done by methods called encode...(char). These methods take a character + and transform it to a UTF8 code, a percentage code, an ampersand code or a backslash code, + respectively. If the character is normal (i.e. in the range 0x20-0x7F), they simply return the input + character without any change.
+ Example: +

+     encodePercentage('Ä');
+     -->  "%C4"
+  
+ There are also methods that work on entire Strings
+ Example: +
+     encodePercentage("This String contains the umlauts Ä, Ö and Ü");
+     -->  "This String contains the umlauts %C4, %D6 and %DC"
+  
+

+ Last, this class provides the character categorization for URIs, as given in + http://tools.ietf.org/html/rfc3986 . It also provides a method to encode only those + characters that are not valid path component characters
+ Example: +

+     isReserved(';');
+     -->  true
+     encodeURIPathComponent("a: b")
+     -->  "a:%20b"
+  
+ */ +public class Char { + + /** Defines just one function from an int to a String */ + public interface Char2StringFn { + + /** Function from a char to a String */ + String apply(char c); + } + + /** Called by normalize(int) in case the character cannot be normalized. + * The default implementation returns UNKNOWN. + * Feel free to create a new Char2StringFn and assign it to defaultNormalizer. */ + public static Char2StringFn defaultNormalizer = new Char2StringFn() { + + public String apply(char c) { + return (UNKNOWN); + } + }; + + /** String returned by the default implementation of defaultNormalizer, "[?]"*/ + public static String UNKNOWN = "[?]"; + + /** Maps a special character to a HTML ampersand sequence */ + public static Map charToAmpersand = new FinalMap('&', "&", '\'', "'", '<', "<", '>', ">", '"', """); + + /** Maps HTML ampersand sequences to strings */ + public static Map ampersandMap = new FinalMap("nbsp", (char) 160, "iexcl", (char) 161, "cent", (char) 162, "pound", (char) 163, "curren", (char) 164, "yen", (char) 165, "brvbar", (char) 166, "sect", + (char) 167, "uml", (char) 168, "copy", (char) 169, "ordf", (char) 170, "laquo", (char) 171, "not", (char) 172, "shy", (char) 173, "reg", (char) 174, "macr", (char) 175, "deg", (char) 176, "plusmn", (char) 177, "sup2", (char) 178, "sup3", + (char) 179, "acute", (char) 180, "micro", (char) 181, "para", (char) 182, "middot", (char) 183, "cedil", (char) 184, "sup1", (char) 185, "ordm", (char) 186, "raquo", (char) 187, "frac14", (char) 188, "frac12", (char) 189, "frac34", (char) 190, + "iquest", (char) 191, "Agrave", (char) 192, "Aacute", (char) 193, "Acirc", (char) 194, "Atilde", (char) 195, "Auml", (char) 196, "Aring", (char) 197, "AElig", (char) 198, "Ccedil", (char) 199, "Egrave", (char) 200, "Eacute", (char) 201, + "Ecirc", (char) 202, "Euml", (char) 203, "Igrave", (char) 204, "Iacute", (char) 205, "Icirc", (char) 206, "Iuml", (char) 207, "ETH", (char) 208, "Ntilde", (char) 209, "Ograve", (char) 210, 
"Oacute", (char) 211, "Ocirc", (char) 212, "Otilde", + (char) 213, "Ouml", (char) 214, "times", (char) 215, "Oslash", (char) 216, "Ugrave", (char) 217, "Uacute", (char) 218, "Ucirc", (char) 219, "Uuml", (char) 220, "Yacute", (char) 221, "THORN", (char) 222, "szlig", (char) 223, "agrave", + (char) 224, "aacute", (char) 225, "acirc", (char) 226, "atilde", (char) 227, "auml", (char) 228, "aring", (char) 229, "aelig", (char) 230, "ccedil", (char) 231, "egrave", (char) 232, "eacute", (char) 233, "ecirc", (char) 234, "euml", + (char) 235, "igrave", (char) 236, "iacute", (char) 237, "icirc", (char) 238, "iuml", (char) 239, "eth", (char) 240, "ntilde", (char) 241, "ograve", (char) 242, "oacute", (char) 243, "ocirc", (char) 244, "otilde", (char) 245, "ouml", + (char) 246, "divide", (char) 247, "oslash", (char) 248, "ugrave", (char) 249, "uacute", (char) 250, "ucirc", (char) 251, "uuml", (char) 252, "yacute", (char) 253, "thorn", (char) 254, "yuml", (char) 255, "fnof", (char) 402, "Alpha", + (char) 913, "Beta", (char) 914, "Gamma", (char) 915, "Delta", (char) 916, "Epsilon", (char) 917, "Zeta", (char) 918, "Eta", (char) 919, "Theta", (char) 920, "Iota", (char) 921, "Kappa", (char) 922, "Lambda", (char) 923, "Mu", (char) 924, "Nu", + (char) 925, "Xi", (char) 926, "Omicron", (char) 927, "Pi", (char) 928, "Rho", (char) 929, "Sigma", (char) 931, "Tau", (char) 932, "Upsilon", (char) 933, "Phi", (char) 934, "Chi", (char) 935, "Psi", (char) 936, "Omega", (char) 937, "alpha", + (char) 945, "beta", (char) 946, "gamma", (char) 947, "delta", (char) 948, "epsilon", (char) 949, "zeta", (char) 950, "eta", (char) 951, "theta", (char) 952, "iota", (char) 953, "kappa", (char) 954, "lambda", (char) 955, "mu", (char) 956, "nu", + (char) 957, "xi", (char) 958, "omicron", (char) 959, "pi", (char) 960, "rho", (char) 961, "sigmaf", (char) 962, "sigma", (char) 963, "tau", (char) 964, "upsilon", (char) 965, "phi", (char) 966, "chi", (char) 967, "psi", (char) 968, "omega", + (char) 969, "thetasym", 
(char) 977, "upsih", (char) 978, "piv", (char) 982, "bull", (char) 8226, "hellip", (char) 8230, "prime", (char) 8242, "Prime", (char) 8243, "oline", (char) 8254, "frasl", (char) 8260, "weierp", (char) 8472, "image", + (char) 8465, "real", (char) 8476, "trade", (char) 8482, "alefsym", (char) 8501, "larr", (char) 8592, "uarr", (char) 8593, "rarr", (char) 8594, "darr", (char) 8595, "harr", (char) 8596, "crarr", (char) 8629, "lArr", (char) 8656, "uArr", + (char) 8657, "rArr", (char) 8658, "dArr", (char) 8659, "hArr", (char) 8660, "forall", (char) 8704, "part", (char) 8706, "exist", (char) 8707, "empty", (char) 8709, "nabla", (char) 8711, "isin", (char) 8712, "notin", (char) 8713, "ni", + (char) 8715, "prod", (char) 8719, "sum", (char) 8721, "minus", (char) 8722, "lowast", (char) 8727, "radic", (char) 8730, "prop", (char) 8733, "infin", (char) 8734, "ang", (char) 8736, "and", (char) 8743, "or", (char) 8744, "cap", (char) 8745, + "cup", (char) 8746, "int", (char) 8747, "there4", (char) 8756, "sim", (char) 8764, "cong", (char) 8773, "asymp", (char) 8776, "ne", (char) 8800, "equiv", (char) 8801, "le", (char) 8804, "ge", (char) 8805, "sub", (char) 8834, "sup", + (char) 8835, "nsub", (char) 8836, "sube", (char) 8838, "supe", (char) 8839, "oplus", (char) 8853, "otimes", (char) 8855, "perp", (char) 8869, "sdot", (char) 8901, "lceil", (char) 8968, "rceil", (char) 8969, "lfloor", (char) 8970, "rfloor", + (char) 8971, "lang", (char) 9001, "rang", (char) 9002, "loz", (char) 9674, "spades", (char) 9824, "clubs", (char) 9827, "hearts", (char) 9829, "diams", (char) 9830, "quot", (char) 34, "amp", (char) 38, "lt", (char) 60, "gt", (char) 62, + "OElig", (char) 338, "oelig", (char) 339, "Scaron", (char) 352, "scaron", (char) 353, "Yuml", (char) 376, "circ", (char) 710, "tilde", (char) 732, "ensp", (char) 8194, "emsp", (char) 8195, "thinsp", (char) 8201, "zwnj", (char) 8204, "zwj", + (char) 8205, "lrm", (char) 8206, "rlm", (char) 8207, "ndash", + (char) 8211, //0x2013 + "mdash", (char) 
8212, "lsquo", (char) 8216, "rsquo", (char) 8217, "sbquo", (char) 8218, "ldquo", (char) 8220, "rdquo", (char) 8221, "bdquo", (char) 8222, "dagger", (char) 8224, "Dagger", (char) 8225, "permil", (char) 8240, "lsaquo", + (char) 8249, "rsaquo", (char) 8250, "euro", (char) 8364, "apos", '\''); + + /** Maps characters to normalizations */ + public static Map normalizeMap = new TreeMap(); + static { + Object[] o = new Object[] { + // ASCII + (char) 7, + "BEEP", + (char) 9, + " ", + (char) 10, + "\n", + + // Latin-1 + (char) 160, + " ", + (char) 161, + "!", + (char) 162, + "cent", + (char) 163, + "pound", + (char) 164, + "currency", + (char) 165, + "yen", + (char) 166, + "|", + (char) 167, + "/", + (char) 169, + "(c)", + (char) 170, + "^a", + (char) 171, + "\"", + (char) 172, + "~", + (char) 173, + "", + (char) 174, + "(R)", + (char) 176, + "degree", + (char) 177, + "+/-", + (char) 178, + "^2", + (char) 179, + "^3", + (char) 180, + "'", + (char) 181, + "mu", + (char) 182, + "P", + (char) 183, + ".", + (char) 184, + ",", + (char) 185, + "^1", + (char) 186, + "^o", + (char) 187, + "\"", + (char) 188, + "1/4", + (char) 189, + "1/2", + (char) 190, + "3/4", + (char) 191, + "?", + (char) 0xC4, + "Ae", + (char) 0xD6, + "Oe", + (char) 0xDC, + "Ue", + (char) 0xDF, + "ss", + (char) 0xC6, + "Ae", + (char) 0xC7, + "C", + (char) 0xD0, + "D", + (char) 0xD1, + "N", + (char) 0xD7, + "x", + (char) 0xDD, + "Y", + (char) 0xDE, + "b", + (char) 0xF7, + "/", + (char) 0xFF, + "y", + + // Latin Extended-A + (char) 0x132, + "IJ", + (char) 0x134, + "J", + (char) 0x170, + "Ue", + (char) 0x174, + "W", + (char) 0x17F, + "f", + + // Greek + (char) 0x374, + "'", + (char) 0x375, + ",", + (char) 0x37A, + ",", + (char) 0x37E, + ";", + (char) 0x384, + "'", + (char) 0x385, + "'", + (char) 0x386, + "A", + (char) 0x387, + ".", + (char) 0x388, + "E", + (char) 0x380, + "I", + (char) 0x38C, + "O", + (char) 0x38E, + "Y", + (char) 0x38F, + "O", + (char) 0x390, + "i", + (char) 215, + "*", + (char) 913, + "A", + 
(char) 914, + "B", + (char) 915, + "G", + (char) 916, + "D", + (char) 917, + "E", + (char) 918, + "Z", + (char) 919, + "E", + (char) 920, + "Th", + (char) 921, + "I", + (char) 922, + "K", + (char) 923, + "L", + (char) 924, + "M", + (char) 925, + "N", + (char) 926, + "X", + (char) 927, + "O", + (char) 928, + "P", + (char) 929, + "R", + (char) 931, + "S", + (char) 932, + "T", + (char) 933, + "Y", + (char) 934, + "Ph", + (char) 935, + "Ch", + (char) 936, + "Ps", + (char) 937, + "O", + (char) 977, + "th", + (char) 978, + "y", + (char) 982, + "pi", + + // General Punctuation + (char) 0x2013, + "-", + (char) 0x2016, + "||", + (char) 0x2017, + "_", + (char) 0x2020, + "+", + (char) 0x2021, + "++", + (char) 0x2022, + "*", + (char) 0x2023, + "*", + (char) 0x2024, + ".", + (char) 0x2025, + "..", + (char) 0x2026, + "...", + (char) 0x2027, + ".", + (char) 0x2028, + "\n", + (char) 0x2030, + "/1000", + (char) 0x2031, + "/10000", + (char) 0x2032, + "'", + (char) 0x2033, + "''", + (char) 0x2034, + "'''", + (char) 0x2035, + "'", + (char) 0x2036, + "''", + (char) 0x2037, + "'''", + (char) 0x2038, + "^", + (char) 0x2039, + "\"", + (char) 0x203A, + "\"", + (char) 0x203B, + "*", + (char) 0x203C, + "!!", + (char) 0x203D, + "?!", + (char) 0x2041, + ",", + (char) 0x2042, + "***", + (char) 0x2043, + "-", + (char) 0x2044, + "/", + (char) 0x2045, + "[", + (char) 0x2046, + "]", + (char) 0x2047, + "??", + (char) 0x2048, + "?!", + (char) 0x2049, + "!?", + (char) 0x204A, + "-", + (char) 0x204B, + "P", + (char) 0x204C, + "<", + (char) 0x204D, + ">", + (char) 0x204F, + ";", + (char) 0x2050, + "-", + (char) 0x2051, + "**", + (char) 0x2052, + "./.", + (char) 0x2053, + "~", + (char) 0x2054, + "_", + (char) 0x2055, + "_", + + // Mathematical symbols + (char) 8465, + "I", + (char) 8476, + "R", + (char) 8482, + "(TM)", + (char) 8501, + "a", + (char) 8592, + "<-", + (char) 8593, + "^", + (char) 8594, + "->", + (char) 8595, + "v", + (char) 8596, + "<->", + (char) 8629, + "<-'", + (char) 8656, + "<=", + 
(char) 8657, + "^", + (char) 8658, + "=>", + (char) 8659, + "v", + (char) 8660, + "<=>", + (char) 8704, + "FOR ALL", + (char) 8706, + "d", + (char) 8707, + "EXIST", + (char) 8709, + "{}", + (char) 8712, + "IN", + (char) 8713, + "NOT IN", + (char) 8715, + "CONTAINS", + (char) 8719, + "PRODUCT", + (char) 8721, + "SUM", + (char) 8722, + "-", + (char) 8727, + "*", + (char) 8730, + "SQRT", + (char) 8733, + "~", + (char) 8734, + "INF", + (char) 8736, + "angle", + (char) 8743, + "&", + (char) 8744, + "|", + (char) 8745, + "INTERSECTION", + (char) 8746, + "UNION", + (char) 8747, + "INTEGRAL", + (char) 8756, + "=>", + (char) 8764, + "~", + (char) 8773, + "~=", + (char) 8776, + "~=", + (char) 8800, + "!=", + (char) 8801, + "==", + (char) 8804, + "=<", + (char) 8805, + ">=", + (char) 8834, + "SUBSET OF", + (char) 8835, + "SUPERSET OF", + (char) 8836, + "NOT SUBSET OF", + (char) 8838, + "SUBSET OR EQUAL", + (char) 8839, + "SUPERSET OR EQUAL", + (char) 8853, + "(+)", + (char) 8855, + "(*)", + (char) 8869, + "_|_", + (char) 8901, + "*", + (char) 8364, + "EUR", + + // Ligatures + (char) 0xFB00, + "ff", + (char) 0xFB01, + "fi", + (char) 0xFB02, + "fl", + (char) 0xFB03, + "ffi", + (char) 0xFB04, + "ffl", + (char) 0xFB05, + "ft", + (char) 0xFB06, + "st" }; + for (int i = 0; i < o.length; i += 2) + normalizeMap.put((Character) o[i], (String) o[i + 1]); + } + + /** Normalizes a character to a String of characters in the range 0x20-0x7F. + * Returns a String, because some characters are + * normalized to multiple characters (e.g. umlauts) and + * some characters are normalized to zero characters (e.g. special Unicode space chars). 
+ * Returns null for the EndOfFile character -1 */ + public static String normalize(int c) { + // EOF + if (c == -1) return (null); + + // ASCII chars + if (c >= ' ' && c <= 128) return ("" + (char) c); + + // Upper case + boolean u = Character.isUpperCase(c); + char cu = (char) Character.toUpperCase(c); + + // Check map + if (normalizeMap.get(cu) != null) return (u ? normalizeMap.get(cu) : normalizeMap.get(cu).toLowerCase()); + + // ASCII + if (c < ' ') return (""); + + // Latin-1 + if (cu >= 0xC0 && cu <= 0xC5) return (u ? "A" : "a"); + if (cu >= 0xC8 && cu <= 0xCB) return (u ? "E" : "e"); + if (cu >= 0xCC && cu <= 0xCF) return (u ? "I" : "i"); + if (cu >= 0xD2 && cu <= 0xD8) return (u ? "O" : "o"); + if (cu >= 0x80 && cu <= 0xA0) return (" "); + + // Latin Extended-A + if (cu >= 0x100 && cu <= 0x105) return (u ? "A" : "a"); + if (cu >= 0x106 && cu <= 0x10D) return (u ? "C" : "c"); + if (cu >= 0x10E && cu <= 0x111) return (u ? "D" : "d"); + if (cu >= 0x112 && cu <= 0x11B) return (u ? "E" : "e"); + if (cu >= 0x11C && cu <= 0x123) return (u ? "G" : "g"); + if (cu >= 0x124 && cu <= 0x127) return (u ? "H" : "h"); + if (cu >= 0x128 && cu <= 0x131) return (u ? "I" : "i"); + if (cu >= 0x136 && cu <= 0x138) return (u ? "K" : "k"); + if (cu >= 0x139 && cu <= 0x142) return (u ? "L" : "l"); + if (cu >= 0x143 && cu <= 0x14B) return (u ? "N" : "n"); + if (cu >= 0x14C && cu <= 0x14F) return (u ? "O" : "o"); + if (cu >= 0x150 && cu <= 0x153) return (u ? "Oe" : "oe"); + if (cu >= 0x156 && cu <= 0x159) return (u ? "R" : "r"); + if (cu >= 0x15A && cu <= 0x161) return (u ? "S" : "s"); + if (cu >= 0x161 && cu <= 0x167) return (u ? "T" : "t"); + if (cu >= 0x176 && cu <= 0x178) return (u ? "Y" : "y"); + if (cu >= 0x179 && cu <= 0x17E) return (u ? 
"Z" : "z"); + + // General Punctuation + if (cu >= 0x2000 && cu <= 0x200A) return (" "); + if (cu >= 0x200B && cu <= 0x200F) return (""); + if (cu >= 0x2010 && cu <= 0x2015) return ("--"); + if (cu >= 0x2018 && cu <= 0x201B) return ("'"); + if (cu >= 0x201C && cu <= 0x201F) return ("\""); + if (cu >= 0x2029 && cu <= 0x202F) return (" "); + if (cu >= 0x203E && cu <= 0x2040) return ("-"); + if (cu >= 0x2056 && cu <= 0x205E) return ("."); + + return (defaultNormalizer.apply((char) c)); + } + + /** Eats a String of the form "%xx" from a string, where + * xx is a hexadecimal code. If xx is a UTF8 code start, + * tries to complete the UTF8-code and decodes it.*/ + public static char eatPercentage(String a, int[] n) { + // Length 0 + if (!a.startsWith("%") || a.length() < 3) { + n[0] = 0; + return ((char) 0); + } + char c; + // Try to parse first char + try { + c = (char) Integer.parseInt(a.substring(1, 3), 16); + } catch (Exception e) { + n[0] = -1; + return ((char) 0); + } + // For non-UTF8, return the char + int len = Utf8Length(c); + n[0] = 3; + if (len <= 1) return (c); + // Else collect the UTF8 + String dec = "" + c; + for (int i = 1; i < len; i++) { + try { + dec += (char) Integer.parseInt(a.substring(1 + i * 3, 3 + i * 3), 16); + } catch (Exception e) { + return (c); + } + } + // Try to decode the UTF8 + int[] eatLength = new int[1]; + char utf8 = eatUtf8(dec, eatLength); + if (eatLength[0] != len) return (c); + n[0] = len * 3; + return (utf8); + } + + /** Eats an HTML ampersand code from a String*/ + public static char eatAmpersand(String a, int[] n) { + n[0] = 0; + if (!a.startsWith("&")) return ((char) 0); + // Seek to ';' + // We also accept spaces and the end of the String as a delimiter + while (n[0] < a.length() && !Character.isSpaceChar(a.charAt(n[0])) && a.charAt(n[0]) != ';') + n[0]++; + if (n[0] <= 1) { + n[0] = -1; + return ((char) 0); + } + if (n[0] < a.length() && a.charAt(n[0]) == ';') { + a = a.substring(1, n[0]); + n[0]++; + } else { + a = 
a.substring(1, n[0]); + } + // Hexadecimal characters + if (a.startsWith("#x")) { + try { + return ((char) Integer.parseInt(a.substring(2), 16)); + } catch (Exception e) { + n[0] = -1; + return ((char) 0); + } + } + // Decimal characters + if (a.startsWith("#")) { + try { + return ((char) Integer.parseInt(a.substring(1))); + } catch (Exception e) { + n[0] = -1; + return ((char) 0); + } + } + // Others + if (ampersandMap.get(a) != null) return (ampersandMap.get(a)); + else if (ampersandMap.get(a.toLowerCase()) != null) return (ampersandMap.get(a.toLowerCase())); + n[0] = -1; + return ((char) 0); + } + + /** Tells from the first UTF-8 code character how long the code is. + * Returns -1 if the character is not an UTF-8 code start. + * Returns 1 if the character is ASCII<128*/ + public static int Utf8Length(char c) { + // 0xxx xxxx + if ((c & 0x80) == 0x00) return (1); + // 110x xxxx + if ((c & 0xE0) == 0xC0) return (2); + // 1110 xxxx + if ((c & 0xF0) == 0xE0) return (3); + // 1111 0xxx + if ((c & 0xF8) == 0xF0) return (4); + return (-1); + } + + /** Eats a UTF8 code from a String. There is also a built-in way in Java that converts + * UTF8 to characters and back, but it does not work with all characters. 
*/ + public static char eatUtf8(String a, int[] n) { + if (a.length() == 0) { + n[0] = 0; + return ((char) 0); + } + n[0] = Utf8Length(a.charAt(0)); + if (a.length() >= n[0]) { + switch (n[0]) { + case 1: + return (a.charAt(0)); + case 2: + if ((a.charAt(1) & 0xC0) != 0x80) break; + return ((char) (((a.charAt(0) & 0x1F) << 6) + (a.charAt(1) & 0x3F))); + case 3: + if ((a.charAt(1) & 0xC0) != 0x80 || (a.charAt(2) & 0xC0) != 0x80) break; + return ((char) (((a.charAt(0) & 0x0F) << 12) + ((a.charAt(1) & 0x3F) << 6) + ((a.charAt(2) & 0x3F)))); + case 4: + if ((a.charAt(1) & 0xC0) != 0x80 || (a.charAt(2) & 0xC0) != 0x80 || (a.charAt(3) & 0xC0) != 0x80) break; + return ((char) (((a.charAt(0) & 0x07) << 18) + ((a.charAt(1) & 0x3F) << 12) + ((a.charAt(2) & 0x3F) << 6) + ((a.charAt(3) & 0x3F)))); + } + } + n[0] = -1; + return ((char) 0); + } + + /** Decodes all UTF8 characters in the string*/ + public static String decodeUTF8(String s) { + StringBuilder result = new StringBuilder(); + int[] eatLength = new int[1]; + while (s.length() != 0) { + char c = eatUtf8(s, eatLength); + if (eatLength[0] != -1) { + result.append(c); + s = s.substring(eatLength[0]); + } else { + result.append(s.charAt(0)); + s = s.substring(1); + } + } + return (result.toString()); + } + + /** Decodes all percentage characters in the string*/ + public static String decodePercentage(String s) { + StringBuilder result = new StringBuilder(); + int[] eatLength = new int[1]; + while (s.length() != 0) { + char c = eatPercentage(s, eatLength); + if (eatLength[0] > 1) { + result.append(c); + s = s.substring(eatLength[0]); + } else { + result.append(s.charAt(0)); + s = s.substring(1); + } + } + return (result.toString()); + } + + /** Fabian: This method cannot decode numeric hexadecimal ampersand codes. What is its purpose? 
TODO*/ + public static String decodeAmpersand_UNKNOWN(String s) { + if (s == null) { + return null; + } + StringBuffer sb = new StringBuffer(s.length()); + while (s != null && s.length() != 0) { + int i = s.indexOf("&"); + if (i == -1) { + sb.append(s); + s = null; + } else { + boolean space = false; + boolean end = false; + sb.append(s.substring(0, i)); + s = s.substring(i); + int j1 = s.indexOf(";"); + int j2 = s.indexOf(" "); + int j = -1; + if (j1 == -1 || j2 == -1) { + if (j1 == -1 && j2 == -1) { + end = true; + j = s.length(); + } else if (j1 == -1) { + j = j2; + } else if (j2 == -1) { + j = j1; + } + } else if (j1 < j2) { + j = j1; + } else if (j1 > j2) { + j = j2; + space = true; + } + String a = s.substring(1, j); + if (ampersandMap.get(a) != null) { + sb.append(ampersandMap.get(a)); + if (space) { + sb.append(' '); + } + } else if (a.startsWith("#")) { + try { + sb.append(((char) Integer.parseInt(a.substring(1)))); + } catch (Exception e) { + sb.append(a); + } + if (space) { + sb.append(' '); + } + } else { + if (end) { + sb.append(s.substring(0, j)); + } else { + sb.append(s.substring(0, j + 1)); + } + } + if (end) { + s = s.substring(j); + } else { + s = s.substring(j + 1); + } + } + } + return sb.toString(); + } + + public static String decodeAmpersand(String s, PositionTracker posTracker) { + if (s == null) { + return null; + } + int pos = 0; + int difference; + StringBuffer sb = new StringBuffer(s.length()); + while (s != null && s.length() != 0) { + int i = s.indexOf("&"); + if (i == -1) { + sb.append(s); + s = null; + } else { + boolean space = false; + boolean end = false; + sb.append(s.substring(0, i)); + s = s.substring(i); + pos += i; + int j1 = s.indexOf(";"); + int j2 = s.indexOf(" "); + int j = -1; + if (j1 == -1 || j2 == -1) { + if (j1 == -1 && j2 == -1) { + end = true; + j = s.length(); + } else if (j1 == -1) { + j = j2; + } else if (j2 == -1) { + j = j1; + } + } else if (j1 < j2) { + j = j1; + } else if (j1 > j2) { + j = j2; + space = 
true; + } + pos += (j + 1); + String a = s.substring(1, j); + if (ampersandMap.get(a) != null) { + sb.append(ampersandMap.get(a)); + difference = 1 - (j + 1); + if (space) { + sb.append(' '); + difference++; + } + posTracker.addPositionChange(pos, difference); + } else { + if (end) { + sb.append(s.substring(0, j)); + } else { + sb.append(s.substring(0, j + 1)); + } + } + if (end) { + s = s.substring(j); + } else { + s = s.substring(j + 1); + } + } + } + posTracker.closeRun(); + return sb.toString(); + } + + /** Decodes all ampersand sequences in the string*/ + public static String decodeAmpersand(String s) { + StringBuilder result = new StringBuilder(); + int[] eatLength = new int[1];// add this in order to multithread safe + while (s.length() != 0) { + char c = eatAmpersand(s, eatLength); + if (eatLength[0] > 1) { + result.append(c); + s = s.substring(eatLength[0]); + } else { + result.append(s.charAt(0)); + s = s.substring(1); + } + } + return (result.toString()); + } + + /** Decodes all backslash characters in the string */ + public static String decodeBackslash(String s) { + StringBuilder result = new StringBuilder(); + int[] eatLength = new int[1]; + while (s.length() != 0) { + char c = eatBackslash(s, eatLength); + if (eatLength[0] > 1) { + result.append(c); + s = s.substring(eatLength[0]); + } else { + result.append(s.charAt(0)); + s = s.substring(1); + } + } + return (result.toString()); + } + + /** Eats a backslash sequence from a String */ + public static char eatBackslash(String a, int[] n) { + if (!a.startsWith("\\")) { + n[0] = 0; + return ((char) 0); + } + // Unicodes BS u XXXX + if (a.startsWith("\\u")) { + try { + n[0] = 6; + return ((char) Integer.parseInt(a.substring(2, 6), 16)); + } catch (Exception e) { + n[0] = -1; + return ((char) 0); + } + } + // Unicodes BS uu XXXX + if (a.startsWith("\\uu")) { + try { + n[0] = 7; + return ((char) Integer.parseInt(a.substring(3, 7), 16)); + } catch (Exception e) { + n[0] = -1; + return ((char) 0); + } + } + 
// Classical escape sequences + if (a.startsWith("\\b")) { + n[0] = 2; + return ((char) 8); + } + if (a.startsWith("\\t")) { + n[0] = 2; + return ((char) 9); + } + if (a.startsWith("\\n")) { + n[0] = 2; + return ((char) 10); + } + if (a.startsWith("\\f")) { + n[0] = 2; + return ((char) 12); + } + if (a.startsWith("\\r")) { + n[0] = 2; + return ((char) 13); + } + if (a.startsWith("\\\\")) { + n[0] = 2; + return ('\\'); + } + if (a.startsWith("\\\"")) { + n[0] = 2; + return ('"'); + } + if (a.startsWith("\\'")) { + n[0] = 2; + return ('\''); + } + // Octal codes + n[0] = 1; + while (n[0] < a.length() && a.charAt(n[0]) >= '0' && a.charAt(n[0]) <= '8') + n[0]++; + if (n[0] == 1) { + n[0] = 0; + return ((char) 0); + } + try { + return ((char) Integer.parseInt(a.substring(1, n[0]), 8)); + } catch (Exception e) { + } + n[0] = -1; + return ((char) 0); + } + + /** Replaces all codes in a String by the 16 bit Unicode characters */ + public static String decode(String s) { + StringBuilder b = new StringBuilder(); + int[] eatLength = new int[1]; + while (s.length() > 0) { + char c = eatPercentage(s, eatLength); + if (eatLength[0] <= 0) { + c = eatAmpersand(s, eatLength); + if (eatLength[0] <= 0) { + c = eatBackslash(s, eatLength); + if (eatLength[0] <= 0) { + c = eatUtf8(s, eatLength); + if (eatLength[0] <= 0) { + c = s.charAt(0); + eatLength[0] = 1; + } + } + } + } + b.append(c); + s = s.substring(eatLength[0]); + } + return (b.toString()); + } + + /** Encodes a character to UTF8 (if necessary)*/ + public static String encodeUTF8(int c) { + if (c <= 0x7F) return ("" + (char) c); + if (c <= 0x7FF) return ("" + (char) (0xC0 + ((c >> 6) & 0x1F)) + (char) (0x80 + (c & 0x3F))); + if (c <= 0xFFFF) return ("" + (char) (0xE0 + ((c >> 12) & 0x0F)) + (char) (0x80 + ((c >> 6) & 0x3F)) + (char) (0x80 + (c & 0x3F))); + return ("" + c); + } + + /** Encodes a character to a backslash code (if necessary)*/ + public static String encodeBackslash(char c) { + if (isAlphanumeric(c) || c == ' ') 
return ("" + c); + String hex = Integer.toHexString(c); + while (hex.length() < 4) + hex = "0" + hex; + return ("\\u" + hex); + } + + /** Encodes a character to a backslash code (if not alphanumeric)*/ + public static String encodeBackslashToAlphanumeric(char c) { + if (isAlphanumeric(c) || c == '_') return ("" + c); + String hex = Integer.toHexString(c); + while (hex.length() < 4) + hex = "0" + hex; + return ("\\u" + hex); + } + + /** Encodes a character to a backslash code (if not ASCII)*/ + public static String encodeBackslashToASCII(char c) { + if (c >= 32 && c < 128 && c != '\\' && c != '"') return ("" + c); + String hex = Integer.toHexString(c); + while (hex.length() < 4) + hex = "0" + hex; + return ("\\u" + hex); + } + + /** Encodes a character to an HTML-Ampersand code (if necessary)*/ + public static String encodeAmpersand(char c) { + String s; + if (null != (s = charToAmpersand.get(c))) return (s); + if (c < 128 && c >= 32) return ("" + c); + else return ("&#" + ((int) c) + ";"); + } + + /** Encodes a character to an HTML-Ampersand code (if necessary)*/ + public static String encodeAmpersandToAlphanumeric(char c) { + if (isAlphanumeric(c) || c == '_') return ("" + c); + return ("&#" + ((int) c) + ";"); + } + + /** Encodes a character to an Percentage code (if necessary). + * If the character is greater than 0x80, the character is converted to + * a UTF8-sequence and this sequence is encoded as percentage codes. */ + public static String encodePercentage(char c) { + if (isAlphanumeric(c)) return ("" + c); + if (c < 16) return ("%0" + Integer.toHexString(c).toUpperCase()); + if (c < 128) return ("%" + Integer.toHexString(c).toUpperCase()); + String s = encodeUTF8(c); + String result = ""; + for (int i = 0; i < s.length(); i++) { + result += "%" + Integer.toHexString(s.charAt(i)).toUpperCase(); + } + return (result); + } + + /** + * Encodes a String with reserved XML characters into a valid xml string for attributes. 
+ * @param str + * @return + */ + public static String encodeXmlAttribute(String str) { + if (str == null) return null; + int len = str.length(); + if (len == 0) return str; + StringBuffer encoded = new StringBuffer(); + for (int i = 0; i < len; i++) { + char c = str.charAt(i); + if (c == '<') encoded.append("<"); + else if (c == '\"') encoded.append("""); + else if (c == '>') encoded.append(">"); + else if (c == '\'') encoded.append("'"); + else if (c == '&') encoded.append("&"); + else encoded.append(c); + } + return encoded.toString(); + } + + /** Tells whether a char is in a range*/ + public static boolean in(char c, char a, char b) { + return (c >= a && c <= b); + } + + /** Tells whether a char is in a string*/ + public static boolean in(char c, String s) { + return (s.indexOf(c) != -1); + } + + /** Tells whether a char is alphanumeric in the sense of URIs*/ + public static boolean isAlphanumeric(char c) { + return (in(c, 'a', 'z') || in(c, 'A', 'Z') || in(c, '0', '9')); + } + + /** Tells whether a char is reserved in the sense of URIs*/ + public static boolean isReserved(char c) { + return (isSubDelim(c) || isGenDelim(c)); + } + + /** Tells whether a char is unreserved in the sense of URIs (not the same as !reserved)*/ + public static boolean isUnreserved(char c) { + return (isAlphanumeric(c) || in(c, "-._~")); + } + + /** Tells whether a string is escaped in the sense of URIs*/ + public static boolean isEscaped(String s) { + return (s.matches("%[0-9A-Fa-f]{2}")); + } + + /** Tells whether a char is a sub-delimiter in the sense of URIs*/ + public static boolean isSubDelim(char c) { + return (in(c, "!$&'()*+,=")); + } + + /** Tells whether a char is a general delimiter in the sense of URIs*/ + public static boolean isGenDelim(char c) { + return (in(c, ":/?#[]@")); + } + + /** Tells whether a char is a valid path component in the sense of URIs*/ + public static boolean isPchar(char c) { + return (isUnreserved(c) || isSubDelim(c) || in(c, "@")); + } + + /** 
Encodes a char to percentage code, if it is not a path character in the sense of URIs*/ + public static String encodeURIPathComponent(char c) { + if (isPchar(c)) return ("" + c); + else return (Char.encodePercentage(c)); + } + + /** Encodes a char to percentage code, if it is not a path character in the sense of URIs*/ + public static String encodeURIPathComponent(String s) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + result.append(Char.encodeURIPathComponent(s.charAt(i))); + } + return (result.toString()); + } + + /** Encodes a char to percentage code, if it is not a path character in the sense of XMLs*/ + public static String encodeURIPathComponentXML(String s) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + if (s.charAt(i) == '&') result.append(Char.encodePercentage(s.charAt(i))); + else if (s.charAt(i) == '"') result.append(Char.encodePercentage(s.charAt(i))); + else result.append(Char.encodeURIPathComponent(s.charAt(i))); + } + return (result.toString()); + } + + /** Decodes a URI path component*/ + public static String decodeURIPathComponent(String s) { + return (Char.decodePercentage(s)); + } + + /** Encodes a String to UTF8 */ + public static String encodeUTF8(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + r.append(encodeUTF8(c.charAt(i))); + } + return (r.toString()); + } + + /** Replaces non-normal characters in a String by Backslash codes */ + public static String encodeBackslash(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + r.append(encodeBackslash(c.charAt(i))); + } + return (r.toString()); + } + + /** Replaces non-normal characters in a String by Backslash codes (if not alphanumeric)*/ + public static String encodeBackslashToAlphanumeric(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + 
r.append(encodeBackslashToAlphanumeric(c.charAt(i))); + } + return (r.toString()); + } + + /** Replaces non-normal characters in a String by Backslash codes (if not ASCII)*/ + public static String encodeBackslashToASCII(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + r.append(encodeBackslashToASCII(c.charAt(i))); + } + return (r.toString()); + } + + /** Replaces non-normal characters in a String by HTML Ampersand codes */ + public static String encodeAmpersand(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + r.append(encodeAmpersand(c.charAt(i))); + } + return (r.toString()); + } + + /** Replaces non-normal characters in a String by HTML Ampersand codes */ + public static String encodeAmpersandToAlphanumeric(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + r.append(encodeAmpersandToAlphanumeric(c.charAt(i))); + } + return (r.toString()); + } + + /** Replaces non-normal characters in a String by Percentage codes. + * If a character is greater than 0x80, the character is converted to + * a UTF8-sequence and this sequence is encoded as percentage codes. */ + public static String encodePercentage(String c) { + StringBuilder r = new StringBuilder(); + for (int i = 0; i < c.length(); i++) { + r.append(encodePercentage(c.charAt(i))); + } + return (r.toString()); + } + + /** Decodes all codes in a String and normalizes all chars */ + public static String decodeAndNormalize(String s) { + return (normalize(decode(s))); + } + + /** Normalizes all chars in a String to characters 0x20-0x7F */ + public static String normalize(String s) { + StringBuilder b = new StringBuilder(); + for (int i = 0; i < s.length(); i++) + b.append(normalize(s.charAt(i))); + return (b.toString()); + } + + /** Returns the last character of a String or 0*/ + public static char last(CharSequence s) { + return (s.length() == 0 ? 
(char) 0 : s.charAt(s.length() - 1)); + } + + /** Returns the String without the last character */ + public static String cutLast(String s) { + return (s.length() == 0 ? "" : s.substring(0, s.length() - 1)); + } + + /** Cuts the last character */ + public static StringBuilder cutLast(StringBuilder s) { + s.setLength(s.length() - 1); + return (s); + } + + /** Returns an HTML-String of the String */ + public static String toHTML(String s) { + return (Char.encodeAmpersand(s).replace(" ", "
")); + } + + /** Returns the chars of a String in hex */ + public static String hexAll(String s) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + result.append(Integer.toHexString(s.charAt(i)).toUpperCase()).append(' '); + } + return (result.toString()); + } + + /** Replaces special characters in the string by hex codes (cannot be undone)*/ + public static String encodeHex(String s) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (isAlphanumeric(c)) result.append(c); + else result.append(Integer.toHexString(s.charAt(i)).toUpperCase()); + } + return (result.toString()); + } + + /** Upcases the first character in a String*/ + public static String upCaseFirst(String s) { + if (s == null || s.length() == 0) return (s); + return (Character.toUpperCase(s.charAt(0)) + s.substring(1)); + } + + /** Lowcases the first character in a String*/ + public static String lowCaseFirst(String s) { + if (s == null || s.length() == 0) return (s); + return (Character.toLowerCase(s.charAt(0)) + s.substring(1)); + } + + /** Returns a string of the given length, fills with spaces if necessary */ + public static CharSequence truncate(CharSequence s, int len) { + if (s.length() == len) return (s); + if (s.length() > len) return (s.subSequence(0, len)); + StringBuilder result = new StringBuilder(s); + while (result.length() < len) + result.append(' '); + return (result); + } + + /** Capitalizes words and lowercases the rest*/ + public static String capitalize(String s) { + StringBuilder result = new StringBuilder(); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (i == 0 || i > 0 && !Character.isLetterOrDigit(s.charAt(i - 1))) c = Character.toUpperCase(c); + else c = Character.toLowerCase(c); + result.append(c); + } + return (result.toString()); + } + + /** TRUE if the Charsequence ends with the string */ + public static boolean endsWith(CharSequence s, String 
end) { + return (s.length() >= end.length() && s.subSequence(s.length() - end.length(), s.length()).equals(end)); + } + + /** Test routine */ + public static void main(String argv[]) throws Exception { + System.out.println("Enter a string with HTML ampersand codes, umlauts and/or UTF-8 codes and hit ENTER."); + System.out.println("Press CTRL+C to abort"); + BufferedReader in = new BufferedReader(new InputStreamReader(System.in)); + while (true) { + String s = in.readLine(); + System.out.println("Decoded: " + (s = decode(s))); + System.out.println("Normalized: " + normalize(s)); + System.out.println("As UTF8: " + encodeUTF8(s)); + System.out.println("As percentage: " + encodePercentage(s)); + System.out.println("As backslash: " + encodeBackslash(s)); + System.out.println("As ampersand: " + encodeAmpersand(s)); + System.out.println("As URI component: " + encodeURIPathComponent(s)); + } + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/NounGroup.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/NounGroup.java new file mode 100644 index 0000000..1596ec8 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/NounGroup.java @@ -0,0 +1,263 @@ +package org.yago.javatools.parsers; +import java.util.Arrays; +import java.util.List; +import java.util.Set; + +import org.yago.javatools.administrative.D; +import org.yago.javatools.datatypes.FinalSet; + +/** +This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). +It is licensed under the Creative Commons Attribution License +(see http://creativecommons.org/licenses/by/3.0) by +the YAGO-NAGA team (see http://mpii.de/yago-naga). + + + + + +The class NounGroup splits a noun group (given by a String) into its +modifiers and its head.
+Example: +
+     System.out.println(new NounGroup("the United States of America").description());
+     ->
+      NounGroup:
+        Original: the_United_States_of_America
+        Determiner: the
+        Head: State
+        Plural: true
+        preModifiers: United
+        Adjective: 
+        Preposition: of
+        postModifier:
+          NounGroup:
+            Original: America
+            Determiner:
+            Head: America
+            Plural: false
+            preModifiers:
+            Preposition:
+            postModifier:
+
+*/ +public class NounGroup { + + /** Defines just one function from a String to a boolean */ + public interface String2Boolean { + /** Function from a String to a boolean */ + boolean apply(String s); + } + + /** Tells whether a word is an adjective (currently by a simple heuristics */ + public static String2Boolean isAdjective=new String2Boolean() { + public boolean apply(String s) { + return(s.length()>0 && Character.isLowerCase(s.charAt(0)) && + (s.endsWith("al") || s.endsWith("ed") || s.endsWith("ing"))); + } + }; + + /** Contains determiners*/ + public static final Set determiners=new FinalSet( + "the", + "a", + "an", + "this", + "these", + "those" + ); + + /** Holds prepositions (like "of" etc.) */ + public static final FinalSet prepositions=new FinalSet( + ",", + "at", + "about", + "and", + "by", + "for", + "from", + "in", + "of", + "on", + "to", + "with", + "who", + "-", + "\u2248", + "under" + ); + + /** Holds the original noun group */ + protected String original; + + /** Holds the adjective */ + protected String adjective; + + /** Holds the preposition */ + protected String preposition; + + /** Holds the noun group after the preposition */ + protected NounGroup postModifier; + + /** Holds the head of the noun group */ + protected String head; + + /** Holds the modifiers before the head */ + protected String preModifier; + + /** Holds the determiner (if any) */ + protected String determiner; + + /** Returns the adjective. */ + public String adjective() { + return adjective; + } + + /**Returns the determiner. */ + public String determiner() { + return determiner; + } + + /** Returns the head (lowercased singular). */ + public String head() { + return head; + } + + /**Returns the original. */ + public String original() { + return original; + } + + /** Returns the postModifier. */ + public NounGroup postModifier() { + return postModifier; + } + + /** Returns the preModifier. 
*/ + public String preModifier() { + return preModifier; + } + + /** Returns the preposition.*/ + public String preposition() { + return preposition; + } + + /** Returns the full name with the head word stemmed */ + public String stemmed() { + StringBuilder full=new StringBuilder(); + if(preModifier!=null) full.append(preModifier).append(' '); + full.append(PlingStemmer.stem(head.toLowerCase())); + if(adjective!=null) full.append(' ').append(adjective); + if(preposition!=null) full.append(' ').append(preposition); + if(postModifier!=null) full.append(' ').append(postModifier.original()); + return(full.toString()); + } + + /** Stems the head. TRUE if this had any effect */ + public boolean stemHead() { + String stemmed=PlingStemmer.stem(head); + boolean result=!stemmed.equals(head); + head=stemmed; + return(result); + } + /** Constructs a noun group from a String */ + public NounGroup(String s) { + this(Arrays.asList(s.split("[\\s_]+"))); + } + + /** Constructs a noun group from a list of words */ + public NounGroup(List words) { + // Assemble the original + original=words.toString().replace(", ", " "); + original=original.substring(1,original.length()-1); + + // Cut away preceding determiners + if(words.size()>0 && determiners.contains(words.get(0).toLowerCase())) { + determiner=words.get(0).toLowerCase(); + words=words.subList(1, words.size()); + } + + // Locate prepositions (but not in first or last position) + int prepPos; + for(prepPos=1;prepPos1 && isAdjective.apply(words.get(prepPos-1))) { + adjective=words.get(prepPos-1); + words=words.subList(0, prepPos-1); + } else { + words=words.subList(0, prepPos); + } + } + + if(words.size()==0) return; + + head=words.get(words.size()-1); + if(words.size()>1) { + preModifier=words.subList(0, words.size()-1).toString().replace(", ", "_"); + preModifier=preModifier.substring(1, preModifier.length()-1); + } + } + + + /** Checks if the originals match */ + public boolean equals(Object o) { + return(o instanceof NounGroup 
&& ((NounGroup)o).original.equals(original)); + } + + /** Returns the original */ + public String toString() { + return(original); + } + + /** Returns all fields in a String */ + public String description() { + return("NounGroup:\n"+ + " Original: "+original+"\n"+ + " Stemmed: "+stemmed()+"\n"+ + " Determiner: "+determiner+"\n"+ + " preModifiers: "+preModifier+"\n"+ + " Head: "+head+"\n"+ + " Adjective: "+adjective+"\n"+ + " Preposition: "+preposition+"\n"+ + " postModifier: \n"+(postModifier==null?"":postModifier.description())); + } + + /** Test method */ + public static void main(String[] args) throws Exception { + D.p("Enter a noun group and press ENTER. Press CTRL+C to abort"); +// while(true) { +// D.p(new NounGroup(D.r()).description()); +// } + + D.p(new NounGroup("Star_Trek_characters").description()); + } + +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PlingStemmer.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PlingStemmer.java new file mode 100644 index 0000000..277efc0 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PlingStemmer.java @@ -0,0 +1,923 @@ +package org.yago.javatools.parsers; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.Map; +import java.util.Set; + +import org.yago.javatools.datatypes.FinalMap; +import org.yago.javatools.datatypes.FinalSet; + +/** +This class is part of the Java Tools (see http://mpii.de/yago-naga/javatools). +It is licensed under the Creative Commons Attribution License +(see http://creativecommons.org/licenses/by/3.0) by +the YAGO-NAGA team (see http://mpii.de/yago-naga). + + + + + + The PlingStemmer stems an English noun (plural or singular) to its singular + form. It deals with "firemen"->"fireman", it knows Greek stuff like + "appendices"->"appendix" and yes, it was a lot of work to compile these exceptions. + Examples: +
+      System.out.println(PlingStemmer.stem("boy"));
+      ----> boy
+      System.out.println(PlingStemmer.stem("boys"));
+      ----> boy
+      System.out.println(PlingStemmer.stem("biophysics"));
+      ---->  biophysics
+      System.out.println(PlingStemmer.stem("automata"));
+      ----> automaton
+      System.out.println(PlingStemmer.stem("genus"));
+      ----> genus
+      System.out.println(PlingStemmer.stem("emus"));
+      ----> emu
+  

+ + There are a number of word forms that can either be plural or singular. + Examples include "physics" (the science or the plural of "physic" (the + medicine)), "quarters" (the housing or the plural of "quarter" (1/4)) + or "people" (the singular of "peoples" or the plural of "person"). In + these cases, the stemmer assumes the word is a plural form and returns + the singular form. The methods isPlural, isSingular and isPluralAndSingular + can be used to differentiate the cases.

+ + It cannot be guaranteed that the stemmer correctly stems a plural word + or correctly ignores a singular word -- let alone that it treats an + ambiguous word form in the way expected by the user.

+ + The PlingStemmer uses material from WordNet.

+ It requires the class FinalSet from the + Java Tools. +*/ +public class PlingStemmer { + + /** Tells whether a word form is plural. This method just checks whether the + * stem method alters the word */ + public static boolean isPlural(String s) { + return(!s.equals(stem(s))); + } + + /** Tells whether a word form is singular. Note that a word can be both plural and singular */ + public static boolean isSingular(String s) { + return(singAndPlur.contains(s.toLowerCase()) || !isPlural(s)); + } + + /** Tells whether a word form is the singular form of one word and at + * the same time the plural form of another.*/ + public static boolean isSingularAndPlural(String s) { + return(singAndPlur.contains(s.toLowerCase())); + } + + /** Cuts a suffix from a string (that is the number of chars given by the suffix) */ + public static String cut(String s, String suffix) { + return(s.substring(0,s.length()-suffix.length())); + } + + /** Returns true if a word is probably not Latin */ + public static boolean noLatin(String s) { + return(s.indexOf('h')>0 || s.indexOf('j')>0 || s.indexOf('k')>0 || + s.indexOf('w')>0 || s.indexOf('y')>0 || s.indexOf('z')>0 || + s.indexOf("ou")>0 || s.indexOf("sh")>0 || s.indexOf("ch")>0 || + s.endsWith("aus")); + } + + /** Returns true if a word is probably Greek */ + private static boolean greek(String s) { + return(s.indexOf("ph")>0 || s.indexOf('y')>0 && s.endsWith("nges")); + } + + /** Stems an English noun */ + public static String stem(String s) { + String stem = s; + + // Handle irregular ones + String irreg=irregular.get(s); + if(irreg!=null) return(stem=irreg); + + // -on to -a + if(categoryON_A.contains(s)) return(stem=cut(s,"a")+"on"); + + // -um to -a + if(categoryUM_A.contains(s)) return(stem=cut(s,"a")+"um"); + + // -x to -ices + if(categoryIX_ICES.contains(s)) return(stem=cut(s,"ices")+"ix"); + + // -o to -i + if(categoryO_I.contains(s)) return(stem=cut(s,"i")+"o"); + + // -se to ses + if(categorySE_SES.contains(s)) 
return(stem=cut(s,"s")); + + // -is to -es + if(categoryIS_ES.contains(s) || s.endsWith("theses")) return(stem=cut(s,"es")+"is"); + + // -us to -i + if(categoryUS_I.contains(s)) return(stem=cut(s,"i")+"us"); + //Wrong plural + if(s.endsWith("uses") && (categoryUS_I.contains(cut(s,"uses")+"i") || + s.equals("genuses") || s.equals("corpuses"))) return(stem=cut(s,"es")); + + // -ex to -ices + if(categoryEX_ICES.contains(s)) return(stem=cut(s,"ices")+"ex"); + + // Words that do not inflect in the plural + if(s.endsWith("ois") || s.endsWith("itis") || category00.contains(s) || categoryICS.contains(s)) return(stem=s); + + // -en to -ina + // No other common words end in -ina + if(s.endsWith("ina")) return(stem=cut(s,"en")); + + // -a to -ae + // No other common words end in -ae + if(s.endsWith("ae")) return(stem=cut(s,"e")); + + // -a to -ata + // No other common words end in -ata + if(s.endsWith("ata")) return(stem=cut(s,"ta")); + + // trix to -trices + // No common word ends with -trice(s) + if(s.endsWith("trices")) return(stem=cut(s,"trices")+"trix"); + + // -us to -us + //No other common word ends in -us, except for false plurals of French words + //Catch words that are not latin or known to end in -u + if(s.endsWith("us") && !s.endsWith("eaus") && !s.endsWith("ieus") && !noLatin(s) + && !categoryU_US.contains(s)) return(stem=s); + + // -tooth to -teeth + // -goose to -geese + // -foot to -feet + // -zoon to -zoa + //No other common words end with the indicated suffixes + if(s.endsWith("teeth")) return(stem=cut(s,"teeth")+"tooth"); + if(s.endsWith("geese")) return(stem=cut(s,"geese")+"goose"); + if(s.endsWith("feet")) return(stem=cut(s,"feet")+"foot"); + if(s.endsWith("zoa")) return(stem=cut(s,"zoa")+"zoon"); + + // -eau to -eaux + //No other common words end in eaux + if(s.endsWith("eaux")) return(stem=cut(s,"x")); + + // -ieu to -ieux + //No other common words end in ieux + if(s.endsWith("ieux")) return(stem=cut(s,"x")); + + // -nx to -nges + // Pay attention not 
to kill words ending in -nge with plural -nges + // Take only Greek words (works fine, only a handfull of exceptions) + if(s.endsWith("nges") && greek(s)) return(stem=cut(s,"nges")+"nx"); + + // -[sc]h to -[sc]hes + //No other common word ends with "shes", "ches" or "she(s)" + //Quite a lot end with "che(s)", filter them out + if(s.endsWith("shes") || s.endsWith("ches") && !categoryCHE_CHES.contains(s)) return(stem=cut(s,"es")); + + // -ss to -sses + // No other common singular word ends with "sses" + // Filter out those ending in "sse(s)" + if(s.endsWith("sses") && !categorySSE_SSES.contains(s) && !s.endsWith("mousses")) return(stem=cut(s,"es")); + + // -x to -xes + // No other common word ends with "xe(s)" except for "axe" + if(s.endsWith("xes") && !s.equals("axes")) return(stem=cut(s,"es")); + + // -[nlw]ife to -[nlw]ives + //No other common word ends with "[nlw]ive(s)" except for olive + if(s.endsWith("nives") || s.endsWith("lives") && !s.endsWith("olives") || + s.endsWith("wives")) return(stem=cut(s,"ves")+"fe"); + + // -[aeo]lf to -ves exceptions: valve, solve + // -[^d]eaf to -ves exceptions: heave, weave + // -arf to -ves no exception + if(s.endsWith("alves") && !s.endsWith("valves") || + s.endsWith("olves") && !s.endsWith("solves") || + s.endsWith("eaves") && !s.endsWith("heaves") && !s.endsWith("weaves") || + s.endsWith("arves") ) return(stem=cut(s,"ves")+"f"); + + // -y to -ies + // -ies is very uncommon as a singular suffix + // but -ie is quite common, filter them out + if(s.endsWith("ies") && !categoryIE_IES.contains(s)) return(stem=cut(s,"ies")+"y"); + + // -o to -oes + // Some words end with -oe, so don't kill the "e" + if(s.endsWith("oes") && !categoryOE_OES.contains(s)) return(stem=cut(s,"es")); + + // -s to -ses + // -z to -zes + // no words end with "-ses" or "-zes" in singular + if(s.endsWith("ses") || s.endsWith("zes") ) return(stem=cut(s,"es")); + + // - to -s + if(s.endsWith("s") && !s.endsWith("ss") && !s.endsWith("is")) 
return(stem=cut(s,"s")); + + return stem; + } + + /** Words that end in "-se" in their plural forms (like "nurse" etc.)*/ + public static Set categorySE_SES=new FinalSet( + "nurses", + "cruises", + "premises", + "houses" + ); + + /** Words that do not have a distinct plural form (like "atlas" etc.)*/ + public static Set category00=new FinalSet( + "alias", + "asbestos", + "atlas", + "barracks", + "bathos", + "bias", + "breeches", + "britches", + "canvas", + "chaos", + "clippers", + "contretemps", + "corps", + "cosmos", + "crossroads", + "diabetes", + "ethos", + "gallows", + "gas", + "graffiti", + "headquarters", + "herpes", + "high-jinks", + "innings", + "jackanapes", + "lens", + "means", + "measles", + "mews", + "mumps", + "news", + "pathos", + "pincers", + "pliers", + "proceedings", + "rabies", + "rhinoceros", + "sassafras", + "scissors", + "series", + "shears", + "species", + "tuna" + ); + + /** Words that change from "-um" to "-a" (like "curriculum" etc.), listed in their plural forms*/ + public static Set categoryUM_A=new FinalSet( + "addenda", + "agenda", + "aquaria", + "bacteria", + "candelabra", + "compendia", + "consortia", + "crania", + "curricula", + "data", + "desiderata", + "dicta", + "emporia", + "enconia", + "errata", + "extrema", + "gymnasia", + "honoraria", + "interregna", + "lustra", + "maxima", + "media", + "memoranda", + "millenia", + "minima", + "momenta", + "optima", + "ova", + "phyla", + "quanta", + "rostra", + "spectra", + "specula", + "stadia", + "strata", + "symposia", + "trapezia", + "ultimata", + "vacua", + "vela" + ); + + /** Words that change from "-on" to "-a" (like "phenomenon" etc.), listed in their plural forms*/ + public static Set categoryON_A=new FinalSet( + "aphelia", + "asyndeta", + "automata", + "criteria", + "hyperbata", + "noumena", + "organa", + "perihelia", + "phenomena", + "prolegomena" + ); + + /** Words that change from "-o" to "-i" (like "libretto" etc.), listed in their plural forms*/ + public static Set 
categoryO_I=new FinalSet( + "alti", + "bassi", + "canti", + "contralti", + "crescendi", + "libretti", + "soli", + "soprani", + "tempi", + "virtuosi" + ); + + /** Words that change from "-us" to "-i" (like "fungus" etc.), listed in their plural forms*/ + public static Set categoryUS_I=new FinalSet( + "alumni", + "bacilli", + "cacti", + "foci", + "fungi", + "genii", + "hippopotami", + "incubi", + "nimbi", + "nuclei", + "nucleoli", + "octopi", + "radii", + "stimuli", + "styli", + "succubi", + "syllabi", + "termini", + "tori", + "umbilici", + "uteri" + ); + + /** Words that change from "-ix" to "-ices" (like "appendix" etc.), listed in their plural forms*/ + public static Set categoryIX_ICES=new FinalSet( + "appendices", + "cervices" + ); + + /** Words that change from "-is" to "-es" (like "axis" etc.), listed in their plural forms*/ + public static Set categoryIS_ES=new FinalSet( + // plus everybody ending in theses + "analyses", + "axes", + "bases", + "crises", + "diagnoses", + "ellipses", + "emphases", + "neuroses", + "oases", + "paralyses", + "synopses" + ); + + /** Words that change from "-oe" to "-oes" (like "toe" etc.), listed in their plural forms*/ + public static Set categoryOE_OES=new FinalSet( + "aloes", + "backhoes", + "beroes", + "canoes", + "chigoes", + "cohoes", + "does", + "felloes", + "floes", + "foes", + "gumshoes", + "hammertoes", + "hoes", + "hoopoes", + "horseshoes", + "leucothoes", + "mahoes", + "mistletoes", + "oboes", + "overshoes", + "pahoehoes", + "pekoes", + "roes", + "shoes", + "sloes", + "snowshoes", + "throes", + "tic-tac-toes", + "tick-tack-toes", + "ticktacktoes", + "tiptoes", + "tit-tat-toes", + "toes", + "toetoes", + "tuckahoes", + "woes" + ); + + /** Words that change from "-ex" to "-ices" (like "index" etc.), listed in their plural forms*/ + public static Set categoryEX_ICES=new FinalSet( + "apices", + "codices", + "cortices", + "indices", + "latices", + "murices", + "pontifices", + "silices", + "simplices", + "vertices", + 
"vortices" + ); + + /** Words that change from "-u" to "-us" (like "emu" etc.), listed in their plural forms*/ + public static Set categoryU_US=new FinalSet( + "apercus", + "barbus", + "cornus", + "ecrus", + "emus", + "fondus", + "gnus", + "iglus", + "mus", + "nandus", + "napus", + "poilus", + "quipus", + "snafus", + "tabus", + "tamandus", + "tatus", + "timucus", + "tiramisus", + "tofus", + "tutus" + ); + + /** Words that change from "-sse" to "-sses" (like "finesse" etc.), listed in their plural forms*/ + public static Set categorySSE_SSES=new FinalSet( + //plus those ending in mousse + "bouillabaisses", + "coulisses", + "crevasses", + "crosses", + "cuisses", + "demitasses", + "ecrevisses", + "fesses", + "finesses", + "fosses", + "impasses", + "lacrosses", + "largesses", + "masses", + "noblesses", + "palliasses", + "pelisses", + "politesses", + "posses", + "tasses", + "wrasses" + ); + + /** Words that change from "-che" to "-ches" (like "brioche" etc.), listed in their plural forms*/ + public static Set categoryCHE_CHES=new FinalSet( + "adrenarches", + "attaches", + "avalanches", + "barouches", + "brioches", + "caches", + "caleches", + "caroches", + "cartouches", + "cliches", + "cloches", + "creches", + "demarches", + "douches", + "gouaches", + "guilloches", + "headaches", + "heartaches", + "huaraches", + "menarches", + "microfiches", + "moustaches", + "mustaches", + "niches", + "panaches", + "panoches", + "pastiches", + "penuches", + "pinches", + "postiches", + "psyches", + "quiches", + "schottisches", + "seiches", + "soutaches", + "synecdoches", + "thelarches", + "troches" + ); + + /** Words that end with "-ics" and do not exist as nouns without the 's' (like "aerobics" etc.)*/ + public static Set categoryICS=new FinalSet( + "aerobatics", + "aerobics", + "aerodynamics", + "aeromechanics", + "aeronautics", + "alphanumerics", + "animatronics", + "apologetics", + "architectonics", + "astrodynamics", + "astronautics", + "astrophysics", + "athletics", + 
"atmospherics", + "autogenics", + "avionics", + "ballistics", + "bibliotics", + "bioethics", + "biometrics", + "bionics", + "bionomics", + "biophysics", + "biosystematics", + "cacogenics", + "calisthenics", + "callisthenics", + "catoptrics", + "civics", + "cladistics", + "cryogenics", + "cryonics", + "cryptanalytics", + "cybernetics", + "cytoarchitectonics", + "cytogenetics", + "diagnostics", + "dietetics", + "dramatics", + "dysgenics", + "econometrics", + "economics", + "electromagnetics", + "electronics", + "electrostatics", + "endodontics", + "enterics", + "ergonomics", + "eugenics", + "eurhythmics", + "eurythmics", + "exodontics", + "fibreoptics", + "futuristics", + "genetics", + "genomics", + "geographics", + "geophysics", + "geopolitics", + "geriatrics", + "glyptics", + "graphics", + "gymnastics", + "hermeneutics", + "histrionics", + "homiletics", + "hydraulics", + "hydrodynamics", + "hydrokinetics", + "hydroponics", + "hydrostatics", + "hygienics", + "informatics", + "kinematics", + "kinesthetics", + "kinetics", + "lexicostatistics", + "linguistics", + "lithoglyptics", + "liturgics", + "logistics", + "macrobiotics", + "macroeconomics", + "magnetics", + "magnetohydrodynamics", + "mathematics", + "metamathematics", + "metaphysics", + "microeconomics", + "microelectronics", + "mnemonics", + "morphophonemics", + "neuroethics", + "neurolinguistics", + "nucleonics", + "numismatics", + "obstetrics", + "onomastics", + "orthodontics", + "orthopaedics", + "orthopedics", + "orthoptics", + "paediatrics", + "patristics", + "patristics", + "pedagogics", + "pediatrics", + "periodontics", + "pharmaceutics", + "pharmacogenetics", + "pharmacokinetics", + "phonemics", + "phonetics", + "phonics", + "photomechanics", + "physiatrics", + "pneumatics", + "poetics", + "politics", + "pragmatics", + "prosthetics", + "prosthodontics", + "proteomics", + "proxemics", + "psycholinguistics", + "psychometrics", + "psychonomics", + "psychophysics", + "psychotherapeutics", + "robotics", + 
"semantics", + "semiotics", + "semitropics", + "sociolinguistics", + "stemmatics", + "strategics", + "subtropics", + "systematics", + "tectonics", + "telerobotics", + "therapeutics", + "thermionics", + "thermodynamics", + "thermostatics" + ); + + /** Words that change from "-ie" to "-ies" (like "auntie" etc.), listed in their plural forms*/ + public static Set categoryIE_IES=new FinalSet( + "aeries", + "anomies", + "aunties", + "baddies", + "beanies", + "birdies", + "boccies", + "bogies", + "bolshies", + "bombies", + "bonhomies", + "bonxies", + "booboisies", + "boogies", + "boogie-woogies", + "bookies", + "booties", + "bosies", + "bourgeoisies", + "brasseries", + "brassies", + "brownies", + "budgies", + "byrnies", + "caddies", + "calories", + "camaraderies", + "capercaillies", + "capercailzies", + "cassies", + "catties", + "causeries", + "charcuteries", + "chinoiseries", + "collies", + "commies", + "cookies", + "coolies", + "coonties", + "cooties", + "corries", + "coteries", + "cowpies", + "cowries", + "cozies", + "crappies", + "crossties", + "curies", + "dachsies", + "darkies", + "dassies", + "dearies", + "dickies", + "dies", + "dixies", + "doggies", + "dogies", + "dominies", + "dovekies", + "eyries", + "faeries", + "falsies", + "floozies", + "folies", + "foodies", + "freebies", + "gaucheries", + "gendarmeries", + "genies", + "ghillies", + "gillies", + "goalies", + "goonies", + "grannies", + "grotesqueries", + "groupies", + "hankies", + "hippies", + "hoagies", + "honkies", + "hymies", + "indies", + "junkies", + "kelpies", + "kilocalories", + "knobkerries", + "koppies", + "kylies", + "laddies", + "lassies", + "lies", + "lingeries", + "magpies", + "magpies", + "marqueteries", + "mashies", + "mealies", + "meanies", + "menageries", + "millicuries", + "mollies", + "facts1", + "moxies", + "neckties", + "newbies", + "nighties", + "nookies", + "oldies", + "organdies", + "panties", + "parqueteries", + "passementeries", + "patisseries", + "pies", + "pinkies", + "pixies", + 
"porkpies", + "potpies", + "prairies", + "preemies", + "premies", + "punkies", + "pyxies", + "quickies", + "ramies", + "reveries", + "rookies", + "rotisseries", + "scrapies", + "sharpies", + "smoothies", + "softies", + "stoolies", + "stymies", + "swaggies", + "sweeties", + "talkies", + "techies", + "ties", + "tooshies", + "toughies", + "townies", + "veggies", + "walkie-talkies", + "wedgies", + "weenies", + "weirdies", + "yardies", + "yuppies", + "zombies" + ); + + /** Maps irregular Germanic English plural nouns to their singular form */ + public static Map irregular=new FinalMap( + "beefs","beef", + "beeves","beef", + "brethren","brother", + "busses","bus", + "cattle","cattlebeast", + "children","child", + "corpora","corpus", + "ephemerides","ephemeris", + "firemen","fireman", + "genera","genus", + "genies","genie", + "genii","genie", + "kine","cow", + "lice","louse", + "men","man", + "mice","mouse", + "mongooses","mongoose", + "monies","money", + "mythoi","mythos", + "octopodes","octopus", + "octopuses","octopus", + "oxen","ox", + "people","person", + "soliloquies","soliloquy", + "throes","throes", + "trilbys","trilby", + "women","woman" + ); + + /** Contains word forms that can either be plural or singular */ + public static Set singAndPlur=new FinalSet( + "acoustics", + "aestetics", + "aquatics", + "basics", + "ceramics", + "classics", + "cosmetics", + "dermatoglyphics", + "dialectics", + "dynamics", + "esthetics", + "ethics", + "harmonics", + "heroics", + "isometrics", + "mechanics", + "metrics", + "statistics", + "optic", + "people", + "physics", + "polemics", + "premises", + "propaedeutics", + "pyrotechnics", + "quadratics", + "quarters", + "statistics", + "tactics", + "tropics" + ); + + /** Test routine */ + public static void main(String[] argv) throws Exception { + System.out.println("Enter an English word in plural form and press ENTER"); + BufferedReader in=new BufferedReader(new InputStreamReader(System.in)); + while(true) { + String w=in.readLine(); + 
if(w.length()==0) break; + if(isPlural(w)) System.out.println("This word is plural"); + if(isSingular(w)) System.out.println("This word is singular"); + System.out.println("Stemmed to singular: "+stem(w)); + } + } +} diff --git a/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PositionTracker.java b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PositionTracker.java new file mode 100644 index 0000000..4c07240 --- /dev/null +++ b/WikipediaCategoryProcessor/src/main/java/org/yago/javatools/parsers/PositionTracker.java @@ -0,0 +1,288 @@ +package org.yago.javatools.parsers; + +import java.util.Collection; +import java.util.Iterator; +import java.util.Map; +import java.util.SortedMap; +import java.util.TreeMap; + +/** + * This class is part of the Java Tools (see + * http://mpii.de/yago-naga/javatools). It is licensed under the Creative + * Commons Attribution License (see http://creativecommons.org/licenses/by/3.0) + * by the YAGO-NAGA team (see http://mpii.de/yago-naga). + * + * This class implements position change trackers that keep track of position + * changes within a String, e.g. caused through normalization etc. + * This allows for instance, given a position int the normalized string + * to get the corresponding position in the original non-normalized string + * + * + * + * backward position tracker - + * tracking several replacement/text changes allowing to trace a position in the modified + * text back to the corresp. 
/**
 * This class is part of the Java Tools (see
 * http://mpii.de/yago-naga/javatools). It is licensed under the Creative
 * Commons Attribution License (see http://creativecommons.org/licenses/by/3.0)
 * by the YAGO-NAGA team (see http://mpii.de/yago-naga).
 *
 * Backward position tracker: tracks several replacements/text changes,
 * allowing to trace a position in the modified text back to the
 * corresponding position in the original text. For the other direction see
 * {@link ForwardPositionTracker}.
 *
 * Usage: report the changes of one pass over the text with
 * {@link #addPositionChange(int, int)}, then call {@link #closeRun()} before
 * reporting the changes of the next pass.
 *
 * @author smetzger
 */
public class PositionTracker {

    /** Closed runs: position in the modified text -> offset to add to get back to the original. */
    private SortedMap<Integer, Integer> positionMap;
    /** Current (open) run: position after the change -> negated modifier. */
    private SortedMap<Integer, Integer> positionChanges;
    /** Current (open) run: position before the change -> modifier. */
    private SortedMap<Integer, Integer> old2NewMap;
    /** Sum of all modifiers reported in the current run. */
    private int accumulatedModifier = 0;

    public PositionTracker() {
        positionMap = new TreeMap<Integer, Integer>();
        positionChanges = new TreeMap<Integer, Integer>();
        old2NewMap = new TreeMap<Integer, Integer>();
    }

    /** Adds up all values of the given collection. */
    private static int sumOf(Collection<Integer> values) {
        int sum = 0;
        for (int value : values) {
            sum += value;
        }
        return sum;
    }

    /**
     * Registers a change of the current run.
     *
     * @param pos position of the change, in the text as it was at the start of the current run
     * @param modifier number of positions by which the following text is shifted
     *        (negative when text was removed); a modifier of 0 is ignored
     */
    public void addPositionChange(int pos, int modifier) {
        if (modifier == 0) {
            return;
        }
        old2NewMap.put(pos, modifier);
        accumulatedModifier += modifier;
        // Key is the position after all changes of this run so far; changes
        // that end up at the same position are added up.
        int newPos = pos + accumulatedModifier;
        int oldModifier = 0;
        if (positionChanges.containsKey(newPos)) {
            oldModifier = positionChanges.get(newPos);
        }
        positionChanges.put(newPos, -modifier + oldModifier);
    }

    /**
     * Closes the current changing run by merging the new position changes into
     * the existing position change map. After each round (one round =
     * consecutive changes along the text) you need to call closeRun() before
     * submitting more position changes from a new round, i.e. whenever you
     * passed the string to be modified once, call closeRun() before starting
     * to run over the string again with more replacements.
     */
    public void closeRun() {
        if (positionChanges.isEmpty()) {
            return;
        }

        SortedMap<Integer, Integer> merged = positionChanges;

        // Adapt the positions of earlier runs to the new mapping: shift each
        // key by the modifiers of this run that lie at or before it.
        for (Map.Entry<Integer, Integer> entry : positionMap.entrySet()) {
            int key = entry.getKey();
            int newPosition = key + sumOf(old2NewMap.headMap(key + 1).values());
            int value = entry.getValue();
            if (merged.containsKey(newPosition)) {
                value += merged.get(newPosition);
            }
            merged.put(newPosition, value);
        }
        positionMap.clear();

        // The emptied map is reused for the next run.
        positionChanges = positionMap;
        positionMap = merged;
        old2NewMap.clear();
        accumulatedModifier = 0;
    }

    /**
     * Translates a position in the modified text to a position in the
     * original text, using the runs closed so far.
     *
     * @param pos position in the modified text
     * @return pos plus all recorded offsets at positions up to and including pos
     */
    public Integer translatePosition(Integer pos) {
        // TODO: possible optimization if we assume positions are asked in
        // ascending order: collapse the head map into a single entry.
        return pos + sumOf(positionMap.headMap(pos + 1).values());
    }

    /**
     * Forward position change tracking - keeps track of several rounds of text
     * modifications, allowing to trace a position in the original text along
     * the modifications to the corresponding position in the modified text.
     * After each round (one round = consecutive changes along the text) you
     * need to call closeRun() before submitting more position changes from a
     * new round.
     *
     * REMARK: NOT TESTED WITH MORE THAN ONE ROUND! May be erroneous with
     * multiple rounds, so use with care (works with a single round though).
     *
     * @author smetzger
     */
    public static class ForwardPositionTracker {

        /** Closed runs: position in the original text -> modifier. */
        private final SortedMap<Integer, Integer> positionMap;
        /** Current (open) run: position -> modifier. */
        private final SortedMap<Integer, Integer> positionChanges;
        /** Maps positions of the current text back to the original text. */
        private final PositionTracker new2OldTracker;

        public ForwardPositionTracker() {
            positionMap = new TreeMap<Integer, Integer>();
            positionChanges = new TreeMap<Integer, Integer>();
            new2OldTracker = new PositionTracker();
        }

        /**
         * Registers a change of the current run.
         *
         * @param pos position of the change
         * @param modifier number of positions by which the following text is
         *        shifted; a modifier of 0 is ignored
         */
        public void addPositionChange(int pos, int modifier) {
            if (modifier == 0) {
                return;
            }
            positionChanges.put(pos, modifier);
            new2OldTracker.addPositionChange(pos, modifier);
        }

        /**
         * Closes the current changing run by merging the new position changes
         * into the existing position change map. Do this every time you ran
         * once over the text making changes to be tracked.
         */
        public void closeRun() {
            if (positionChanges.isEmpty()) {
                return;
            }

            for (Map.Entry<Integer, Integer> change : positionChanges.entrySet()) {
                // Changes are stored under their position in the original text.
                Integer positionInOrigStream = new2OldTracker.translatePosition(change.getKey());
                int modifier = change.getValue();
                if (positionMap.containsKey(positionInOrigStream)) {
                    modifier += positionMap.get(positionInOrigStream);
                }
                positionMap.put(positionInOrigStream, modifier);
            }

            positionChanges.clear();
            new2OldTracker.closeRun();
        }

        /**
         * Tells whether a position in the original stream has been cut away by
         * some change operation, such that translating it usually would not
         * make much sense.
         *
         * TODO: the current version ONLY WORKS SECURELY WHEN THERE IS ONLY ONE
         * POSITION CHANGE RUN WITHOUT OVERLAPPING CHANGES! As soon as there is
         * more than one change run, or changes that overlap, all following
         * changes would need to be checked instead of only the next one.
         *
         * @param pos position in the original text
         * @return true iff the given position has been cut away, false
         *         otherwise (i.e. false if it should be mappable)
         */
        public boolean hasBeenCutAway(Integer pos) {
            SortedMap<Integer, Integer> tailMap = positionMap.tailMap(pos + 1);
            if (tailMap.isEmpty()) {
                return false;
            }
            // Only the next change after pos is inspected.
            Integer key = tailMap.firstKey();
            int modifier = tailMap.get(key);
            return modifier < 0 && key + modifier <= pos;
        }

        /**
         * Translates a position in the original text to a position in the
         * modified text, using the runs closed so far.
         *
         * @param pos position in the original text
         * @return pos plus all recorded modifiers at positions up to and including pos
         */
        public Integer translatePosition(Integer pos) {
            return pos + sumOf(positionMap.headMap(pos + 1).values());
        }

        /**
         * Like {@link #translatePosition(Integer)}, but also handles positions
         * inside text parts that have been cut out.
         *
         * TODO: the current version ONLY WORKS SECURELY WHEN THERE IS ONLY ONE
         * POSITION CHANGE RUN WITHOUT OVERLAPPING CHANGES! As soon as there is
         * more than one change run, or changes that overlap, all following
         * changes would need to be checked instead of only the next one.
         *
         * @param pos position in the original text
         * @return the translated position
         */
        public Integer translatePositionExactly(Integer pos) {
            SortedMap<Integer, Integer> tailMap = positionMap.tailMap(pos + 1);
            if (tailMap.isEmpty()) {
                return translatePosition(pos);
            }
            Integer key = tailMap.firstKey();
            int modifier = tailMap.get(key);
            // A position inside a removed span is clamped before translating.
            return translatePosition(Math.min(pos, key + modifier));
        }
    }
}
+ */ +public class AppTest + extends TestCase +{ + /** + * Create the test case + * + * @param testName name of the test case + */ + public AppTest( String testName ) + { + super( testName ); + } + + /** + * @return the suite of tests being tested + */ + public static Test suite() + { + return new TestSuite( AppTest.class ); + } + + /** + * Rigourous Test :-) + */ + public void testApp() + { + assertTrue( true ); + } +}