Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tool for processing Wikipedia Categories #23

Merged
merged 1 commit into from
Sep 23, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions WikipediaCategoryProcessor/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
/target/
/nbactions.xml
/nbactions-release-profile.xml

69 changes: 69 additions & 0 deletions WikipediaCategoryProcessor/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@

<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.karsha</groupId>
    <artifactId>WikipediaCategoryProcessor</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>WikipediaCategoryProcessor</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <!-- One version for every Lucene module below, so they cannot drift apart. -->
        <lucene.version>4.3.1</lucene.version>
    </properties>

    <!-- Declaration order is kept as it was: Maven uses it to break ties between transitive versions. -->
    <dependencies>
        <!-- Test scope only; not packaged with the jar. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.25</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>com.jayway.jsonpath</groupId>
            <artifactId>json-path</artifactId>
            <version>0.8.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.clerezza.ext</groupId>
            <artifactId>org.json.simple</artifactId>
            <version>0.3-incubating</version>
        </dependency>
        <!-- NOTE(review): this is a release-candidate version; consider moving to a final release. -->
        <dependency>
            <groupId>com.google.api-client</groupId>
            <artifactId>google-api-client</artifactId>
            <version>1.16.0-rc</version>
        </dependency>

        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queries</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>${lucene.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>${lucene.version}</version>
        </dependency>

    </dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/



/**
*
* Date Author Changes
* Jul 20, 2013 Kasun Perera Created
*
*/

package org.dbpedia.kasun.categoryprocessor;



/**
 * Empty placeholder class: it currently declares no fields, constructors or methods.
 */
public class Category
{
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
/**
 *
 * Date          Author          Changes
 * Jul 6, 2013   Kasun Perera    Created
 *
 */
package org.dbpedia.kasun.categoryprocessor;


import java.io.*;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;

/**
 * Static JDBC helpers that query the {@code category}, {@code categorylinks} and
 * {@code page_category} tables and append plain-text reports to files on disk.
 *
 * <p>Every method obtains its own connection from {@code DB_connection.dbConnect()} and releases
 * the result set, statement and connection in a {@code finally} block, so nothing stays open when
 * a query or a file write fails. Explicit {@code finally} blocks are used so the class does not
 * depend on Java 7 language features.
 *
 * <p>{@link SQLException}s are reported with {@code printStackTrace()} and otherwise swallowed;
 * callers are not told that a query failed.
 */
public class CategoryDB
{

    /** Directory the report files are appended to. The location is machine specific. */
    private static final String RESULTS_DIR =
        "F:\\Blogs\\GSOC 2013\\DbPedia\\Task 2- processing wikipedia catogories\\results_new\\";

    /** Titles passed to {@link #getCategoryByName(String)} that matched no {@code category} row. */
    private static final String CATEGORIES_NOT_FOUND_FILE =
        RESULTS_DIR + "categories_not_found_in_category_table_2.txt";

    /** {@code categorylinks} rows found by {@link #getCategoryDirectedByArticlePage(String)}. */
    private static final String CATEGORYLINKS_MATCH_FILE =
        RESULTS_DIR + "categorylinks_match_article_pages_v1.txt";

    /** Input lines for which {@link #getCategoryDirectedByArticlePage(String)} found no row. */
    private static final String CATEGORYLINKS_NOT_FOUND_FILE =
        RESULTS_DIR + "categorylinks_not_found_article_pages_v1.txt";

    /**
     * Counts the rows of {@code page_category} that have {@code cat_subcats = 0} and
     * {@code cat_pages} below the given threshold.
     *
     * @param threshold exclusive upper bound for {@code cat_pages}
     * @return the row count, or 0 when the query fails
     */
    public static int getCategoryPageCount( int threshold )
    {
        String query = "SELECT COUNT(*) FROM `page_category` WHERE `cat_subcats`=0 AND `cat_pages`< ? ";

        Connection connection = null;
        PreparedStatement ps = null;
        ResultSet rs = null;
        try
        {
            connection = new DB_connection().dbConnect();
            ps = connection.prepareStatement( query );
            ps.setInt( 1, threshold );
            rs = ps.executeQuery();
            // COUNT(*) without GROUP BY yields a single row.
            return rs.next() ? rs.getInt( 1 ) : 0;
        }
        catch ( SQLException e )
        {
            e.printStackTrace();
            return 0;
        }
        finally
        {
            release( rs, ps, connection );
        }
    }

    /**
     * Looks up {@code category} rows whose {@code cat_title} matches the given text and copies each
     * match into {@code page_category} through
     * {@link #insertCategory(int, String, int, int, int, boolean)}. When nothing matches, the text
     * is appended to the "categories not found" report file.
     *
     * <p>The comparison uses SQL {@code LIKE}, so {@code _} and {@code %} in the text act as
     * wildcards and a single title can match several rows. Each match after the first is reported
     * on standard output.
     *
     * @param line the category title to look up; it is bound to the query unchanged
     * @throws IOException if the report file cannot be written
     */
    public static void getCategoryByName( String line ) throws IOException
    {
        String query = "SELECT cat_id, cat_title,cat_pages,cat_subcats,cat_files,cat_hidden FROM `category` WHERE `cat_title` LIKE ? ";

        Connection connection = null;
        PreparedStatement ps = null;
        ResultSet rs = null;
        try
        {
            connection = new DB_connection().dbConnect();
            ps = connection.prepareStatement( query );
            ps.setString( 1, line );
            rs = ps.executeQuery();

            int count = 0;
            while ( rs.next() )
            {
                insertCategory( rs.getInt( "cat_id" ), rs.getString( "cat_title" ), rs.getInt( "cat_pages" ),
                    rs.getInt( "cat_subcats" ), rs.getInt( "cat_files" ), rs.getBoolean( "cat_hidden" ) );
                count++;
                if ( count > 1 )
                {
                    System.out.println( count + " count is over one " + line );
                }
            }

            if ( count == 0 )
            {
                appendLine( CATEGORIES_NOT_FOUND_FILE, line );
            }
        }
        catch ( SQLException e )
        {
            e.printStackTrace();
        }
        finally
        {
            release( rs, ps, connection );
        }
    }

    /**
     * Reports the {@code categorylinks} rows whose {@code cl_from} equals the page id held in the
     * first tab-separated field of the given line.
     *
     * <p>Each matching row is appended to the "match" report as
     * {@code cl_from \t cl_to \t cl_type}. When no row matches, the whole input line is appended
     * to the "not found" report instead.
     *
     * <p>The page id is parsed and bound as a statement parameter rather than concatenated into the
     * SQL text. A line whose first field is not a whole number is reported on standard error and
     * skipped without querying the database.
     *
     * @param line a tab-separated record whose first field is the page id
     * @throws IOException if a report file cannot be written
     */
    public static void getCategoryDirectedByArticlePage( String line ) throws IOException
    {
        String pageIdField = line.split( "\t" )[0].trim();
        long pageId;
        try
        {
            pageId = Long.parseLong( pageIdField );
        }
        catch ( NumberFormatException e )
        {
            System.err.println( "Skipping line without a numeric page id: " + line );
            return;
        }

        String query = "SELECT cl_from, cl_to, cl_type FROM `categorylinks` WHERE `cl_from` = ?";

        Connection connection = null;
        PreparedStatement ps = null;
        ResultSet rs = null;
        try
        {
            connection = new DB_connection().dbConnect();
            ps = connection.prepareStatement( query );
            ps.setLong( 1, pageId );
            rs = ps.executeQuery();

            if ( rs.next() )
            {
                // Open the report only when there is something to write, and only once per call.
                FileWriter matches = new FileWriter( CATEGORYLINKS_MATCH_FILE, true );
                try
                {
                    do
                    {
                        matches.append( rs.getInt( "cl_from" ) + "\t" + rs.getString( "cl_to" ) + "\t"
                            + rs.getString( "cl_type" ) + "\n" );
                    }
                    while ( rs.next() );
                }
                finally
                {
                    matches.close();
                }
            }
            else
            {
                appendLine( CATEGORYLINKS_NOT_FOUND_FILE, line );
            }
        }
        catch ( SQLException e )
        {
            e.printStackTrace();
        }
        finally
        {
            release( rs, ps, connection );
        }
    }

    /**
     * Walks the {@code categorylinks} rows whose {@code cl_to} matches the given category name and
     * calls {@code PageDB.isArticlePage} for each {@code cl_from}. Nothing is done with the answer
     * yet; the method is unfinished.
     *
     * <p>The trimmed name is bound as a {@code LIKE} parameter, so it must be the plain title
     * without SQL quoting. {@code _} and {@code %} in the name act as wildcards.
     *
     * @param line the category name; leading and trailing whitespace is ignored
     * @throws IOException declared for callers; this method itself writes no file
     */
    public static void getCategoryLinkByCatName( String line ) throws IOException
    {
        String categoryName = line.trim();
        String query = "SELECT cl_from FROM `categorylinks` WHERE `cl_to` LIKE ?";

        Connection connection = null;
        PreparedStatement ps = null;
        ResultSet rs = null;
        try
        {
            connection = new DB_connection().dbConnect();
            ps = connection.prepareStatement( query );
            ps.setString( 1, categoryName );
            rs = ps.executeQuery();

            while ( rs.next() )
            {
                // TODO: decide what to do with members that are not article pages.
                if ( !PageDB.isArticlePage( rs.getInt( "cl_from" ) ) )
                {
                }
            }
        }
        catch ( SQLException e )
        {
            e.printStackTrace();
        }
        finally
        {
            release( rs, ps, connection );
        }
    }

    /**
     * Inserts one row into {@code page_category}. The statement is {@code INSERT IGNORE}, so a row
     * that collides with an existing key is skipped without an error.
     *
     * @param cat_id      value for {@code cat_id}
     * @param cat_title   value for {@code cat_title}
     * @param cat_pages   value for {@code cat_pages}
     * @param cat_subcats value for {@code cat_subcats}
     * @param cat_files   value for {@code cat_files}
     * @param cat_hidden  value for {@code cat_hidden}
     */
    public static void insertCategory( int cat_id, String cat_title, int cat_pages, int cat_subcats, int cat_files, boolean cat_hidden )
    {
        /*
         * Column definitions as noted by the original author:
         * `cat_id` int(10) unsigned NOT NULL AUTO_INCREMENT,
         * `cat_title` varbinary(255) NOT NULL DEFAULT '',
         * `cat_pages` int(11) NOT NULL DEFAULT '0',
         * `cat_subcats` int(11) NOT NULL DEFAULT '0',
         * `cat_files` int(11) NOT NULL DEFAULT '0',
         * `cat_hidden` tinyint(1) unsigned NOT NULL DEFAULT '0',
         */
        String query = "INSERT IGNORE INTO page_category(cat_id,cat_title,cat_pages,cat_subcats,cat_files,cat_hidden) VALUES (?,?,?,?,?,?)";

        Connection connection = null;
        PreparedStatement ps = null;
        try
        {
            connection = new DB_connection().dbConnect();
            ps = connection.prepareStatement( query );
            ps.setInt( 1, cat_id );
            ps.setString( 2, cat_title );
            ps.setInt( 3, cat_pages );
            ps.setInt( 4, cat_subcats );
            ps.setInt( 5, cat_files );
            ps.setBoolean( 6, cat_hidden );
            ps.executeUpdate();
        }
        catch ( SQLException e )
        {
            e.printStackTrace();
        }
        finally
        {
            release( null, ps, connection );
        }
    }

    /** Appends {@code text} followed by a newline to the file at {@code path}, creating it if needed. */
    private static void appendLine( String path, String text ) throws IOException
    {
        FileWriter out = new FileWriter( path, true );
        try
        {
            out.append( text + "\n" );
        }
        finally
        {
            out.close();
        }
    }

    /**
     * Closes the given JDBC resources in result set, statement, connection order. Null arguments
     * are skipped, and a failure to close one resource does not prevent closing the others.
     */
    private static void release( ResultSet rs, PreparedStatement ps, Connection connection )
    {
        if ( rs != null )
        {
            try
            {
                rs.close();
            }
            catch ( SQLException e )
            {
                e.printStackTrace();
            }
        }
        if ( ps != null )
        {
            try
            {
                ps.close();
            }
            catch ( SQLException e )
            {
                e.printStackTrace();
            }
        }
        if ( connection != null )
        {
            try
            {
                connection.close();
            }
            catch ( SQLException e )
            {
                e.printStackTrace();
            }
        }
    }
}
Loading