Commit 7ef2b7f (0 parents): 77 changed files with 9,406 additions and 0 deletions.
@@ -0,0 +1,3 @@
{
  "presets": ["@babel/preset-env"]
}
@@ -0,0 +1,42 @@
module.exports = {
  'env': {
    'node': true,
    'commonjs': true,
    'es6': true
  },
  'extends': 'eslint:recommended',
  'parserOptions': {
    'ecmaVersion': 2018,
    'sourceType': 'module',
  },
  'rules': {
    'indent': [
      'warn',
      2,
      {
        'VariableDeclarator': {'var': 2, 'let': 2, 'const': 3},
        'SwitchCase': 1,
        'MemberExpression': 1,
        'CallExpression': {'arguments': 'first'},
        'ArrayExpression': 'first',
        'ObjectExpression': 'first',
        'ignoredNodes': ['ConditionalExpression']
      },
    ],
    'quotes': [
      'warn',
      'single'
    ],
    'semi': [
      'warn',
      'always'
    ],
    'no-console': [
      'warn',
      { 'allow': ['warn', 'error'] }
    ],
    'no-trailing-spaces': [
      'warn'
    ]
  }
};
@@ -0,0 +1,99 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test

# parcel-bundler cache (https://parceljs.org/)
.cache

# next.js build output
.next

# nuxt.js build output
.nuxt

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/
/.project
/.settings
/benchmarks.json
/benchmarks.txt
/dist

# remote sync config file
.remote-sync.json

# Output
/cache*
/dataset*
/charts*
/experiments*
/wikipedia-dump*
/eval*
@@ -0,0 +1,22 @@
MIT License

Copyright (c) 2022 Friedrich Schiller University Jena
Copyright (c) 2022 German Aerospace Center (DLR)

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,118 @@
[![DOI](https://zenodo.org/badge/.svg)](https://zenodo.org/badge/latestdoi/)
[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://github.com/fusion-jena/wiki-category-consistency/blob/master/LICENSE)

# Consistency between Wikidata and Wikipedia Categories

- We analyze the consistency between [Wikipedia category](https://en.wikipedia.org/wiki/Wikipedia:Categorization) members and their Wikidata counterpart entities, retrieved by executing the SPARQL queries attached to Wikidata categories under the property [Wikidata SPARQL query equivalent](https://www.wikidata.org/wiki/Property:P3921).
- We focus on comparing the member sets of the two sources and on automatically investigating possible reasons why they are not identical, in addition to comparing the consistency of the information provided within the Wikidata category itself (see the sketch below).
- To this end, we propose a pipeline for the generation and evaluation of the relevant data (categories and Wikidata/Wikipedia entity sets).
- Wikipedia category members are retrieved by traversing the Wikipedia category hierarchy in all available languages.
- This repository provides the steps needed to either reproduce the current results or conduct a new experiment using other Wikipedia/Wikidata versions.
- The pipeline works with both the Wikipedia/Wikidata public endpoints and SQL/JSON dumps.
- Our experiments were run using **the Wikidata JSON dump of 2022-05-02 and the Wikipedia SQL dumps of 2022-05-01**.
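As an illustration of the core member-set comparison, here is a minimal JavaScript sketch (hypothetical names, not the repository's actual code) that scores the Wikidata query result against the Wikipedia category members:

```
// Minimal sketch of the member-set comparison (hypothetical names,
// not the repository's actual implementation).
function compareMembers(sparqlMembers, categoryMembers) {
  const wd = new Set(sparqlMembers);    // entities returned by the category's SPARQL query
  const wp = new Set(categoryMembers);  // Wikidata counterparts of the Wikipedia category members
  const common = [...wd].filter((qid) => wp.has(qid)).length;
  return {
    identical: wd.size === wp.size && common === wd.size,
    precision: wd.size > 0 ? common / wd.size : 0, // share of query results that are also category members
    recall: wp.size > 0 ? common / wp.size : 0,    // share of category members also found by the query
  };
}

// Example with Wikidata QIDs:
console.log(compareMembers(['Q1', 'Q2', 'Q3'], ['Q2', 'Q3', 'Q4']));
// → { identical: false, precision: 0.666..., recall: 0.666... }
```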
# Candidate Generation and Evaluation

![approach](figs/approach.png)

- The figure above depicts the workflow for data generation and evaluation (see the paper for more details).

## Cache Population

- We provide two options to run the generation workflow:
|
||
### From Dumps | ||
|
||
1. First, two kind of dump versions should be selected: | ||
- [Wikidata JSON Dump](https://dumps.wikimedia.org/wikidatawiki/entities/) (`wikidata-<version>-all.json.gz`) | ||
- [Wikipedia SQL Dumps](https://dumps.wikimedia.org/backup-index.html) (available dataset versions could be checked by visiting e.g., [enwiki](https://dumps.wikimedia.org/enwiki/) for English Wikipedia) | ||
|
||
2. Second, Setup a MariaDB database (place where Wikipedia SQL Dumps will be imported) | ||
- Install MariaDB : `sudo apt-get install mariadb-server`. | ||
- Set root password: | ||
``` | ||
$ sudo mysql -u root | ||
MariaDB [(none)]> SET PASSWORD = PASSWORD('DB_PASSWORD'); | ||
MariaDB [(none)]> update mysql.user set plugin = 'mysql_native_password' where User='root'; | ||
MariaDB [(none)]> FLUSH PRIVILEGES; | ||
``` | ||
- Create a Database: | ||
``` | ||
$ sudo mysql -u root | ||
MariaDB [(none)]> create database <DB_NAME> character set binary; | ||
Query OK, 1 row affected (0.00 sec) | ||
MariaDB [(none)]> use <DB_NAME>; | ||
Database changed | ||
``` | ||
- Optimize Database import by setting following parameter in `/etc/mysql/my.cnf` and then restart the database server `service mysql restart`. | ||
``` | ||
wait_timeout = 604800 | ||
innodb_buffer_pool_size = 8G | ||
innodb_log_buffer_size = 1G | ||
innodb_log_file_size = 512M | ||
innodb_flush_log_at_trx_commit = 2 | ||
innodb_doublewrite = 0 | ||
innodb_write_io_threads = 16 | ||
``` | ||
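   - For illustration, a minimal sketch of importing one dump file into the database (the file and table names follow the standard Wikipedia dump naming and are shown as an assumption; import each required SQL dump file the same way):
     ```
     $ gunzip enwiki-<wikipedia-version>-page.sql.gz
     $ mysql -u root -p <DB_NAME> < enwiki-<wikipedia-version>-page.sql
     ```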
3. Update the [configuration file](https://github.com/fusion-jena/wiki-category-consistency/blob/master/src/cache-population/config.js) in `./src/cache-population/config.js`:
   ```
   // wikidata dump version
   wdDump: path.join(__dirname, '..', '..', '..', 'wikidata-<version>-all.json.gz'),
   // wikipedia dump version
   dumpDate : <wikipedia-version>,
   // wikipedia database user
   user: 'root',
   // wikipedia database password
   password: <DB_PASSWORD>,
   // wikipedia database name
   databaseName: <DB_NAME>,
   ```
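   For illustration, a hypothetically filled-in configuration for the dump versions used in our experiments (the password and database name are placeholders, and the `YYYYMMDD` string form of `dumpDate` is an assumption):
   ```
   wdDump: path.join(__dirname, '..', '..', '..', 'wikidata-20220502-all.json.gz'),
   dumpDate : '20220501',
   user: 'root',
   password: 'DB_PASSWORD',
   databaseName: 'wikipedia',
   ```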
4. Install [Node.js (minimum v14.16.1)](https://nodejs.org/en/).
5. Download the repository and install the dependencies: run `npm install` in the project root folder.
6. Populate the caches from the dumps: `npm run runner`.
7. Continue with the steps described under [Candidate Generation and Cleaning](#candidate-generation-and-cleaning).
### From Endpoints

- This option needs no local setup or prior cache population.
- It directly sends requests to the Wikidata/Wikipedia public endpoints:
  - [Wikidata public endpoint](https://query.wikidata.org/sparql)
  - [MediaWiki API](https://www.mediawiki.org/wiki/API:Main_page)
- To generate the relevant data from the endpoints, directly follow the steps under [Candidate Generation and Cleaning](#candidate-generation-and-cleaning), which include a sketch of a direct endpoint request.
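For illustration, a minimal direct request against the Wikidata public endpoint (the query is a placeholder, not one of the pipeline's own queries):

```
$ curl -G 'https://query.wikidata.org/sparql' \
    -H 'Accept: application/sparql-results+json' \
    --data-urlencode 'query=SELECT ?item WHERE { ?item wdt:P31 wd:Q5 } LIMIT 5'
```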
## Candidate Generation and Cleaning

- **To reproduce the current experiments**, one can use the already filled caches provided on Zenodo: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6913134.svg)](https://doi.org/10.5281/zenodo.6913134).
  - A collection of SQLite database files containing all data retrieved from the Wikidata JSON dump of 2022-05-02 and the Wikipedia SQL dumps of 2022-05-01.
- To reproduce the current experiments or perform new ones, follow the steps below (start directly with step 3 if Node.js and the dependencies were already installed in [From Dumps](#from-dumps)):
  1. Install [Node.js (minimum v14.16.1)](https://nodejs.org/en/).
  2. Download the repository and install the dependencies: run `npm install` in the project root folder.
  3. Choose one setup:
     * *Generate from endpoints*: set `endpointEnabled: true` in `./src/config/config.js`.
     * *Generate from dumps*: set `endpointEnabled: false` in `./src/config/config.js`.
     * *Reproduce the current experiments*: set `endpointEnabled: false` in `./src/config/config.js`. In the root folder, create a folder `./cache/`, unzip `wiki-category-consistency-cache.zip`, and copy the content of the contained folder (`cache`) into the newly created `cache` folder (see the sketch after this list).
  4. To generate candidate categories, run `npm run generateCandidate` in the root folder. The output files can be found under `./dataset/`. The output is a JSON file containing all the raw data needed for further processing, `./dataset/raw-data.json`, in addition to log and statistics files.
  5. Next, for candidate cleaning, run `npm run cleanCandidate` in the root folder. The output is a JSON file with the selected category entries, `./dataset/finalData.json`, together with a statistics file.
- The outputs of the candidate generation and cleaning steps using the Wikidata JSON dump of 2022-05-02 and the Wikipedia SQL dumps of 2022-05-01 (`./dataset/raw-data.json`, `./dataset/finalData.json`, and accompanying files) are provided on Zenodo: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6913282.svg)](https://doi.org/10.5281/zenodo.6913282).
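A minimal sketch of the cache setup for reproducing the current experiments (the archive name is the one from Zenodo; `cache-tmp` is a scratch directory chosen here for illustration):

```
$ mkdir cache
$ unzip wiki-category-consistency-cache.zip -d cache-tmp
$ cp -r cache-tmp/cache/* ./cache/
```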
## Evaluation

- The evaluation takes as input the file with the cleaned category entries, `./dataset/finalData.json`.
  1. Generate the evaluation metrics by running `npm run compareSPARQL` in the root folder. The output is a JSON file, `./eval/compare.json`, together with an additional statistics file.
  2. Plot the charts by running `npm run generatePlots` in the root folder. The output is a collection of HTML files under the folder `./charts/`.
- The experiment results (`./eval/compare.json` with a statistics file) using the previously mentioned dump versions are provided on Zenodo: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6913332.svg)](https://doi.org/10.5281/zenodo.6913332).

<!---## Cite , consider updating codemeta with paper link and also zenodo metadata-->

## License

This project is licensed under the [MIT License](https://git.rz.uni-jena.de/fusion/project/dlr-knowledge-modeling/wiki-category-consistency/-/blob/master/LICENSE).
@@ -0,0 +1,62 @@
{
  "@context": "https://doi.org/10.5063/schema/codemeta-2.0",
  "@type": "SoftwareSourceCode",
  "license": "https://spdx.org/licenses/MIT",
  "codeRepository": "",
  "dateCreated": "",
  "datePublished": "",
  "dateModified": "",
  "downloadUrl": "",
  "issueTracker": "",
  "name": "wiki-category-consistency",
  "version": "1.0.0",
  "identifier": "",
  "description": "Source code for Evaluating the consistency between Wikipedia and Wikidata categories",
  "referencePublication": "",
  "keywords": [
    "Wikidata",
    "Wikipedia"
  ],
  "programmingLanguage": [
    "JavaScript"
  ],
  "runtimePlatform": [
    "nodejs"
  ],
  "softwareRequirements": [
    "node version >= v14.16.1"
  ],
  "relatedLink": [
    "https://doi.org/10.5281/zenodo.6913134",
    "https://doi.org/10.5281/zenodo.6913282",
    "https://doi.org/10.5281/zenodo.6913332"
  ],
  "author": [
    {
      "@type": "Person",
      "@id": "https://orcid.org/0000-0001-8896-8208",
      "givenName": "Leila",
      "familyName": "Feddoul",
      "email": "[email protected]",
      "affiliation": {
        "@type": "Organization",
        "name": "Heinz Nixdorf Chair for Distributed Information Systems, Friedrich Schiller University Jena, Jena, Germany"
      }
    }
  ],
  "contributor": [
    {
      "@type": "Person",
      "@id": "https://orcid.org/0000-0002-0964-4457",
      "givenName": "Sirko",
      "familyName": "Schindler",
      "email": "[email protected]",
      "affiliation": {
        "@type": "Organization",
        "name": "Institute of Data Science, German Aerospace Center DLR, Jena, Germany"
      }
    }
  ]
}