diff --git a/.gitignore b/.gitignore
index 21100e3..06122cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -100,6 +100,9 @@ ENV/
 
 # mypy
 .mypy_cache/
 
+# macOS
+.DS_Store
+
 .idea/
 data/*/*
\ No newline at end of file
diff --git a/README.md b/README.md
index c5f6ba2..266cd2f 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,9 @@
 # 🤖 Neural SPARQL Machines
 A LSTM-based Machine Translation Approach for Question Answering.
 
-![alt text](http://www.liberai.org/img/flag-uk-160px.png "English")
-![alt text](http://www.liberai.org/img/seq2seq-webexport-160px.png "seq2seq")
-![alt text](http://www.liberai.org/img/flag-sparql-160px.png "SPARQL")
+![British flag.](http://www.liberai.org/img/flag-uk-160px.png "English")
+![Seq2Seq neural network.](http://www.liberai.org/img/seq2seq-webexport-160px.png "seq2seq")
+![Semantic triple flag.](http://www.liberai.org/img/flag-sparql-160px.png "SPARQL")
 
 ## Code
 
@@ -23,7 +23,9 @@ Install TensorFlow (e.g., `pip install tensorflow`).
 The template used in the paper can be found in a file such as `annotations_monument.tsv`.
 To generate the training data, launch the following command.
+
 ```bash
+mkdir data/monument_300
 python generator.py --templates data/annotations_monument.csv --output data/monument_300
 ```
@@ -35,16 +37,19 @@ python build_vocab.py data/monument_300/data_300.sparql > data/monument_300/vocab.sparql
 ```
 
 Count lines in `data_.*`
+
 ```bash
-NUMLINES= $(echo awk '{ print $1}' | cat data/monument_300/data_300.sparql | wc -l)
+NUMLINES=$(wc -l < data/monument_300/data_300.sparql)
 echo $NUMLINES
 # 7097
 ```
 
 Split the `data_.*` files into `train_.*`, `dev_.*`, and `test_.*` (usually 80-10-10%).
+
+
 ```bash
 cd data/monument_300/
-python ../../split_in_train_dev_test.py --lines $NUMLINES --dataset data.sparql
+python ../../split_in_train_dev_test.py --lines $NUMLINES --dataset data_300.sparql
 ```
 
 #### Pre-generated data
@@ -53,7 +58,8 @@ Alternatively, you can extract pre-generated data from `data/monument_300.zip` a
 
 ### Training
 
-Launch `train.sh` to train the model. The first parameter is the prefix of the data directory. The second parameter is the number of training epochs.
+
+Now go back to the initial directory and launch `train.sh` to train the model. The first parameter is the prefix of the data directory and the second parameter is the number of training epochs.
 
 ```bash
 sh train.sh data/monument_300 120000
 ```
@@ -69,13 +75,15 @@ Predict the SPARQL sentence for a given question with a given model.
 
 ```bash
 sh ask.sh data/monument_300 "where is edward vii monument located in?"
 ```
 
-## Paper
+## Papers
+
+### Soru and Marx et al., 2017
 
 * Permanent URI: http://w3id.org/neural-sparql-machines/soru-marx-semantics2017.html
 * arXiv: https://arxiv.org/abs/1708.07624
 
 ```
-@proceedings{soru-marx-2017,
+@inproceedings{soru-marx-2017,
     author = "Tommaso Soru and Edgard Marx and Diego Moussallem and Gustavo Publio and Andr\'e Valdestilhas and Diego Esteves and Ciro Baron Neto",
     title = "{SPARQL} as a Foreign Language",
     year = "2017",
@@ -84,18 +92,23 @@ sh ask.sh data/monument_300 "where is edward vii monument located in?"
 }
 ```
 
-## Contact
-
-* Neural SPARQL Machines [mailing list](https://groups.google.com/forum/#!forum/neural-sparql-machines).
-* Follow the [project on ResearchGate](https://www.researchgate.net/project/Neural-SPARQL-Machines).
-
+### Soru et al., 2018
+
+* NAMPI Website: https://uclmr.github.io/nampi/
+* arXiv: https://arxiv.org/abs/1806.10478
 
-## Aman Mehta - [GSoC]
-
-Hi, this is a first commit test on gsoc-aman branch.
-Please find my blog [here](https://amanmehta-maniac.github.io) - here you will find details about what this project had to offer.
-1. To be able to generate the dataset automatically, there is a five step pipeline which you would have to follow, guided at 'PIPELINE' file.
-2. Otherwise you can directly use the data generated under data/place_v2.zip and data/Compositions_v2.zip
+```
+@inproceedings{soru-marx-nampi2018,
+    author = "Tommaso Soru and Edgard Marx and Andr\'e Valdestilhas and Diego Esteves and Diego Moussallem and Gustavo Publio",
+    title = "Neural Machine Translation for Query Construction and Composition",
+    year = "2018",
+    booktitle = "ICML Workshop on Neural Abstract Machines \& Program Induction (NAMPI v2)",
+    url = "https://arxiv.org/abs/1806.10478",
+}
+```
+
+## Contact
+
+* Primary contacts: [Tommaso Soru](http://tommaso-soru.it) and [Edgard Marx](http://emarx.org).
+* Neural SPARQL Machines [mailing list](https://groups.google.com/forum/#!forum/neural-sparql-machines).
+* Follow the [project on ResearchGate](https://www.researchgate.net/project/Neural-SPARQL-Machines).
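
The README hunks above document the generation step, in which `generator.py` fills query templates with entities to produce parallel English/SPARQL lines. As a rough illustration of that idea only — the template text, the `<A>` placeholder, and the entity values below are hypothetical, not the repository's actual `generator.py` logic — here is a minimal Python sketch:

```python
# Minimal sketch of template-based pair generation.
# Template, placeholder, and entities are hypothetical examples.
templates = [
    ("where is <A> located in?",
     "SELECT ?x WHERE { <A> dbo:location ?x . }"),
]
entities = [("Edward VII monument", "dbr:Edward_VII_Monument")]

pairs = []
for en_tpl, sparql_tpl in templates:
    for label, uri in entities:
        pairs.append((en_tpl.replace("<A>", label),
                      sparql_tpl.replace("<A>", uri)))

# One English/SPARQL pair per (template, entity) combination.
for en, sparql in pairs:
    print("%s\t%s" % (en, sparql))
```

Each template thus expands into one training pair per entity, which is how a single annotation file can yield thousands of lines in `data/monument_300`.
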
diff --git a/analyse.py b/analyse.py
old mode 100644
new mode 100755
index 2cb7904..3d0f108
--- a/analyse.py
+++ b/analyse.py
@@ -1,3 +1,15 @@
+#!/usr/bin/env python
+"""
+
+Neural SPARQL Machines - Analysis and validation of questions translated into queries.
+
+'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017
+https://w3id.org/neural-sparql-machines/soru-marx-semantics2017.html
+https://arxiv.org/abs/1708.07624
+
+Version 0.1.0-akaha
+
+"""
 import argparse
 import collections
 import json
diff --git a/build_vocab.py b/build_vocab.py
index 3590576..aa980cb 100755
--- a/build_vocab.py
+++ b/build_vocab.py
@@ -1,5 +1,14 @@
 #!/usr/bin/env python
 """
+
+Neural SPARQL Machines - Build the vocabulary.
+
+'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017
+https://w3id.org/neural-sparql-machines/soru-marx-semantics2017.html
+https://arxiv.org/abs/1708.07624
+
+Version 0.0.4
+
 Usage: python build_vocab.py data.en > vocab.en
 """
 import numpy as np
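
The `build_vocab.py` header above documents its usage (`python build_vocab.py data.en > vocab.en`). For orientation, a minimal sketch of what a vocabulary builder of this shape does — a frequency-sorted token list on stdout; this is an illustration under that assumption, not the repository's actual implementation:

```python
import collections
import sys

# Count whitespace-separated tokens in the input file and emit one
# token per line, most frequent first, in the style of a vocab file.
counts = collections.Counter()
with open(sys.argv[1]) as f:
    for line in f:
        counts.update(line.split())

for token, _ in counts.most_common():
    print(token)
```

Redirecting stdout (`> vocab.en`) then produces the vocabulary file consumed by the seq2seq training step.
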
diff --git a/data/movies_300.zip b/data/movies_300.zip
new file mode 100644
index 0000000..abc9cdd
Binary files /dev/null and b/data/movies_300.zip differ
diff --git a/filter_dataset.py b/filter_dataset.py
old mode 100644
new mode 100755
index 6c576e9..264c67d
--- a/filter_dataset.py
+++ b/filter_dataset.py
@@ -1,3 +1,15 @@
+#!/usr/bin/env python
+"""
+
+Neural SPARQL Machines - Filter dataset by a given criterion.
+
+'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017
+https://w3id.org/neural-sparql-machines/soru-marx-semantics2017.html
+https://arxiv.org/abs/1708.07624
+
+Version 0.1.0-akaha
+
+"""
 import argparse
 import collections
 import json
diff --git a/generator_test.py b/generator_test.py
old mode 100644
new mode 100755
index 64fc122..e630c04
--- a/generator_test.py
+++ b/generator_test.py
@@ -1,3 +1,15 @@
+#!/usr/bin/env python
+"""
+
+Neural SPARQL Machines - Unit tests for the generator.
+
+'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017
+https://w3id.org/neural-sparql-machines/soru-marx-semantics2017.html
+https://arxiv.org/abs/1708.07624
+
+Version 0.1.0-akaha
+
+"""
 import generator
 import generator_utils
 import operator
diff --git a/generator_utils.py b/generator_utils.py
old mode 100644
new mode 100755
index f6c3c32..4c906ad
--- a/generator_utils.py
+++ b/generator_utils.py
@@ -1,3 +1,15 @@
+#!/usr/bin/env python
+"""
+
+Neural SPARQL Machines - Generator utils.
+
+'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017
+https://w3id.org/neural-sparql-machines/soru-marx-semantics2017.html
+https://arxiv.org/abs/1708.07624
+
+Version 0.0.4
+
+"""
 import collections
 import httplib
 import json
@@ -33,12 +45,6 @@ def save_cache ( file, cache ):
     with open(file, 'w') as outfile:
         json.dump(ordered, outfile)
 
-# proxies = {'http': 'http://proxy.iiit.ac.in:8080/', 'https': 'http://proxy.iiit.ac.in:8080/'}
-# proxy_handler = urllib2.ProxyHandler(proxies)
-# opener = urllib2.build_opener(proxy_handler)
-# urllib2.install_opener(opener)
-
-
 def query_dbpedia( query ):
     param = dict()
     param["default-graph-uri"] = GRAPH
diff --git a/gsoc/aman/.DS_Store b/gsoc/aman/.DS_Store
new file mode 100644
index 0000000..c051808
Binary files /dev/null and b/gsoc/aman/.DS_Store differ
diff --git a/GS_with_mve.csv b/gsoc/aman/GS_with_mve.csv
similarity index 100%
rename from GS_with_mve.csv
rename to gsoc/aman/GS_with_mve.csv
diff --git a/PIPELINE b/gsoc/aman/PIPELINE
similarity index 100%
rename from PIPELINE
rename to gsoc/aman/PIPELINE
diff --git a/gsoc/aman/README.md b/gsoc/aman/README.md
new file mode 100644
index 0000000..75a1b65
--- /dev/null
+++ b/gsoc/aman/README.md
@@ -0,0 +1,6 @@
+## Aman: Work done during DBpedia's Google Summer of Code 2018
+
+Hi, please find my blog here: https://amanmehta-maniac.github.io. It details the work done on this project, which is based on https://github.com/AKSW/NSpM.
+
+1. To generate the dataset automatically, follow the five-step pipeline described in the `PIPELINE` file.
+2. Otherwise, you can directly use the data generated under `./data/place_v2.zip` and `./data/Compositions_v2.zip`.
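
The `generator_utils.py` hunk above removes a commented-out proxy setup around `query_dbpedia`, which sends a query to a Virtuoso endpoint with a `default-graph-uri` parameter. A minimal Python 3 sketch of that style of call — the repository itself targets Python 2 and `httplib`, and the endpoint and graph URIs below are the public DBpedia defaults, assumed here for illustration:

```python
import json
import urllib.parse
import urllib.request

def query_dbpedia(query,
                  endpoint="https://dbpedia.org/sparql",
                  graph="http://dbpedia.org"):
    # Virtuoso accepts the query, the default graph, and the desired
    # result format as plain GET parameters.
    params = urllib.parse.urlencode({
        "default-graph-uri": graph,
        "query": query,
        "format": "application/sparql-results+json",
    })
    with urllib.request.urlopen(endpoint + "?" + params) as response:
        return json.load(response)

results = query_dbpedia(
    "SELECT ?x WHERE { <http://dbpedia.org/resource/Berlin> "
    "<http://dbpedia.org/ontology/country> ?x } LIMIT 1")
print(results["results"]["bindings"])
```
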
diff --git a/composite_template.py b/gsoc/aman/composite_template.py
similarity index 100%
rename from composite_template.py
rename to gsoc/aman/composite_template.py
diff --git a/decision_tree.py b/gsoc/aman/decision_tree.py
similarity index 100%
rename from decision_tree.py
rename to gsoc/aman/decision_tree.py
diff --git a/delete_lines.py b/gsoc/aman/delete_lines.py
similarity index 100%
rename from delete_lines.py
rename to gsoc/aman/delete_lines.py
diff --git a/entity_errors.py b/gsoc/aman/entity_errors.py
similarity index 100%
rename from entity_errors.py
rename to gsoc/aman/entity_errors.py
diff --git a/error_analysis.py b/gsoc/aman/error_analysis.py
similarity index 100%
rename from error_analysis.py
rename to gsoc/aman/error_analysis.py
diff --git a/final_formatting.py b/gsoc/aman/final_formatting.py
similarity index 100%
rename from final_formatting.py
rename to gsoc/aman/final_formatting.py
diff --git a/get_metadata.py b/gsoc/aman/get_metadata.py
similarity index 100%
rename from get_metadata.py
rename to gsoc/aman/get_metadata.py
diff --git a/get_properties.py b/gsoc/aman/get_properties.py
similarity index 100%
rename from get_properties.py
rename to gsoc/aman/get_properties.py
diff --git a/integrate.py b/gsoc/aman/integrate.py
similarity index 100%
rename from integrate.py
rename to gsoc/aman/integrate.py
diff --git a/temp/log_place b/gsoc/aman/log_place
similarity index 100%
rename from temp/log_place
rename to gsoc/aman/log_place
diff --git a/metadata_place.txt b/gsoc/aman/metadata_place.txt
similarity index 100%
rename from metadata_place.txt
rename to gsoc/aman/metadata_place.txt
diff --git a/temp/place_labels b/gsoc/aman/place_labels
similarity index 100%
rename from temp/place_labels
rename to gsoc/aman/place_labels
diff --git a/temp/placetemp/data_300.en b/gsoc/aman/placetemp/data_300.en
similarity index 100%
rename from temp/placetemp/data_300.en
rename to gsoc/aman/placetemp/data_300.en
diff --git a/temp/placetemp/data_300.sparql b/gsoc/aman/placetemp/data_300.sparql
similarity index 100%
rename from temp/placetemp/data_300.sparql
rename to gsoc/aman/placetemp/data_300.sparql
diff --git a/temp/placetemp/resource_dump.json b/gsoc/aman/placetemp/resource_dump.json
similarity index 100%
rename from temp/placetemp/resource_dump.json
rename to gsoc/aman/placetemp/resource_dump.json
diff --git a/range_place.py b/gsoc/aman/range_place.py
similarity index 100%
rename from range_place.py
rename to gsoc/aman/range_place.py
diff --git a/remove_en.py b/gsoc/aman/remove_en.py
similarity index 100%
rename from remove_en.py
rename to gsoc/aman/remove_en.py
diff --git a/script.py b/gsoc/aman/script.py
similarity index 100%
rename from script.py
rename to gsoc/aman/script.py
diff --git a/script2.py b/gsoc/aman/script2.py
similarity index 100%
rename from script2.py
rename to gsoc/aman/script2.py
diff --git a/sparql_generator.py b/gsoc/aman/sparql_generator.py
similarity index 100%
rename from sparql_generator.py
rename to gsoc/aman/sparql_generator.py
diff --git a/temp/temp.py b/gsoc/aman/temp.py
similarity index 100%
rename from temp/temp.py
rename to gsoc/aman/temp.py
diff --git a/tempout b/gsoc/aman/tempout
similarity index 100%
rename from tempout
rename to gsoc/aman/tempout
diff --git a/test.txt b/gsoc/aman/test.txt
similarity index 100%
rename from test.txt
rename to gsoc/aman/test.txt
diff --git a/test_pipeline/1 b/gsoc/aman/test_pipeline/1
similarity index 100%
rename from test_pipeline/1
rename to gsoc/aman/test_pipeline/1
diff --git a/test_pipeline/2 b/gsoc/aman/test_pipeline/2
similarity index 100%
rename from test_pipeline/2
rename to gsoc/aman/test_pipeline/2
diff --git a/test_pipeline/dbpedia-201610-properties.tsv b/gsoc/aman/test_pipeline/dbpedia-201610-properties.tsv
similarity index 100%
rename from test_pipeline/dbpedia-201610-properties.tsv
rename to gsoc/aman/test_pipeline/dbpedia-201610-properties.tsv
diff --git a/test_pipeline/get_properties.py b/gsoc/aman/test_pipeline/get_properties.py
similarity index 100%
rename from test_pipeline/get_properties.py
rename to gsoc/aman/test_pipeline/get_properties.py
diff --git a/training_log b/gsoc/aman/training_log
similarity index 100%
rename from training_log
rename to gsoc/aman/training_log
diff --git a/split_in_train_dev_test.py b/split_in_train_dev_test.py
old mode 100644
new mode 100755
index b6f04d9..eee057d
--- a/split_in_train_dev_test.py
+++ b/split_in_train_dev_test.py
@@ -1,3 +1,15 @@
+#!/usr/bin/env python
+"""
+
+Neural SPARQL Machines - Split into train, dev, and test sets.
+
+'SPARQL as a Foreign Language' by Tommaso Soru and Edgard Marx et al., SEMANTiCS 2017
+https://w3id.org/neural-sparql-machines/soru-marx-semantics2017.html
+https://arxiv.org/abs/1708.07624
+
+Version 0.0.4
+
+"""
 import argparse
 import random
 import os
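
`split_in_train_dev_test.py` above performs the 80-10-10 split mentioned in the README. A minimal sketch of such a split — the input file name is hypothetical, and the real script is driven by its `--lines` and `--dataset` arguments:

```python
import random

def split_80_10_10(lines, seed=42):
    # Shuffle indices once, then cut 80/10/10; reusing the same seed on
    # the parallel .en file keeps both sides line-aligned.
    idx = list(range(len(lines)))
    random.Random(seed).shuffle(idx)
    n_train = int(0.8 * len(idx))
    n_dev = int(0.1 * len(idx))
    train = [lines[i] for i in idx[:n_train]]
    dev = [lines[i] for i in idx[n_train:n_train + n_dev]]
    test = [lines[i] for i in idx[n_train + n_dev:]]
    return train, dev, test

with open("data_300.sparql") as f:  # hypothetical input file
    train, dev, test = split_80_10_10(f.readlines())
```
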