diff --git a/.circleci/config.yml b/.circleci/config.yml index f5008ccb..49c8fab3 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -23,7 +23,7 @@ jobs: - run: unzip grakn-core-1.4.3.zip - run: nohup grakn-core-1.4.3/grakn server start - run: grakn-core-1.4.3/graql console -k test_schema -f kglib/kgcn/test_data/schema.gql - - run: bazel test //... --test_output=streamed --force_python PY3 --python_path $(which python) + - run: bazel test //kglib/... --test_output=streamed --force_python PY3 --python_path $(which python) test-deploy-pip: machine: true @@ -37,7 +37,17 @@ jobs: - run: date +%s > VERSION - run: cat VERSION - run: bazel run //:deploy-pip -- test $PYPI_TEST_SERVER_USERNAME $PYPI_TEST_SERVER_PASSWORD - - run: pip install --extra-index-url https://test.pypi.org/simple/ grakn-kglib + + end-to-end-test: + machine: true + working_directory: ~/kglib + steps: + - checkout + - bazel_install + - run: sudo apt-get update + - run: pyenv install 3.6.3 + - run: pyenv global 3.6.3 + - run: bazel test //examples:test_pypi_end_to_end_test --test_output=streamed --force_python PY3 --python_path $(which python) --spawn_strategy=standalone deploy-git: machine: true @@ -71,10 +81,13 @@ workflows: - test-deploy-pip: requires: - test + - end-to-end-test: + requires: + - test-deploy-pip - approve-deploy-git: type: approval requires: - - test-deploy-pip + - end-to-end-test - deploy-git: requires: - approve-deploy-git diff --git a/.gitignore b/.gitignore index 8670dbf5..7a370252 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ # Bazel files bazel-* +# IntelliJ Bazel project +.ijwb/ + # Compiled class file *.class diff --git a/BUILD b/BUILD index 42eaee36..98f0d203 100644 --- a/BUILD +++ b/BUILD @@ -59,154 +59,5 @@ deploy_pip( deployment_requirement("webencodings"), deployment_requirement("six"), ], - target = ":kglib" + target = "//kglib:kglib" ) - -py_test( - name = "ordered_test", - srcs = [ - "kglib/kgcn/neighbourhood/data/sampling/ordered_test.py" - ], 
- deps = [ - "kglib" - ], -) - -py_test( - name = "random_sampling_test", - srcs = [ - "kglib/kgcn/neighbourhood/data/sampling/random_sampling_test.py" - ], - deps = [ - "kglib", - ] -) - -py_test( - name = "label_extraction_test", - srcs = [ - "kglib/kgcn/use_cases/attribute_prediction/label_extraction_test.py" - ], - deps = [ - "kglib", - ] -) - -py_test( - name = "metrics_test", - srcs = [ - "kglib/kgcn/models/metrics_test.py" - ], - deps = [ - "kglib", - ] -) - -py_test( - name = "tf_hub_test", - srcs = [ - "kglib/kgcn/encoder/tf_hub_test.py" - ], - deps = [ - "kglib", - ] -) - -py_test( - name = "schema_test", - srcs = [ - "kglib/kgcn/encoder/schema_test.py" - ], - deps = [ - "kglib", - ] -) - -py_test( - name = "encode_test", - srcs = [ - "kglib/kgcn/encoder/encode_test.py" - ], - deps = [ - "kglib", - ] -) - -py_test( - name = "boolean_test", - srcs = [ - "kglib/kgcn/encoder/boolean_test.py" - ], - deps = [ - "kglib", - ] -) - -py_test( - name = "data_traversal_test", - main = "traversal_test.py", - srcs = [ - "kglib/kgcn/neighbourhood/data/traversal_test.py" - ], - deps = [ - "kglib", - ] -) - -py_test( - name = "data_executor_test", - main = "executor_test.py", - srcs = [ - "kglib/kgcn/neighbourhood/data/executor_test.py" - ], - deps = [ - "kglib", - ] -) - -py_test( - name = "schema_traversal_test", - main = "traversal_test.py", - srcs = [ - "kglib/kgcn/neighbourhood/schema/traversal_test.py" - ], - deps = [ - "kglib", - ] -) - -py_test( - name = "raw_array_builder_test", - srcs = [ - "kglib/kgcn/preprocess/raw_array_builder_test.py" - ], - deps = [ - "kglib", - ] -) - -py_library( - name = "kglib", - srcs = glob(['kglib/__init__.py', 'kglib/kgcn/**/*.py']), - deps = [ - # Grakn deps - requirement('grakn'), - requirement('grpcio'), - - # TensorFlow deps - requirement('tensorflow'), - requirement('numpy'), - requirement('protobuf'), - requirement('six'), - requirement('absl-py'), - requirement('keras_applications'), - requirement('keras_preprocessing'), - 
requirement('gast'), - requirement('astor'), - requirement('termcolor'), - - requirement('tensorflow-hub'), - requirement('scikit-learn'), - requirement('scipy'), - ] -) \ No newline at end of file diff --git a/README.md b/README.md index 3305a6c6..34ad55ad 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Research This repository is the centre of all research projects conducted at Grakn Labs. In particular, it's focus is on the integration of machine learning with the Grakn knowledge graph. -At present this repo contains one project: [*Knowledge Graph Convolutional Networks* (KGCNs)](/kglib/kgcn). +At present this repo contains one project: [*Knowledge Graph Convolutional Networks* (KGCNs)](https://github.com/graknlabs/kglib/tree/master/kglib/kgcn). diff --git a/VERSION b/VERSION index ceab6e11..b7161198 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1 \ No newline at end of file +0.1a1 \ No newline at end of file diff --git a/WORKSPACE b/WORKSPACE index 49e7ef3f..2c8c13d4 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -14,7 +14,7 @@ git_repository( git_repository( name="graknlabs_bazel_distribution", remote="https://github.com/graknlabs/bazel-distribution", - commit="2e932a2555d1e43f75c8ee676c926399bd12f240" + commit="ebc9ae9e6d4ef0086d1c6731bf6f5f8a8f40b509" ) ## Only needed for PIP support: @@ -32,7 +32,16 @@ pip_install() pip3_import( name = "pypi_deployment_dependencies", - requirements = "//:deployment/requirements.txt", + requirements = "@graknlabs_bazel_distribution//pip:requirements.txt" ) load("@pypi_deployment_dependencies//:requirements.bzl", "pip_install") -pip_install() \ No newline at end of file +pip_install() + + +load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_file") + +http_file( + name = "animaltrade_dist", + urls = ["https://github.com/graknlabs/kglib/releases/download/v0.1a1/grakn-animaltrade.zip", # TODO How to update to the latest release each time? 
+ ] +) \ No newline at end of file diff --git a/deployment.properties b/deployment.properties index f708ee79..43d2e19c 100644 --- a/deployment.properties +++ b/deployment.properties @@ -19,7 +19,7 @@ github.repository=grakn maven.repository-url.snapshot=http://maven.grakn.ai/nexus/content/repositories/snapshots/ maven.repository-url.release=http://maven.grakn.ai/nexus/content/repositories/releases/ -pip.repository-url.pypi=https://pypi.org/legacy/ +pip.repository-url.pypi=https://upload.pypi.org/legacy/ pip.repository-url.test=https://test.pypi.org/legacy/ npm.repository-url=https://registry.npmjs.org/ maven.packages=common,server,console,protocol,client-java \ No newline at end of file diff --git a/deployment/requirements.txt b/deployment/requirements.txt deleted file mode 100644 index 039eb9b2..00000000 --- a/deployment/requirements.txt +++ /dev/null @@ -1,16 +0,0 @@ -Pygments==2.3.1 -bleach==3.1.0 -certifi==2018.11.29 -chardet==3.0.4 -docutils==0.14 -idna==2.8 -pkginfo==1.5.0.1 -readme-renderer==24.0 -requests-toolbelt==0.8.0 -requests==2.21.0 -setuptools>=38.6.0 -tqdm==4.29.1 -twine==1.12.1 -urllib3==1.24.1 -webencodings==0.5.1 -wheel==0.32.3 \ No newline at end of file diff --git a/examples/BUILD b/examples/BUILD new file mode 100644 index 00000000..cee59bfa --- /dev/null +++ b/examples/BUILD @@ -0,0 +1,61 @@ +load("@io_bazel_rules_python//python:python.bzl", "py_library", "py_test") +load("@pypi_dependencies//:requirements.bzl", "requirement") + +py_test( + name = "test_pypi_end_to_end_test", + main = "end_to_end_test.py", + srcs = [ + "kgcn/animal_trade/test/end_to_end_test.py" + ], + deps = [ + "test-pypi-kglib", + ], + data = [ + "@animaltrade_dist//file", + ] +) + +py_test( + name = "local_end_to_end_test", + main = "end_to_end_test.py", + srcs = [ + "kgcn/animal_trade/test/end_to_end_test.py" + ], + deps = [ + "//kglib:kglib", + ], + data = [ + "@animaltrade_dist//file", + ] +) + + +py_library( + name = "test-pypi-kglib", + srcs = [ + 
"kgcn/animal_trade/test/end_to_end_test.py" + ], + deps = [ + requirement('grakn-kglib'), + + # Grakn deps + requirement('grakn'), + requirement('grpcio'), + + # TensorFlow deps + requirement('tensorflow'), + requirement('numpy'), + requirement('protobuf'), + requirement('six'), + requirement('absl-py'), + requirement('keras_applications'), + requirement('keras_preprocessing'), + requirement('gast'), + requirement('astor'), + requirement('termcolor'), + + requirement('tensorflow-hub'), + requirement('scikit-learn'), + requirement('scipy'), + ] +) \ No newline at end of file diff --git a/examples/kgcn/animal_trade/README.md b/examples/kgcn/animal_trade/README.md new file mode 100644 index 00000000..887d06bf --- /dev/null +++ b/examples/kgcn/animal_trade/README.md @@ -0,0 +1,75 @@ +# KGCN Example - CITES Animal Trade Data + +### Quickstart + +**Requirements:** + +- Python 3.6.3 < version < 3.7 ([tensorflow doesn't yet support Python 3.7](https://github.com/tensorflow/tensorflow/issues/17022)) +- kglib installed from pip: `pip install --extra-index-url https://test.pypi.org/simple/ grakn-kglib` +- The source code in order to access the example `git clone https://github.com/graknlabs/kglib.git` +- The `grakn-animaltrade.zip` dataset from the [latest release](https://github.com/graknlabs/kglib/releases/latest). This is a dataset that has been pre-loaded into Grakn v1.5 (so you don't have to run the data import yourself), with two keyspaces: `animaltrade_train` and `animaltrade_test` + +**To use:** + +- Prepare the data: + + - If you already have an instance of Grakn running, make sure to stop it using `./grakn server stop` + + - Download `grakn-animaltrade.zip` from the [latest release](https://github.com/graknlabs/kglib/releases/latest). 
This is a Grakn distribution, pre-loaded with the CITES dataset + + - Unzip the distribution `unzip grakn-animaltrade.zip`, where you store this doesn't matter + + - cd into the distribution `cd grakn-animaltrade` + + - start Grakn `./grakn server start` + + - Confirm that the training keyspace is present and contains data + + `./grakn console -k animaltrade_train` + + `match $t isa traded-item; limit 1; get;` + + and then `exit` + +- Run the `main` function of the example: + + Navigate to the root of the `kglib` repo: `cd kglib` + + Run the example: `python3 -m examples.kgcn.animal_trade.main` + + This will run the full pipeline: retrieving data, building and training a KGCN classifier + +#### Details + +The CITES dataset details exchanges of animal-based products between countries. In this example we aim to predict the value of `appendix` for a set of samples. This `appendix` can be thought of as the level of endangerment that a `traded-item` is subject to, where `1` represents the highest level of endangerment, and `3` the lowest. + +The [main](main.py) function will: + +- Search Grakn for 30 concepts (with attributes as labels) to use as the training set, 30 for the evaluation set, and 30 for the prediction set using queries such as (limiting the returned stream): + + ``` + match $e(exchanged-item: $traded-item) isa exchange, has appendix $appendix; $appendix 1; get; + ``` + + This searches for an `exchange` between countries that has an `appendix` (endangerment level) of `1`, and finds the `traded-item` that was exchanged + +- Save those labelled samples to file + +- Delete all `appendix` attributes from both `animaltrade_train` and `animaltrade_test` keyspaces. This is the label we will predict in this example, so it should not be present in Grakn otherwise the network can cheat + +- Search Grakn for the k-hop neighbours of the selected examples, and store information about them as arrays, denoted in the code as `raw_arrays`. 
This data is saved to file so that subsequent steps can be re-run without recomputing these data + +- Build the TensorFlow computation graph using `model.KGCN`, including a multi-class classification step and learning procedure defined by `downstream.SupervisedKGCNClassifier` + +- Feed the `raw_arrays` to the TensorFlow graph, and perform learning + +##### Re-training the model +Re-running the `main` function will make use of the `feed_dicts` previously saved to file (at `dataset/10_concepts/input`), and so will repeat `classifier.train(feed_dicts[TRAIN])`, `classifier.eval(feed_dicts[EVAL])` and `classifier.eval(feed_dicts[PREDICT])` over the exact same data as previously retrieved. Therefore, to play with the learning parameters, do so and then simply re-run `main`. + +##### Re-generating the `feed_dicts` +To re-generate the `feed_dicts`, delete the saved files in `dataset/10_concepts/input`. + +##### Picking new samples +To pick different sample concepts to use for training/evaluation/prediction you need to: +- Force the `feed-dict`s to re-generate by deleting the saved files (as above) +- Use a fresh version of `grakn-animaltrade`, since the present one has had the supervised labels deleted! 
\ No newline at end of file diff --git a/kglib/kgcn/examples/animal_trade/main.py b/examples/kgcn/animal_trade/main.py similarity index 100% rename from kglib/kgcn/examples/animal_trade/main.py rename to examples/kgcn/animal_trade/main.py diff --git a/kglib/kgcn/examples/animal_trade/prediction_schema.gql b/examples/kgcn/animal_trade/prediction_schema.gql similarity index 77% rename from kglib/kgcn/examples/animal_trade/prediction_schema.gql rename to examples/kgcn/animal_trade/prediction_schema.gql index 787f283b..5548397a 100644 --- a/kglib/kgcn/examples/animal_trade/prediction_schema.gql +++ b/examples/kgcn/animal_trade/prediction_schema.gql @@ -18,9 +18,9 @@ # define -endangerment-level sub attribute datatype long; -kgcn-model-version sub attribute datatype double; -prediction-score sub attribute datatype double; +endangerment-level sub attribute, datatype long; +kgcn-model-version sub attribute, datatype double; +prediction-score sub attribute, datatype double; traded-item has endangerment-level; @@ -36,11 +36,13 @@ kgcn-model sub entity, define @has-endangerment-level plays predicted-value; + + insert -$kgcn isa kgcn-model has kgcn-model-version 0.1; +$kgcn isa kgcn-model, has kgcn-model-version 0.1; -$t1 id V630904, has endangerment-level $el1 via $r1; $el1 1; (predicted-value: $r1, predicting-kgcn-model: $kgcn) isa value-prediction, has prediction-score 0.87; -$t2 id V704688, has endangerment-level $el2 via $r2; $el2 1; (predicted-value: $r2, predicting-kgcn-model: $kgcn) isa value-prediction, has prediction-score 0.71; +$t1 id V1282192, has endangerment-level $el1 via $r1; $el1 1; (predicted-value: $r1, predicting-kgcn-model: $kgcn) isa value-prediction, has prediction-score 0.87; +$t2 id V1364112, has endangerment-level $el2 via $r2; $el2 1; (predicted-value: $r2, predicting-kgcn-model: $kgcn) isa value-prediction, has prediction-score 0.71; match $t1 isa traded-item, has endangerment-level $el1 via $r1; $el1 1; $vp1(predicted-value: $r1, 
predicting-kgcn-model: $kgcn) isa value-prediction, has prediction-score $s1; get; @@ -60,6 +62,6 @@ when { $ti isa traded-item, has endangerment-level $el; $el 1; $ti has item-type $type; $type "meat"; $e(exchanged-item: $ti) isa exchange; -} then { +}, then { (suspicious-activity: $e, cause-of-suspicion: $type, cause-of-suspicion: $el) isa suspicious-activity-detection; }; diff --git a/kglib/kgcn/examples/animal_trade/schema.gql b/examples/kgcn/animal_trade/schema.gql similarity index 98% rename from kglib/kgcn/examples/animal_trade/schema.gql rename to examples/kgcn/animal_trade/schema.gql index e02c0b84..d89ba9ec 100644 --- a/kglib/kgcn/examples/animal_trade/schema.gql +++ b/examples/kgcn/animal_trade/schema.gql @@ -136,8 +136,8 @@ define relates originated-species; taxon-membership sub relationship, - relates member-item, - relates taxonomic-group; + relates member-item, + relates taxonomic-group; taxonomic-ranking when { diff --git a/examples/kgcn/animal_trade/test/end_to_end_test.py b/examples/kgcn/animal_trade/test/end_to_end_test.py new file mode 100644 index 00000000..4b2d9d74 --- /dev/null +++ b/examples/kgcn/animal_trade/test/end_to_end_test.py @@ -0,0 +1,145 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +import os +import subprocess as sub +import time +import unittest + +import grakn +import tensorflow as tf + +import kglib.kgcn.management.grakn as grakn_mgmt +import kglib.kgcn.management.samples as samp_mgmt +import kglib.kgcn.models.downstream as downstream +import kglib.kgcn.models.model as model +import kglib.kgcn.neighbourhood.data.sampling.random_sampling as random_sampling + +flags = tf.app.flags +FLAGS = flags.FLAGS + +# Learning params +flags.DEFINE_float('learning_rate', 0.01, 'Learning rate') +flags.DEFINE_integer('num_classes', 3, 'Number of classes') +flags.DEFINE_integer('features_length', 198, 'Number of features after encoding') +flags.DEFINE_integer('starting_concepts_features_length', 173, + 'Number of features after encoding for the nodes of interest, which excludes the features for ' + 'role_type and role_direction') +flags.DEFINE_integer('aggregated_length', 20, 'Length of aggregated representation of neighbours, a hidden dimension') +flags.DEFINE_integer('output_length', 32, 'Length of the output of "combine" operation, taking place at each depth, ' + 'and the final length of the embeddings') +flags.DEFINE_integer('max_training_steps', 50, 'Max number of gradient steps to take during gradient descent') + +# Sample selection params +EXAMPLES_QUERY = 'match $e(exchanged-item: $traded-item) isa exchange, has appendix $appendix; $appendix {}; get;' +LABEL_ATTRIBUTE_TYPE = 'appendix' +ATTRIBUTE_VALUES = [1, 2, 3] +EXAMPLE_CONCEPT_TYPE = 'traded-item' + +NUM_PER_CLASS = 5 +POPULATION_SIZE_PER_CLASS = 100 + +# Params for persisting to files +DIR = os.path.dirname(os.path.realpath(__file__)) +TIMESTAMP = time.strftime("%Y-%m-%d_%H-%M-%S") +# BASE_PATH = f'{DIR}/dataset/{NUM_PER_CLASS}_concepts/' +# flags.DEFINE_string('log_dir', BASE_PATH + 'out/out_' + TIMESTAMP, 'directory to use to store data from training') + +# SAVED_LABELS_PATH = BASE_PATH + 'labels/labels_{}.p' + +TRAIN = 'train' +EVAL = 'eval' +PREDICT = 'predict' + +KEYSPACES = { + 
TRAIN: "animaltrade_train", + EVAL: "animaltrade_train", + PREDICT: "animaltrade_train", +} + +URI = "localhost:48555" + +NEIGHBOUR_SAMPLE_SIZES = (2, 1) + + +class TestEndToEnd(unittest.TestCase): + + def test_end_to_end(self): + # Unzip the Grakn distribution containing our data + sub.run(['unzip', 'external/animaltrade_dist/file/downloaded', '-d', + 'external/animaltrade_dist/file/downloaded-unzipped']) + + # Start Grakn + sub.run(['external/animaltrade_dist/file/downloaded-unzipped/grakn-animaltrade/grakn', 'server', 'start']) + + modes = (TRAIN, EVAL) + + client = grakn.Grakn(uri=URI) + sessions = grakn_mgmt.get_sessions(client, KEYSPACES) + transactions = grakn_mgmt.get_transactions(sessions) + + batch_size = buffer_size = NUM_PER_CLASS * FLAGS.num_classes + kgcn = model.KGCN(NEIGHBOUR_SAMPLE_SIZES, + FLAGS.features_length, + FLAGS.starting_concepts_features_length, + FLAGS.aggregated_length, + FLAGS.output_length, + transactions[TRAIN], + batch_size, + buffer_size, + sampling_method=random_sampling.random_sample, + sampling_limit_factor=4 + ) + + optimizer = tf.train.GradientDescentOptimizer(learning_rate=FLAGS.learning_rate) + classifier = downstream.SupervisedKGCNClassifier(kgcn, optimizer, FLAGS.num_classes, None, + max_training_steps=FLAGS.max_training_steps) + + feed_dicts = {} + + sampling_params = { + TRAIN: {'sample_size': NUM_PER_CLASS, 'population_size': POPULATION_SIZE_PER_CLASS}, + EVAL: {'sample_size': NUM_PER_CLASS, 'population_size': POPULATION_SIZE_PER_CLASS}, + PREDICT: {'sample_size': NUM_PER_CLASS, 'population_size': POPULATION_SIZE_PER_CLASS}, + } + concepts, labels = samp_mgmt.compile_labelled_concepts(EXAMPLES_QUERY, EXAMPLE_CONCEPT_TYPE, + LABEL_ATTRIBUTE_TYPE, ATTRIBUTE_VALUES, + transactions[TRAIN], transactions[PREDICT], + sampling_params) + + for mode in modes: + feed_dicts[mode] = classifier.get_feed_dict(sessions[mode], concepts[mode], labels=labels[mode]) + + # Train + if TRAIN in modes: + print("\n\n********** TRAIN Keyspace 
**********") + classifier.train(feed_dicts[TRAIN]) + + # Eval + if EVAL in modes: + print("\n\n********** EVAL Keyspace **********") + # Presently, eval keyspace is the same as the TRAIN keyspace + classifier.eval(feed_dicts[EVAL]) + + grakn_mgmt.close(sessions) + grakn_mgmt.close(transactions) + + +if __name__ == "__main__": + unittest.main() diff --git a/kglib/BUILD b/kglib/BUILD new file mode 100644 index 00000000..3ed6ce19 --- /dev/null +++ b/kglib/BUILD @@ -0,0 +1,153 @@ +load("@io_bazel_rules_python//python:python.bzl", "py_library", "py_test") +load("@pypi_dependencies//:requirements.bzl", "requirement") + + +py_test( + name = "ordered_test", + srcs = [ + "kgcn/neighbourhood/data/sampling/ordered_test.py" + ], + deps = [ + "kglib" + ], +) + +py_test( + name = "random_sampling_test", + srcs = [ + "kgcn/neighbourhood/data/sampling/random_sampling_test.py" + ], + deps = [ + "kglib", + ] +) + +py_test( + name = "label_extraction_test", + srcs = [ + "kgcn/use_cases/attribute_prediction/label_extraction_test.py" + ], + deps = [ + "kglib", + ] +) + +py_test( + name = "metrics_test", + srcs = [ + "kgcn/models/metrics_test.py" + ], + deps = [ + "kglib", + ] +) + +py_test( + name = "tf_hub_test", + srcs = [ + "kgcn/encoder/tf_hub_test.py" + ], + deps = [ + "kglib", + ] +) + +py_test( + name = "schema_test", + srcs = [ + "kgcn/encoder/schema_test.py" + ], + deps = [ + "kglib", + ] +) + +py_test( + name = "encode_test", + srcs = [ + "kgcn/encoder/encode_test.py" + ], + deps = [ + "kglib", + ] +) + +py_test( + name = "boolean_test", + srcs = [ + "kgcn/encoder/boolean_test.py" + ], + deps = [ + "kglib", + ] +) + +py_test( + name = "data_traversal_test", + main = "traversal_test.py", + srcs = [ + "kgcn/neighbourhood/data/traversal_test.py" + ], + deps = [ + "kglib", + ] +) + +py_test( + name = "data_executor_test", + main = "executor_test.py", + srcs = [ + "kgcn/neighbourhood/data/executor_test.py" + ], + deps = [ + "kglib", + ] +) + +py_test( + name = 
"schema_traversal_test", + main = "traversal_test.py", + srcs = [ + "kgcn/neighbourhood/schema/traversal_test.py" + ], + deps = [ + "kglib", + ] +) + +py_test( + name = "raw_array_builder_test", + srcs = [ + "kgcn/preprocess/raw_array_builder_test.py" + ], + deps = [ + "kglib", + ] +) + +py_library( + name = "kglib", + srcs = glob(['__init__.py', 'kgcn/**/*.py']), + deps = [ + # Grakn deps + requirement('grakn'), + requirement('grpcio'), + + # TensorFlow deps + requirement('tensorflow'), + requirement('numpy'), + requirement('protobuf'), + requirement('six'), + requirement('absl-py'), + requirement('keras_applications'), + requirement('keras_preprocessing'), + requirement('gast'), + requirement('astor'), + requirement('termcolor'), + + requirement('tensorflow-hub'), + requirement('scikit-learn'), + requirement('scipy'), + ], + visibility=['//visibility:public'] +) \ No newline at end of file diff --git a/kglib/kgcn/README.md b/kglib/kgcn/README.md index fa542c22..185b20fd 100644 --- a/kglib/kgcn/README.md +++ b/kglib/kgcn/README.md @@ -2,84 +2,68 @@ This project introduces a novel model: the *Knowledge Graph Convolutional Network* (KGCN). The principal idea of this work is to forge a bridge between knowledge graphs and machine learning, using [Grakn](https://github.com/graknlabs/grakn) as the knowledge graph. A KGCN can be used to create vector representations, *embeddings*, of any labelled set of Grakn Concepts via supervised learning. As a result, a KGCN can be trained directly for the classification or regression of Concepts stored in Grakn. Future work will include building embeddings via unsupervised learning.![KGCN Process](readme_images/KGCN_process.png) - - -## Methodology - -The ideology behind this project is described [here](https://blog.grakn.ai/knowledge-graph-convolutional-networks-machine-learning-over-reasoned-knowledge-9eb5ce5e0f68). 
The principles of the implementation are based on [GraphSAGE](http://snap.stanford.edu/graphsage/), from the Stanford SNAP group, made to work over a **knowledge graph**. Instead of working on a typical property graph, a KGCN learns from the context of a *typed hypergraph*, **Grakn**. Additionally, it learns from facts deduced by Grakn's *automated logical reasoner*. From this point onwards some understanding of [Grakn's docs](http://dev.grakn.ai) is assumed. - -#### How do KGCNs work? - -The purpose of this method is to derive embeddings for a set of Concepts (and thereby directly learn to classify them). We start by querying Grakn to find a set of labelled examples. Following that, we gather data about the neighbourhood of each example Concept. We do this by considering their *k-hop* neighbours. - -![k-hop neighbours](readme_images/k-hop_neighbours.png)We retrieve the data concerning this neighbourhood from Grakn. This information includes the *type hierarchy*, *roles*, and *attribute* values of each neighbouring Concept encountered. - -To create embeddings, we build a network in TensorFlow that successively aggregates and combines features from the K hops until a 'summary' representation remains - an embedding. In our example these embeddings are directly optimised to perform multi-class classification. This is achieved by passing the embeddings to a single subsequent dense layer and determining loss via softmax cross entropy with the labels retrieved. - -![Aggregation and Combination process](readme_images/aggregate_and_combine.png) - - - -## Usage by example - CITES Animal Trade Data - -### Quickstart +## Quickstart **Requirements:** -- Python 3.6.3 or higher -- kglib installed from pip: `pip install --extra-index-url https://test.pypi.org/simple/ grakn-kglib` -- The `grakn-animaltrade.zip` dataset from the [latest release](https://github.com/graknlabs/kglib/releases/latest). 
This is a dataset that has been pre-loaded into Grakn v1.5 (so you don't have to run the data import yourself), with two keyspaces: `animaltrade_train` and `animaltrade_test`. - -**To use:** +- Python 3.6.3 < version < 3.7 ([tensorflow doesn't yet support Python 3.7](https://github.com/tensorflow/tensorflow/issues/17022)) -- Prepare the data: +- kglib installed from pip: `pip install --extra-index-url https://test.pypi.org/simple/ grakn-kglib` - - If you already have an insatnce of Grakn running, make sure to stop it using `./grakn server stop` - - - Download the pre-loaded Grakn distribution from the [latest release](https://github.com/graknlabs/kglib/releases/latest) +### Usage - - Unzip the distribution `unzip grakn-animaltrade.zip `, where you store this doesn't matter +The following is a template of what must be defined in order to instantiate a KGCN, optimised for a downstream learning task of multi-class classification: - - cd into the distribution `cd grakn-animaltrade` - - - start Grakn `./grakn server start` +```python +import kglib.kgcn.models as models +import tensorflow as tf +import grakn - - Confirm that the training keyspace is present and contains data +URI = "localhost:48555" - `./grakn console -k animaltrade_train` +client = grakn.Grakn(uri=URI) +session = client.session(keyspace=training_keyspace) +transaction = session.transaction(grakn.TxType.WRITE) - `match $t isa traded-item; limit 1; get;` +kgcn = models.model.KGCN(neighbour_sample_sizes, + features_length, + starting_concepts_features_length, + aggregated_length, + output_length, + transaction, + batch_size, + buffer_size + ) - and then `exit` +optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) +classifier = models.downstream.SupervisedKGCNClassifier(kgcn, optimizer, num_classes, log_dir, + max_training_steps=max_training_steps) -- Run the `main` function of the example: +training_feed_dict = classifier.get_feed_dict(session, training_concepts, 
labels=training_labels) - `cd kglib` +classifier.train(training_feed_dict) - `python3 -m kglib.kgcn.examples.animal_trade.main` +transaction.close() +session.close() +``` - This will run the full pipeline: retrieving data, building and training a KGCN classifier +There is also a [full example](https://github.com/graknlabs/kglib/tree/master/examples/kgcn/animal_trade) which outlines retrieving sample concepts with labels and working with separate keyspaces for training and testing. -#### Details +## Methodology -The CITES dataset details exchanges of animal-based products between countries. In this example we aim to predict the value of `appendix` for a set of samples. This `appendix` can be thought of as the level of endangerment that a `traded-item` is subject to, where `1` represents the highest level of endangerment, and `3` the lowest. +The ideology behind this project is described [here](https://blog.grakn.ai/knowledge-graph-convolutional-networks-machine-learning-over-reasoned-knowledge-9eb5ce5e0f68), and a [video of the presentation](https://youtu.be/Jx_Twc75ka0?t=368). The principles of the implementation are based on [GraphSAGE](http://snap.stanford.edu/graphsage/), from the Stanford SNAP group, made to work over a **knowledge graph**. Instead of working on a typical property graph, a KGCN learns from the context of a *typed hypergraph*, **Grakn**. Additionally, it learns from facts deduced by Grakn's *automated logical reasoner*. From this point onwards some understanding of [Grakn's docs](http://dev.grakn.ai) is assumed. -The [main](examples/animal_trade/main.py) function will: +#### How do KGCNs work? -- Search Grakn for 30 concepts (with attributes as labels) to use as the training set, 30 for the evaluation set, and 30 for the prediction set using queries such as (limiting the returned stream): +The purpose of this method is to derive embeddings for a set of Concepts (and thereby directly learn to classify them). 
We start by querying Grakn to find a set of labelled examples. Following that, we gather data about the neighbourhood of each example Concept. We do this by considering their *k-hop* neighbours. - ``` - match $e(exchanged-item: $traded-item) isa exchange, has appendix $appendix; $appendix 1; get; - ``` +![k-hop neighbours](readme_images/k-hop_neighbours.png)We retrieve the data concerning this neighbourhood from Grakn. This information includes the *type hierarchy*, *roles*, and *attribute* values of each neighbouring Concept encountered. - This searches for an `exchange` between countries that has an `appendix` (endangerment level) of `1`, and finds the `traded-item` that was exchanged +To create embeddings, we build a network in TensorFlow that successively aggregates and combines features from the K hops until a 'summary' representation remains - an embedding. In our example these embeddings are directly optimised to perform multi-class classification. This is achieved by passing the embeddings to a single subsequent dense layer and determining loss via softmax cross entropy with the labels retrieved. -- Save those labelled samples to file +![Aggregation and Combination process](readme_images/aggregate_and_combine.png) -- Delete all `appendix` attributes from both `animaltrade_train` and `animaltrade_test` keyspaces. This is the label we will predict in this example, so it should not be present in Grakn otherwise the network can cheat -- Search Grakn for the k-hop neighbours of the selected examples, and store information about them as arrays, demoted in the code as `raw_arrays`. 
This data is saved to file so that subsequent steps can be re-run without recomputing these data -- Build the TensorFlow computation graph using `model.KGCN`, including a multi-class classification step and learning procedure defined by `downstream.SupervisedKGCNClassifier` + -- Feed the `raw_arrays` to the TensorFlow graph, and performs learning \ No newline at end of file diff --git a/kglib/kgcn/examples/toy/main.py b/kglib/kgcn/examples/toy/main.py deleted file mode 100644 index ebad6dff..00000000 --- a/kglib/kgcn/examples/toy/main.py +++ /dev/null @@ -1,107 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# - -import time - -import grakn -import numpy as np -import tensorflow as tf - -import kglib.kgcn.models.model as model -import kglib.kgcn.neighbourhood.data.sampling.ordered as ordered -import kglib.kgcn.neighbourhood.data.sampling.sampler as samp -import kgcn.neighbourhood.schema.strategy as schema_strat - -flags = tf.app.flags -FLAGS = flags.FLAGS - -flags.DEFINE_boolean('debug', False, 'Enable debugging') -flags.DEFINE_float('learning_rate', 0.05, 'Learning rate') -flags.DEFINE_integer('classes_length', 3, 'Number of classes') -flags.DEFINE_integer('features_length', 192, 'Number of features after encoding') -flags.DEFINE_integer('starting_concepts_features_length', 4, ## 143, - 'Number of features after encoding for the nodes of interest, which excludes the features for ' - 'role_type and role_direction') -flags.DEFINE_integer('aggregated_length', 4, 'Length of aggregated representation of neighbours, a hidden dimension') -flags.DEFINE_integer('output_length', 4, 'Length of the output of "combine" operation, taking place at each depth, ' - 'and the final length of the embeddings') - -flags.DEFINE_integer('max_training_steps', 2500, 'Max number of gradient steps to take during gradient descent') - -TIMESTAMP = time.strftime("%Y-%m-%d_%H-%M-%S") -flags.DEFINE_string('log_dir', './out/out_' + TIMESTAMP, 'directory to use to store data from training') - - -def main(): - keyspace = 'toy' - uri = "localhost:48555" - - client = grakn.Grakn(uri=uri) - train_session = client.session(keyspace=keyspace) - tx = train_session.transaction(grakn.TxType.WRITE) - - label_types = ['A', 'B', 'C'] - - concepts = [] - labels = [] - - for label_type in label_types: - - target_concept_query = f"match $x($label) isa example; $label isa {label_type}; get;" - - answers = tx.query(target_concept_query) - new_concepts = [ans.get('x') for ans in answers] - - base_label = [0, 0, 0] - base_label[label_types.index(label_type)] = 1 - - concepts += new_concepts - labels += [base_label for _ 
in new_concepts] - - labels = np.array(labels, dtype=np.float32) - - neighbour_sample_sizes = (1,) - - sampling_method = ordered.ordered_sample - - samplers = [] - for sample_size in neighbour_sample_sizes: - samplers.append(samp.Sampler(sample_size, sampling_method, limit=sample_size + 1)) - - # Strategies - role_schema_strategy = schema_strat.SchemaRoleTraversalStrategy(include_implicit=False, include_metatypes=False) - thing_schema_strategy = schema_strat.SchemaThingTraversalStrategy(include_implicit=False, include_metatypes=False) - - traversal_strategies = {'role': role_schema_strategy, - 'thing': thing_schema_strategy} - - kgcn = model.KGCN(tx, traversal_strategies, samplers, features_to_exclude=['neighbour_data_type', - 'neighbour_value_long', - 'neighbour_value_double', - 'neighbour_value_boolean', - 'neighbour_value_date', - 'neighbour_value_string']) - - kgcn.train(tx, concepts, labels) - kgcn.evaluate(tx, concepts, labels) - # kgcn.predict(tx, concepts) - - -if __name__ == "__main__": - main() diff --git a/kglib/kgcn/examples/toy/schema.gql b/kglib/kgcn/examples/toy/schema.gql deleted file mode 100644 index 6f5d581b..00000000 --- a/kglib/kgcn/examples/toy/schema.gql +++ /dev/null @@ -1,39 +0,0 @@ -define -example sub relationship, - relates label-a, - relates label-b, - relates label-c; - -A sub entity, - plays label-a; - -B sub entity, - plays label-b; - -C sub entity, - plays label-c; - -insert -$e1(label-a: $a) isa example; $a isa A; -$e2(label-b: $b) isa example; $b isa B; -$e3(label-c: $c) isa example; $c isa C; - -insert -$e1(label-a: $a) isa example; $a isa A; -$e2(label-b: $b) isa example; $b isa B; -$e3(label-c: $c) isa example; $c isa C; - -insert -$e1(label-a: $a) isa example; $a isa A; -$e2(label-b: $b) isa example; $b isa B; -$e3(label-c: $c) isa example; $c isa C; - -insert -$e1(label-a: $a) isa example; $a isa A; -$e2(label-b: $b) isa example; $b isa B; -$e3(label-c: $c) isa example; $c isa C; - -insert -$e1(label-a: $a) isa example; $a 
isa A; -$e2(label-b: $b) isa example; $b isa B; -$e3(label-c: $c) isa example; $c isa C; diff --git a/kglib/kgcn/models/downstream.py b/kglib/kgcn/models/downstream.py index d2595381..cb2f83a4 100644 --- a/kglib/kgcn/models/downstream.py +++ b/kglib/kgcn/models/downstream.py @@ -33,6 +33,7 @@ def __init__(self, kgcn: kglib.kgcn.models.model.KGCN, optimizer, num_classes, l classification_kernel_initializer=tf.contrib.layers.xavier_initializer()): self._log_dir = log_dir + self._write_summary = self._log_dir is not None self._kgcn = kgcn self._optimizer = optimizer self._num_classes = num_classes @@ -92,7 +93,8 @@ def __init__(self, kgcn: kglib.kgcn.models.model.KGCN, optimizer, num_classes, l init_tables = tf.tables_initializer() # Instantiate a SummaryWriter to output summaries and the Graph. - self.summary_writer = tf.summary.FileWriter(self._log_dir, self.tf_session.graph) + if self._write_summary: + self.summary_writer = tf.summary.FileWriter(self._log_dir, self.tf_session.graph) # Run the Op to initialize the variables. self.tf_session.run(init_global) @@ -136,8 +138,9 @@ def train(self, feed_dict): self._predictions_class_winners, self._labels_winners]) summary_str = self.tf_session.run(self.summary, feed_dict=feed_dict) - self.summary_writer.add_summary(summary_str, step) - self.summary_writer.flush() + if self._write_summary: + self.summary_writer.add_summary(summary_str, step) + self.summary_writer.flush() if step % int(self._max_training_steps / 20) == 0: print(f'\n-----') print(f'Step {step}') diff --git a/kglib/kgcn/refactor.md b/kglib/kgcn/refactor.md deleted file mode 100644 index 9abb4cfa..00000000 --- a/kglib/kgcn/refactor.md +++ /dev/null @@ -1,104 +0,0 @@ - -1. 
Customise the learning parameters, in some clear way, including: - - numerical parameters - - strategies - - encoders - -*Pre-TensorFlow model* -These components are used repeatedly for each neighbour hop, but could require different parameters each -neighbour sampling params: - query limits, - sampling nature e.g. ordered, pseudo-random, random - -*Within Tensorflow model* -Name scoping? - -encoding parameters - encoder to user per-type - -normalisation parameters(?) - -These components are used repeatedly for each neighbour hop, but could require different parameters each - Aggregator parameters - Weight initialiser - Bias - Weight regulariser - Activation - Dropout - Layer Type (currently dense) - Reduction method - Combination parameters - Weight initialiser - Weight regularisers - - Normaliser parameters - -Loss method - -kgcn = KGCN(traversal_params={}, aggregation_params={'bias': False}, combination_params={}) -Any arguments provided here should override the default dict params - - - - -2. Direct supervised learning for: - - Unknown downstream learning (arbitrary user pipeline), with ready-made components: - - Attribute prediction - - Link prediction - -3. Generate unsupervised embeddings, subsequently perform either: - - Unknown downstream learning (arbitrary user pipeline), with ready-made components: - - Attribute prediction - - Link prediction - -4. Visualise the model/learning in TensorBoard - -5. Save the traversal output arrays to file in order to quickly iterate on the learning model - -6. Advanced: Customise the neural net design - -7. Low priority: Support learning outside TensorFlow, using other libraries etc - - - - -2. + 3. - -traverser = Traverser(params) - -traversals = traverser.traverse(concepts) - -# Do any saving/loading of traversals and labels to/from file - -embedder = Embedder(params) # This is agnostic to training, evaluation, prediction etc - -# Get the output tensors, e.g. 
embeddings, summary writers, initialisers etc -output_tensors = embedder.build() - -classifier = SimpleMultiClassClassifier() - -kgcn = KGCN(params) - -# Create embeddings tensor -embeddings = kgcn.get_embeddings() - -udl = UnknownDownstreamLearning(embeddings) -udl.train(concepts, labels, grakn_connection) - - -kgcn = SupervisedKGCNClassifier(params) -kgcn = KGCNMultiClassClassifier(Embedder(params), classifier_params) -kgcn = KGCN(params) -train_results = kgcn.train(concepts, labels) -eval_results = kgcn.eval(concepts, labels) -predictions = kgcn.predict(concepts, labels) - - - - - - - - - - diff --git a/requirements.txt b/requirements.txt index 8055a0a7..d6609fdb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,5 @@ +--extra-index-url https://testpypi.python.org/pypi +grakn-kglib absl-py==0.5.0 astor==0.7.1 decorator==4.3.0