Skip to content
This repository has been archived by the owner on Nov 18, 2023. It is now read-only.

Commit

Permalink
Grakn 1.5 migration (#58)
Browse files Browse the repository at this point in the history
## What is the goal of this PR?

- Update the documentation for KGCN
- Migrate to use Grakn commit 20750ca0a46b4bc252ad81edccdfd8d8b7c46caa and Python grakn-client commit 5459d5d88a30631c5ebdac3a9b0d5ea6f184c8ae

## What are the changes implemented in this PR?

- KGCN README improvements, corrections, fixes including updated diagrams
- CI updates to use Grakn distributions hosted on GCP for unit, integration and end-to-end tests
  • Loading branch information
jmsfltchr authored Mar 5, 2019
1 parent 02015f5 commit 26303e1
Show file tree
Hide file tree
Showing 21 changed files with 83 additions and 76 deletions.
8 changes: 4 additions & 4 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ jobs:
- run: sudo apt-get update
- run: pyenv install 3.6.3
- run: pyenv global 3.6.3
- run: wget https://github.com/graknlabs/grakn/releases/download/v1.4.3/grakn-core-1.4.3.zip
- run: unzip grakn-core-1.4.3.zip
- run: nohup grakn-core-1.4.3/grakn server start
- run: grakn-core-1.4.3/graql console -k test_schema -f kglib/kgcn/test_data/schema.gql
- run: wget https://storage.googleapis.com/kglib/grakn-core-all-20750ca0a46b4bc252ad81edccdfd8d8b7c46caa.zip
- run: unzip grakn-core-all-20750ca0a46b4bc252ad81edccdfd8d8b7c46caa.zip
- run: nohup grakn-core-all/grakn server start
- run: cd grakn-core-all && ./grakn console -k test_schema -f ../kglib/kgcn/test_data/schema.gql
- run: bazel test //kglib/... --test_output=streamed --force_python PY3 --python_path $(which python)

test-deploy-pip:
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.1
0.1a3
2 changes: 1 addition & 1 deletion WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,6 @@ load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_file")

http_file(
name = "animaltrade_dist",
urls = ["https://github.com/graknlabs/kglib/releases/download/v0.1a1/grakn-animaltrade.zip", # TODO How to update to the latest relase each time?
urls = ["https://storage.googleapis.com/kglib/grakn-core-animaltrade-20750ca0a46b4bc252ad81edccdfd8d8b7c46caa.zip", # TODO How to update to the latest release each time?
]
)
2 changes: 1 addition & 1 deletion examples/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ py_library(
requirement('grakn-kglib'),

# Grakn deps
requirement('grakn'),
requirement('grakn-client'),
requirement('grpcio'),

# TensorFlow deps
Expand Down
4 changes: 2 additions & 2 deletions examples/kgcn/animal_trade/prediction_schema.gql
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ prediction-score sub attribute, datatype double;

traded-item has endangerment-level;

value-prediction sub relationship,
value-prediction sub relation,
has prediction-score,
relates predicted-value,
relates predicting-kgcn-model;
Expand All @@ -49,7 +49,7 @@ match $t1 isa traded-item, has endangerment-level $el1 via $r1; $el1 1; $vp1(pre

define

suspicious-activity-detection sub relationship,
suspicious-activity-detection sub relation,
relates suspicious-activity,
relates cause-of-suspicion;

Expand Down
12 changes: 6 additions & 6 deletions examples/kgcn/animal_trade/schema.gql
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ define
has unit-of-measurement,
plays quantification-measurement;

exchange sub relationship,
exchange sub relation,
relates receiving-country,
relates providing-country,
relates exchanged-item,
Expand All @@ -80,11 +80,11 @@ define
relates imported-item as exchanged-item,
plays corresponding-import;

import-export-correspondence sub relationship,
import-export-correspondence sub relation,
relates corresponding-import,
relates corresponding-export;

quantification sub relationship,
quantification sub relation,
relates quantified-subject,
relates quantification-measurement;

Expand All @@ -111,7 +111,7 @@ define
plays originated-species,
plays sub-taxon;

hierarchy sub relationship,
hierarchy sub relation,
relates superior,
relates subordinate;

Expand All @@ -131,11 +131,11 @@ define
relates containing-continent as container,
relates contained-country as containee;

species-origination sub relationship,
species-origination sub relation,
relates originating-country,
relates originated-species;

taxon-membership sub relationship,
taxon-membership sub relation,
relates member-item,
relates taxonomic-group;

Expand Down
2 changes: 1 addition & 1 deletion examples/kgcn/animal_trade/test/end_to_end_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def test_end_to_end(self):
'external/animaltrade_dist/file/downloaded-unzipped'])

# Start Grakn
sub.run(['external/animaltrade_dist/file/downloaded-unzipped/grakn-animaltrade/grakn', 'server', 'start'])
sub.run(['external/animaltrade_dist/file/downloaded-unzipped/grakn-core-animaltrade-1.5.0/grakn', 'server', 'start'])

modes = (TRAIN, EVAL)

Expand Down
2 changes: 1 addition & 1 deletion kglib/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ py_library(
srcs = glob(['__init__.py', 'kgcn/**/*.py']),
deps = [
# Grakn deps
requirement('grakn'),
requirement('grakn-client'),
requirement('grpcio'),

# TensorFlow deps
Expand Down
33 changes: 20 additions & 13 deletions kglib/kgcn/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@ A KGCN can be used to create vector representations, *embeddings*, of any labell

Often, data doesn't fit well into a tabular format. There are many benefits to storing complex and interrelated data in a knowledge graph, not least that the context of each datapoint can be stored in full.

However, many existing machine learning techniques rely upon an *input vector for each example*. This can make it difficult to directly apply many conventional machine learning techniques over a knowledge graph.
However, many existing machine learning techniques rely upon the existence of an *input vector for each example*. Creating such a vector to represent a node in a knowledge graph is non-trivial.

In order to make use of the wealth of existing ideas, tools and pipelines in machine learning, we need a method of building a vector to describe a datapoint in a knowledge graph. In this way we can leverage contextual information from a knowledge graph for machine learning.
In order to make use of the wealth of existing ideas, tools and pipelines in machine learning, we need a method of building these vectors. In this way we can leverage contextual information from a knowledge graph for machine learning.

This is what a KGCN can achieve. Given an example datapoint taken from a knowledge graph, it can examine the nodes in the vicinity of an example, its *context*. Based on this context it can determine a vector representation, an *embedding*, for that example.
This is what a KGCN can achieve. Given an example node in a knowledge graph, it can examine the nodes in the vicinity of that example, its *context*. Based on this context it can determine a vector representation, an *embedding*, for that example.

**There are two broad learning tasks a KGCN is suitable for:**

**1. Supervised learning from a knowledge graph for prediction e.g. multi-class classification (currently implemented), regression, link prediction**
**1. Supervised learning from a knowledge graph for prediction e.g. multi-class classification (implemented), regression, link prediction**
**2. Unsupervised creation of Knowledge Graph Embeddings, e.g. for clustering and node comparison tasks**

![KGCN Process](readme_images/KGCN_process.png)
Expand All @@ -46,7 +46,8 @@ In order to build a *useful* representation, a KGCN needs to perform some learni
The following is a template of what must be defined in order to instantiate a KGCN, optimised for a downstream learning task of multi-class classification:

```python
import kglib.kgcn.embed.model as model
import kglib.kgcn.core.model as model
import kglib.kgcn.learn.classify as classify
import tensorflow as tf
import grakn

Expand All @@ -65,10 +66,16 @@ kgcn = model.KGCN(neighbour_sample_sizes,
batch_size)

optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
classifier = learn.classify.SupervisedKGCNClassifier(kgcn, optimizer, num_classes, log_dir,
max_training_steps=max_training_steps)

training_feed_dict = classifier.get_feed_dict(session, training_things, labels=training_labels)
classifier = classify.SupervisedKGCNClassifier(kgcn,
optimizer,
num_classes,
log_dir,
max_training_steps=max_training_steps)

training_feed_dict = classifier.get_feed_dict(session,
training_things,
labels=training_labels)

classifier.train(training_feed_dict)

Expand All @@ -80,17 +87,17 @@ There is also a [full example](https://github.com/graknlabs/kglib/tree/master/ex

## Methodology

The ideology behind this project is described [here](https://blog.grakn.ai/knowledge-graph-convolutional-networks-machine-learning-over-reasoned-knowledge-9eb5ce5e0f68), and a [video of the presentation](https://youtu.be/Jx_Twc75ka0?t=368). The principles of the implementation are based on [GraphSAGE](http://snap.stanford.edu/graphsage/), from the Stanford SNAP group, heavily adapted to work over a knowledge graph. Instead of working on a typical property graph, a KGCN learns from the context of a *typed hypergraph*, **Grakn**. Additionally, it learns from facts deduced by Grakn's *automated logical reasoner*. From this point onwards some understanding of [Grakn's docs](http://dev.grakn.ai) is assumed.
The ideology behind this project is described [here](https://blog.grakn.ai/knowledge-graph-convolutional-networks-machine-learning-over-reasoned-knowledge-9eb5ce5e0f68), and there is also a [video of the presentation](https://youtu.be/Jx_Twc75ka0?t=368). The principles of the implementation are based on [GraphSAGE](http://snap.stanford.edu/graphsage/), from the Stanford SNAP group, heavily adapted to work over a knowledge graph. Instead of working on a typical property graph, a KGCN learns from contextual data stored in a *typed hypergraph*, **Grakn**. Additionally, it learns from facts deduced by Grakn's *automated logical reasoner*. From this point onwards some understanding of [Grakn's docs](http://dev.grakn.ai) is assumed.

Now we introduce the key components and how they interact.

### KGCN

A KGCN is responsible for deriving embeddings for a set of Things (and thereby directly learn to classify them). We start by querying Grakn to find a set of labelled examples. Following that, we gather data about the context of each example Thing. We do this by considering their *k-hop* neighbours.
A KGCN is responsible for deriving embeddings for a set of Things (and thereby directly learning to classify them). We start by querying Grakn to find a set of labelled examples. Following that, we gather data about the context of each example Thing. We do this by considering their neighbours, and their neighbours' neighbours, recursively, up to K hops away.

![methodology](readme_images/methodology.png)We retrieve the data concerning this neighbourhood from Grakn (diagram above). This information includes the *type hierarchy*, *roles*, and *attribute* values of each neighbouring Thing encountered, and any inferred neighbours (represented above by dotted lines).
![methodology](readme_images/methodology.png)

We retrieve the data concerning this neighbourhood from Grakn (diagram above). This information includes the *type hierarchy*, *roles*, and *attribute value* of each neighbouring Thing encountered, and any inferred neighbours (represented above by dotted lines). This data is compiled into arrays to be ingested by a neural network.

Via operations Aggregate and Combine, a single vector representation is built for a Thing. This process can be chained recursively over k-hops of neighbouring Things. This builds a representation for a Thing of interest that contains information extracted from a wide context.
Via the operations Aggregate and Combine, a single vector representation is built for a Thing. This process can be chained recursively over *K* hops of neighbouring Things. This builds a representation for a Thing of interest that contains information extracted from a wide context.

![chaining](readme_images/chaining.png)

Expand All @@ -104,7 +111,7 @@ In order to feed a TensorFlow neural network, we need regular array structures o

- Id
- Type
- Meta-Type (either Entity or Relationship or Attribute)
- Meta-Type (either Entity or Relation or Attribute)
- Data-type (if it's an attribute)
- Value (if it's an attribute)
- The Role that connects the example to that neighbour
Expand Down
2 changes: 1 addition & 1 deletion kglib/kgcn/core/ingest/encode/encode.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def __init__(self, schema_tx):
"https://tfhub.dev/google/nnlm-en-dim128-with-normalization/1", 128)

data_types = list(neighbour.DATA_TYPE_NAMES)
data_types.insert(0, NO_DATA_TYPE) # For the case where an entity or relationship is encountered
data_types.insert(0, NO_DATA_TYPE) # For the case where an entity or relation is encountered
data_types_traversal = {data_type: data_types for data_type in data_types}

# Later a hierarchy could be added to data_type meaning. e.g. long and double are both numeric
Expand Down
2 changes: 1 addition & 1 deletion kglib/kgcn/core/ingest/traverse/data/context/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def _traverse_from_thing(self, starting_thing: neighbour.Thing, depth: int, tx):
sampler = self._depth_samplers[-depth]
next_depth = depth - 1

# Any concept could play a role in a relationship if the schema permits it
# Any concept could play a role in a relation if the schema permits it
# Distinguish the concepts found as roles-played
connections = self._neighbour_finder.find(starting_thing.id, tx)

Expand Down
8 changes: 4 additions & 4 deletions kglib/kgcn/core/ingest/traverse/data/context/builder_mocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,15 +31,15 @@ def mock_traversal_output():
neighbour.Thing("0", "person", "entity"),
[
builder.Neighbour("employee", neighbour.TARGET_PLAYS, builder.ThingContext(
neighbour.Thing("1", "employment", "relationship"),
neighbour.Thing("1", "employment", "relation"),
[
builder.Neighbour("employer", neighbour.NEIGHBOUR_PLAYS, builder.ThingContext(
neighbour.Thing("2", "company", "entity"), []
)),
]
)),
builder.Neighbour("@has-name-owner", neighbour.TARGET_PLAYS, builder.ThingContext(
neighbour.Thing("3", "@has-name", "relationship"),
neighbour.Thing("3", "@has-name", "relation"),
[
builder.Neighbour("@has-name-value", neighbour.NEIGHBOUR_PLAYS, builder.ThingContext(
neighbour.Thing("4", "name", "attribute", data_type='string', value="Employee Name"),
Expand Down Expand Up @@ -67,8 +67,8 @@ def find(self, thing_id, tx):

role_direction = neighbour.TARGET_PLAYS
yield from gen([
_build_data("employee", role_direction, "1", "employment", "relationship"),
_build_data("@has-name-owner", role_direction, "3", "@has-name", "relationship")
_build_data("employee", role_direction, "1", "employment", "relation"),
_build_data("@has-name-owner", role_direction, "3", "@has-name", "relation")
])

elif thing_id == "1":
Expand Down
Loading

0 comments on commit 26303e1

Please sign in to comment.