From 352aef13e396e3ecaaf00a479b5ee930ad18d7a3 Mon Sep 17 00:00:00 2001
From: James Fletcher <38212943+jmsfltchr@users.noreply.github.com>
Date: Wed, 29 May 2019 14:57:50 +0100
Subject: [PATCH] Refactor Classifiers (#64)

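Remove the code duplicated between the supervised classifiers by moving it into a shared abstract base class, `SupervisedKGCNMultiClassClassifier`, with single-label and multi-label subclasses.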
---
 examples/kgcn/animal_trade/README.md          |   2 +-
 examples/kgcn/animal_trade/main.py            |   4 +-
 .../kgcn/animal_trade/test/end_to_end_test.py |  69 ++++-
 kglib/kgcn/README.md                          |   2 +-
 kglib/kgcn/learn/classify.py                  | 272 +++++-------------
 5 files changed, 140 insertions(+), 209 deletions(-)

diff --git a/examples/kgcn/animal_trade/README.md b/examples/kgcn/animal_trade/README.md
index 865f9179..65a8d2c2 100644
--- a/examples/kgcn/animal_trade/README.md
+++ b/examples/kgcn/animal_trade/README.md
@@ -61,7 +61,7 @@ The [main](../../../examples/kgcn/animal_trade/main.py) function will:
 
 - Search Grakn for the k-hop neighbours of the selected examples, and store information about them as arrays, denoted in the code as `context_arrays`. This data is saved to file so that subsequent steps can be re-run without recomputing these data
 
-- Build the TensorFlow computation graph using `model.KGCN`, including a multi-class classification step and learning procedure defined by `classify.SupervisedKGCNClassifier`
+- Build the TensorFlow computation graph using `model.KGCN`, including a multi-class classification step and learning procedure defined by `classify.SupervisedKGCNMultiClassSingleLabelClassifier`
 
 - Feed the `context_arrays` to the TensorFlow graph, and perform learning
 
diff --git a/examples/kgcn/animal_trade/main.py b/examples/kgcn/animal_trade/main.py
index d488a855..095b0348 100644
--- a/examples/kgcn/animal_trade/main.py
+++ b/examples/kgcn/animal_trade/main.py
@@ -99,8 +99,8 @@ def main(modes=(TRAIN, EVAL, PREDICT)):
                       neighbour_sampling_limit_factor=4)
 
     optimizer = tf.train.GradientDescentOptimizer(learning_rate=FLAGS.learning_rate)
-    classifier = classify.SupervisedKGCNClassifier(kgcn, optimizer, FLAGS.num_classes, FLAGS.log_dir,
-                                                   max_training_steps=FLAGS.max_training_steps)
+    classifier = classify.SupervisedKGCNMultiClassSingleLabelClassifier(kgcn, optimizer, FLAGS.num_classes, FLAGS.log_dir,
+                                                                        max_training_steps=FLAGS.max_training_steps)
 
     feed_dicts = {}
     feed_dict_storer = persistence.FeedDictStorer(BASE_PATH + 'input/')
diff --git a/examples/kgcn/animal_trade/test/end_to_end_test.py b/examples/kgcn/animal_trade/test/end_to_end_test.py
index fd003c9c..00f0cc85 100644
--- a/examples/kgcn/animal_trade/test/end_to_end_test.py
+++ b/examples/kgcn/animal_trade/test/end_to_end_test.py
@@ -76,7 +76,9 @@
 
 class TestEndToEnd(unittest.TestCase):
 
-    def test_end_to_end(self):
+    @classmethod
+    def setUpClass(cls):
+
         # Unzip the Grakn distribution containing our data
         sub.run(['unzip', 'external/animaltrade_dist/file/downloaded', '-d',
                           'external/animaltrade_dist/file/downloaded-unzipped'])
@@ -84,6 +86,66 @@ def test_end_to_end(self):
         # Start Grakn
         sub.run(['external/animaltrade_dist/file/downloaded-unzipped/grakn-core-all-mac-animaltrade1.5.3/grakn', 'server', 'start'])
 
+    def test_multi_class_single_label_classification_end_to_end(self):
+        tf.reset_default_graph()
+
+        modes = (TRAIN, EVAL)
+
+        client = grakn.client.GraknClient(uri=URI)
+        sessions = server_mgmt.get_sessions(client, KEYSPACES)
+        transactions = server_mgmt.get_transactions(sessions)
+
+        batch_size = NUM_PER_CLASS * FLAGS.num_classes
+        kgcn = model.KGCN(NEIGHBOUR_SAMPLE_SIZES,
+                          FLAGS.features_size,
+                          FLAGS.starting_concepts_features_size,
+                          FLAGS.aggregated_size,
+                          FLAGS.embedding_size,
+                          transactions[TRAIN],
+                          batch_size,
+                          neighbour_sampling_method=random_sampling.random_sample,
+                          neighbour_sampling_limit_factor=4)
+
+        optimizer = tf.train.GradientDescentOptimizer(learning_rate=FLAGS.learning_rate)
+        classifier = classify.SupervisedKGCNMultiClassSingleLabelClassifier(kgcn, optimizer, FLAGS.num_classes, None,
+                                                                            max_training_steps=FLAGS.max_training_steps)
+
+        feed_dicts = {}
+
+        sampling_params = {
+            TRAIN: {'sample_size': NUM_PER_CLASS, 'population_size': POPULATION_SIZE_PER_CLASS},
+            EVAL: {'sample_size': NUM_PER_CLASS, 'population_size': POPULATION_SIZE_PER_CLASS},
+            PREDICT: {'sample_size': NUM_PER_CLASS, 'population_size': POPULATION_SIZE_PER_CLASS},
+        }
+        concepts, labels = thing_mgmt.compile_labelled_concepts(EXAMPLES_QUERY, EXAMPLE_CONCEPT_TYPE,
+                                                                LABEL_ATTRIBUTE_TYPE, ATTRIBUTE_VALUES,
+                                                                transactions[TRAIN], transactions[PREDICT],
+                                                                sampling_params)
+
+        for mode in modes:
+            mode_labels = labels[mode]
+            feed_dicts[mode] = classifier.get_feed_dict(sessions[mode], concepts[mode], labels=mode_labels)
+
+        # Note: The ground-truth attribute labels haven't been removed from Grakn, so the results found here are
+        # invalid, and used as an end-to-end test only
+
+        # Train
+        if TRAIN in modes:
+            print("\n\n********** TRAIN Keyspace **********")
+            classifier.train(feed_dicts[TRAIN])
+
+        # Eval
+        if EVAL in modes:
+            print("\n\n********** EVAL Keyspace **********")
+            # Presently, eval keyspace is the same as the TRAIN keyspace
+            classifier.eval(feed_dicts[EVAL])
+
+        server_mgmt.close(sessions)
+        server_mgmt.close(transactions)
+
+    def test_multi_class_multi_label_classification_end_to_end(self):
+        tf.reset_default_graph()
+
         modes = (TRAIN, EVAL)
 
         client = grakn.client.GraknClient(uri=URI)
@@ -102,7 +164,7 @@ def test_end_to_end(self):
                           neighbour_sampling_limit_factor=4)
 
         optimizer = tf.train.GradientDescentOptimizer(learning_rate=FLAGS.learning_rate)
-        classifier = classify.SupervisedKGCNClassifier(kgcn, optimizer, FLAGS.num_classes, None,
+        classifier = classify.SupervisedKGCNMultiClassMultiLabelClassifier(kgcn, optimizer, FLAGS.num_classes, None,
                                                        max_training_steps=FLAGS.max_training_steps)
 
         feed_dicts = {}
@@ -118,7 +180,8 @@ def test_end_to_end(self):
                                                                 sampling_params)
 
         for mode in modes:
-            feed_dicts[mode] = classifier.get_feed_dict(sessions[mode], concepts[mode], labels=labels[mode])
+            mode_labels = [[0, 1, 1]] * len(labels[mode])
+            feed_dicts[mode] = classifier.get_feed_dict(sessions[mode], concepts[mode], labels=mode_labels)
 
         # Note: The ground-truth attribute labels haven't been removed from Grakn, so the results found here are
         # invalid, and used as an end-to-end test only
diff --git a/kglib/kgcn/README.md b/kglib/kgcn/README.md
index 02c20a23..3e844d29 100644
--- a/kglib/kgcn/README.md
+++ b/kglib/kgcn/README.md
@@ -69,7 +69,7 @@ kgcn = model.KGCN(neighbour_sample_sizes,
 
 optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
 
-classifier = classify.SupervisedKGCNClassifier(kgcn,
+classifier = classify.SupervisedKGCNMultiClassSingleLabelClassifier(kgcn,
                                                optimizer, 
                                                num_classes, 
                                                log_dir,
diff --git a/kglib/kgcn/learn/classify.py b/kglib/kgcn/learn/classify.py
index ee9fbd85..7dcc0f59 100644
--- a/kglib/kgcn/learn/classify.py
+++ b/kglib/kgcn/learn/classify.py
@@ -16,18 +16,7 @@
 #  specific language governing permissions and limitations
 #  under the License.
 #
-
-#
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#  Unless required by applicable law or agreed to in writing,
-#  software distributed under the License is distributed on an
-#  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-#  KIND, either express or implied.  See the License for the
-#  specific language governing permissions and limitations
-#  under the License.
-#
+import abc
 
 import tensorflow as tf
 import tensorflow.contrib.layers as layers
@@ -36,8 +25,7 @@
 import kglib.kgcn.core.model as model
 
 
-class SupervisedKGCNClassifier:
-
+class SupervisedKGCNMultiClassClassifier(abc.ABC):
     def __init__(
             self,
             kgcn: model.KGCN,
@@ -100,15 +88,17 @@ def __init__(
 
         self._class_scores = regularised_class_scores
 
-        self._labels_winners = tf.argmax(self.labels, -1)
-        self._predictions_class_winners = tf.argmax(self._class_scores, -1)
-        self._confusion_matrix = tf.confusion_matrix(self._labels_winners,
-                                                     self._predictions_class_winners,
-                                                     num_classes=self._num_classes)
-
         self._loss_op = self.loss(class_scores, self.labels)
         self._train_op = self.optimise(self._loss_op)
 
+        self.tf_session = None
+        self.summary_writer = None
+        self.summary = None
+
+        # Subclasses must call self._initialise_computation_graph_components() at the end of their own
+        # __init__, after adding their remaining ops, so that it runs once the whole graph has been built
+
+    def _initialise_computation_graph_components(self):
         ################################################################################################################
         # Graph initialisation tasks - run after the whole graph has been built
         ################################################################################################################
@@ -128,18 +118,27 @@ def __init__(
         self.tf_session.run(init_tables)
         self.summary = tf.summary.merge_all()
 
+    @abc.abstractmethod
+    def _per_class_loss(self, logits, labels):
+        return
+
+    @abc.abstractmethod
+    def _objective_loss(self, per_class_loss):
+        return
+
     def loss(self, logits, labels=None):
 
         with tf.name_scope('loss') as scope:
             # Get the losses from the various layers
-            loss = tf.cast(self._regularisation_weight * tf.losses.get_regularization_loss(), tf.float32)
-            tf.summary.scalar('regularisation_loss', loss)
+            regularisation_loss = tf.cast(self._regularisation_weight * tf.losses.get_regularization_loss(), tf.float32)
+            tf.summary.scalar('regularisation_loss', regularisation_loss)
 
-            # classification loss
-            raw_loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
-            tf.summary.histogram('loss/raw_loss', raw_loss)
-            loss += tf.reduce_mean(raw_loss)
+            per_class_loss = self._per_class_loss(logits, labels)
+            objective_loss = self._objective_loss(per_class_loss)
 
+            loss = objective_loss + regularisation_loss
+
+            tf.summary.histogram('loss/per_class_loss', per_class_loss)
             tf.summary.scalar('loss/final_loss', loss)
 
         return loss
@@ -154,14 +153,16 @@ def optimise(self, loss):
 
         return opt_op
 
+    @abc.abstractmethod
+    def _report_metrics(self, labels, predictions):
+        return
+
     def train(self, feed_dict):
         print("========= Training =========")
         _ = self.tf_session.run(self.dataset_initializer, feed_dict=feed_dict)
         for step in range(self._max_training_steps):
-            _, loss_value, confusion_matrix, class_scores_values, predictions_class_winners_values, \
-                labels_winners_values = self.tf_session.run(
-                    [self._train_op, self._loss_op, self._confusion_matrix, self._class_scores,
-                     self._predictions_class_winners, self._labels_winners])
+            _, loss_value, class_scores, predictions, labels = self.tf_session.run(
+                [self._train_op, self._loss_op, self._class_scores, self._predictions, self._labels_for_testing])
 
             summary_str = self.tf_session.run(self.summary, feed_dict=feed_dict)
             if self._write_summary:
@@ -171,29 +172,29 @@ def train(self, feed_dict):
                 print(f'\n-----')
                 print(f'Step {step}')
                 print(f'Loss: {loss_value:.2f}')
-                metrics.report_multiclass_metrics(labels_winners_values, predictions_class_winners_values)
+
+                self._report_metrics(labels, predictions)
         print("========= Training Complete =========\n\n")
 
     def eval(self, feed_dict):
         print("========= Evaluation =========")
         _ = self.tf_session.run(self.dataset_initializer, feed_dict=feed_dict)
 
-        loss_value, confusion_matrix, class_scores_values, predictions_class_winners_values, labels_winners_values = \
-            self.tf_session.run(
-                [self._loss_op, self._confusion_matrix, self._class_scores, self._predictions_class_winners,
-                 self._labels_winners])
+        loss_value, class_scores, predictions, labels = self.tf_session.run(
+            [self._loss_op, self._class_scores, self._predictions, self._labels_for_testing])
 
         print(f'Loss: {loss_value:.2f}')
-        metrics.report_multiclass_metrics(labels_winners_values, predictions_class_winners_values)
+        self._report_metrics(labels, predictions)
         print("========= Evaluation Complete =========\n\n")
 
     def predict(self, feed_dict):
         print("========= Evaluation =========")
         _ = self.tf_session.run(self.dataset_initializer, feed_dict=feed_dict)
 
-        loss_value, class_scores_values, predictions_class_winners_values = \
-            self.tf_session.run([self._loss_op, self._class_scores, self._predictions_class_winners])
-        print(class_scores_values)
+        loss_value, class_scores, predictions = self.tf_session.run(
+            [self._loss_op, self._class_scores, self._predictions])
+
+        print(class_scores)
         print(f'Loss: {loss_value:.2f}')
         print("========= Evaluation Complete =========\n\n")
 
@@ -202,187 +203,54 @@ def get_feed_dict(self, session, concepts, labels=None):
         # Possibly save/load context arrays here instead
         context_arrays = self._kgcn.input_fn(session, concepts)
 
-        feed_dict = build_feed_dict(self.neighbourhood_placeholders, context_arrays,
-                                    labels_placeholder=self.labels_placeholder, labels=labels)
+        feed_dict = build_feed_dict(self.neighbourhood_placeholders,
+                                    context_arrays,
+                                    labels_placeholder=self.labels_placeholder,
+                                    labels=labels)
         return feed_dict
 
 
-class SupervisedKGCNMultiLabelClassifier:
+class SupervisedKGCNMultiClassSingleLabelClassifier(SupervisedKGCNMultiClassClassifier):
 
-    def __init__(
-            self,
-            kgcn: model.KGCN,
-            optimizer,
-            num_classes,
-            log_dir,
-            max_training_steps=10000,
-            regularisation_weight=0.0,
-            classification_dropout_keep_prob=0.7,
-            use_bias=True,
-            classification_activation=lambda x: x,
-            classification_regularizer=layers.l2_regularizer(scale=0.1),
-            classification_kernel_initializer=tf.contrib.layers.xavier_initializer()):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
 
-        self._log_dir = log_dir
-        self._write_summary = self._log_dir is not None
-        self._kgcn = kgcn
-        self._optimizer = optimizer
-        self._num_classes = num_classes
-        self._max_training_steps = max_training_steps
-        self._regularisation_weight = regularisation_weight
-        self._classification_dropout_keep_prob = classification_dropout_keep_prob
-        self._use_bias = use_bias
-        self._classification_activation = classification_activation
-        self._classification_regularizer = classification_regularizer
-        self._classification_kernel_initializer = classification_kernel_initializer
+        self._labels_for_testing = tf.argmax(self.labels, -1)
+        self._predictions = tf.argmax(self._class_scores, -1)
+        self._initialise_computation_graph_components()
 
-        ################################################################################################################
-        # KGCN Embeddings
-        ################################################################################################################
-        self.labels_placeholder = tf.placeholder(tf.float32, shape=(None, num_classes), name='labels_input')
-        labels_dataset = tf.data.Dataset.from_tensor_slices(self.labels_placeholder)
+    def _per_class_loss(self, logits, labels):
+        return tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels)
 
-        self.embeddings, next_batch, self.dataset_initializer, self.neighbourhood_placeholders = self._kgcn.embed(
-            labels_dataset)
-        self.labels = next_batch[0]
+    def _objective_loss(self, per_class_loss):
+        return tf.reduce_mean(per_class_loss)
 
-        ################################################################################################################
-        # Downstream of embeddings - classification
-        ################################################################################################################
-        classification_layer = tf.layers.Dense(
-            self._num_classes,
-            activation=self._classification_activation,
-            use_bias=self._use_bias,
-            kernel_regularizer=self._classification_regularizer,
-            kernel_initializer=self._classification_kernel_initializer,
-            name='classification_dense_layer')
+    def _report_metrics(self, labels, predictions):
+        return metrics.report_multiclass_metrics(labels, predictions)
 
-        # tf.summary.histogram('classification/dense/kernel', classification_layer.kernel)  # TODO figure out why
-        #  this is throwing an error
-        # tf.summary.histogram('classification/dense/bias', classification_layer.bias)
 
-        class_scores = classification_layer(self.embeddings)
-        tf.summary.histogram('classification/dense/class_scores', class_scores)
+class SupervisedKGCNMultiClassMultiLabelClassifier(SupervisedKGCNMultiClassClassifier):
 
-        regularised_class_scores = tf.nn.dropout(class_scores,
-                                                 self._classification_dropout_keep_prob,
-                                                 name='classification_dropout')
-
-        tf.summary.histogram('evaluate/regularised_class_scores', regularised_class_scores)
-
-        self._class_scores = regularised_class_scores
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
 
         predictions = tf.cast(self._class_scores, tf.float32)
         threshold = 0.5
-        self._predictions_class_winners = tf.cast(tf.greater(predictions, threshold), tf.int64)
-
-        self._loss_op = self.loss(class_scores, self.labels)
-        self._train_op = self.optimise(self._loss_op)
-
-        ################################################################################################################
-        # Graph initialisation tasks - run after the whole graph has been built
-        ################################################################################################################
-        self.tf_session = tf.Session()
-        # Add the variable initializer Op.
-        init_global = tf.global_variables_initializer()
-        # Added to initialise tf.metrics.recall
-        init_local = tf.local_variables_initializer()
-        init_tables = tf.tables_initializer()
-
-        # Instantiate a SummaryWriter to output summaries and the Graph.
-        if self._write_summary:
-            self.summary_writer = tf.summary.FileWriter(self._log_dir, self.tf_session.graph)
-
-        # Run the Op to initialize the variables.
-        self.tf_session.run(init_global)
-        self.tf_session.run(init_local)
-        self.tf_session.run(init_tables)
-        self.summary = tf.summary.merge_all()
-
-    def loss(self, logits, labels=None):
-
-        with tf.name_scope('loss') as scope:
-            # Get the losses from the various layers
-            loss = tf.cast(self._regularisation_weight * tf.losses.get_regularization_loss(), tf.float32)
-            tf.summary.scalar('regularisation_loss', loss)
-
-            # classification loss
-            raw_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)
-            tf.summary.histogram('loss/raw_loss', raw_loss)
-
-            # the loss is summed across classes before it is averaged over
-            # the batch
-            # https://github.com/tensorflow/skflow/issues/113#issuecomment-397631386
-            loss += tf.reduce_mean(tf.reduce_sum(raw_loss, axis=1))
-
-            tf.summary.scalar('loss/final_loss', loss)
-
-        return loss
-
-    def optimise(self, loss):
-        grads_and_vars = self._optimizer.compute_gradients(loss)
+        self._predictions = tf.cast(tf.greater(predictions, threshold), tf.int64)
+        self._labels_for_testing = self.labels
 
-        for grad, var in grads_and_vars:
-            tf.summary.histogram('gradients/' + var.name, grad)
+        self._initialise_computation_graph_components()
 
-        opt_op = self._optimizer.apply_gradients(grads_and_vars), loss
-
-        return opt_op
+    def _per_class_loss(self, logits, labels):
+        return tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)
 
-    def train(self, feed_dict):
-        print("========= Training =========")
-        _ = self.tf_session.run(self.dataset_initializer, feed_dict=feed_dict)
-        for step in range(self._max_training_steps):
-            _, loss_value, class_scores_values, predictions_class_winners_values, labels_values = self.tf_session.run(
-                [self._train_op, self._loss_op, self._class_scores, self._predictions_class_winners, self.labels])
+    def _objective_loss(self, per_class_loss):
+        # the loss is summed across classes before it is averaged over the batch
+        # https://github.com/tensorflow/skflow/issues/113#issuecomment-397631386
+        return tf.reduce_mean(tf.reduce_sum(per_class_loss, axis=1))
 
-            summary_str = self.tf_session.run(self.summary, feed_dict=feed_dict)
-
-            if self._write_summary:
-                self.summary_writer.add_summary(summary_str, step)
-                self.summary_writer.flush()
-
-            if step % int(self._max_training_steps / 20) == 0:
-
-                print(f'\n-----')
-                print(f'Step {step}')
-                print(f'Loss: {loss_value:.2f}')
-
-                metrics.report_multilabel_metrics(labels_values, predictions_class_winners_values)
-        print("========= Training Complete =========\n\n")
-
-    def eval(self, feed_dict):
-        print("========= Evaluation =========")
-        _ = self.tf_session.run(self.dataset_initializer, feed_dict=feed_dict)
-
-        loss_value, class_scores_values, predictions_class_winners_values, labels_values = self.tf_session.run(
-            [self._loss_op, self._class_scores, self._predictions_class_winners, self.labels])
-
-        print(f'Loss: {loss_value:.2f}')
-        metrics.report_multilabel_metrics(labels_values, predictions_class_winners_values)
-        print("========= Evaluation Complete =========\n\n")
-
-    def predict(self, feed_dict):
-        print("========= Evaluation =========")
-        _ = self.tf_session.run(self.dataset_initializer, feed_dict=feed_dict)
-
-        loss_value, class_scores_values, predictions_class_winners_values = self.tf_session.run(
-            [self._loss_op, self._class_scores, self._predictions_class_winners])
-
-        print(class_scores_values)
-        print(f'Loss: {loss_value:.2f}')
-        print("========= Evaluation Complete =========\n\n")
-
-    def get_feed_dict(self, session, concepts, labels=None):
-
-        # Possibly save/load context arrays here instead
-        context_arrays = self._kgcn.input_fn(session, concepts)
-
-        feed_dict = build_feed_dict(self.neighbourhood_placeholders,
-                                    context_arrays,
-                                    labels_placeholder=self.labels_placeholder,
-                                    labels=labels)
-        return feed_dict
+    def _report_metrics(self, labels, predictions):
+        return metrics.report_multilabel_metrics(labels, predictions)
 
 
 def build_feed_dict(context_array_placeholders, context_array_depths, labels_placeholder=None, labels=None):