Develop #81

Merged
merged 10 commits on Jul 30, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -20,6 +20,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Fix bug in GCI2 score for ELEmbeddings
- Fix bottleneck in ELBE example for PPI.
- Fix bugs in BoxSquaredEL model.
- Fix bug in OWL2VecStarProjector when projecting with literals

### Security

15 changes: 2 additions & 13 deletions docs/source/examples/elmodels/plot_1_elembeddings.ipynb
@@ -1,16 +1,5 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -47,7 +36,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## EL-Embeddings (PyTorch) module.\n\nEL-Embeddings defines a geometric modelling for all the GCIs in the EL language.\nThe implementation of ELEmbeddings module can be found at :class:`mowl.nn.el.elem.module.ELEmModule`.\n\n## EL-Embeddings model\n\nThe module :class:`mowl.nn.el.elem.module.ELEmModule` is used in the :class:`mowl.models.elembeddings.model.ELEmbeddings`.\nIn the use case of this example, we will test over a biological problem, which is\nprotein-protein interactions. Given two proteins $p_1,p_2$, the phenomenon\n\"$p_1$ interacts with $p_2$\" is encoded using GCI 2 as:\n\n\\begin{align}p_1 \\sqsubseteq interacts\\_with. p_2\\end{align}\n\nFor that, we can use the class :class:`mowl.models.elembeddings.examples.model_ppi.ELEmPPI` mode, which uses the :class:`mowl.datasets.builtin.PPIYeastSlimDataset` dataset.\n\n"
"## EL-Embeddings (PyTorch) module.\n\nEL-Embeddings defines a geometric modelling for all the GCIs in the EL language.\nThe implementation of ELEmbeddings module can be found at :class:`mowl.nn.el.elem.module.ELEmModule`.\n\n## EL-Embeddings model\n\nThe module :class:`mowl.nn.el.elem.module.ELEmModule` is used in the :class:`mowl.models.elembeddings.model.ELEmbeddings`.\nIn the use case of this example, we will test over a biological problem, which is\nprotein-protein interactions. Given two proteins $p_1,p_2$, the phenomenon\n\"$p_1$ interacts with $p_2$\" is encoded using GCI 2 as:\n\n\\begin{align}p_1 \\sqsubseteq \\exists interacts\\_with. p_2\\end{align}\n\nFor that, we can use the class :class:`mowl.models.elembeddings.examples.model_ppi.ELEmPPI` mode, which uses the :class:`mowl.datasets.builtin.PPIYeastSlimDataset` dataset.\n\n"
]
},
{
@@ -103,7 +92,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
"version": "3.8.19"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion docs/source/examples/elmodels/plot_1_elembeddings.py
@@ -55,7 +55,7 @@
# ":math:`p_1` interacts with :math:`p_2`" is encoded using GCI 2 as:
#
# .. math::
# p_1 \sqsubseteq interacts\_with. p_2
# p_1 \sqsubseteq \exists interacts\_with. p_2
#
# For that, we can use the class :class:`mowl.models.elembeddings.examples.model_ppi.ELEmPPI` mode, which uses the :class:`mowl.datasets.builtin.PPIYeastSlimDataset` dataset.
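The corrected axiom above encodes "$p_1$ interacts with $p_2$" geometrically. A small numeric sketch can illustrate the kind of GCI 2 score EL-Embeddings uses: translate the ball of $p_1$ by the relation vector and penalize the part that falls outside the ball of $p_2$. Function name, the exact loss form, and the margin handling here are illustrative simplifications, not mOWL's implementation:

```python
import numpy as np

def gci2_score(c, r, d, rc, rd, margin=0.0):
    """Simplified score for C subsumed-by (exists R. D): the ball of C
    (center c, radius rc) translated by r should lie inside the ball of D
    (center d, radius rd). Hinge on the violation."""
    dist = np.linalg.norm(c + r - d)
    return max(0.0, dist + rc - rd - margin)

# A protein embedding translated by "interacts_with" that lands inside the
# target protein's ball yields zero loss:
c = np.array([0.0, 0.0])
r = np.array([1.0, 0.0])
d = np.array([1.0, 0.0])
print(gci2_score(c, r, d, rc=0.1, rd=0.5))  # 0.0
```

Moving `d` away from `c + r` makes the score positive, which is what drives interacting protein pairs together during training.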

62 changes: 9 additions & 53 deletions docs/source/examples/elmodels/plot_1_elembeddings.rst

Large diffs are not rendered by default.

15 changes: 2 additions & 13 deletions docs/source/examples/elmodels/plot_2_elboxembeddings.ipynb
@@ -1,16 +1,5 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -47,7 +36,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## ELBoxEmbeddings model\n\nThe module :class:`mowl.nn.el.elem.module.ELBoxModule` is used in the :class:`mowl.models.elboxembeddings.model.ELBoxEmbeddings`.\nIn the use case of this example, we will test over a biological problem, which is\nprotein-protein interactions. Given two proteins $p_1,p_2$, the phenomenon\n\"$p_1$ interacts with $p_2$\" is encoded using GCI 2 as:\n\n\\begin{align}p_1 \\sqsubseteq interacts\\_with. p_2\\end{align}\n\nFor that, we can use the class :class:`mowl.models.elembeddings.examples.model_ppi.ELBoxPPI` mode, which uses the :class:`mowl.datasets.builtin.PPIYeastSlimDataset` dataset.\n\n"
"## ELBoxEmbeddings model\n\nThe module :class:`mowl.nn.el.elem.module.ELBoxModule` is used in the :class:`mowl.models.elboxembeddings.model.ELBoxEmbeddings`.\nIn the use case of this example, we will test over a biological problem, which is\nprotein-protein interactions. Given two proteins $p_1,p_2$, the phenomenon\n\"$p_1$ interacts with $p_2$\" is encoded using GCI 2 as:\n\n\\begin{align}p_1 \\sqsubseteq \\exists interacts\\_with. p_2\\end{align}\n\nFor that, we can use the class :class:`mowl.models.elembeddings.examples.model_ppi.ELBoxPPI` mode, which uses the :class:`mowl.datasets.builtin.PPIYeastSlimDataset` dataset.\n\n"
]
},
{
@@ -103,7 +92,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.16"
"version": "3.8.19"
}
},
"nbformat": 4,
2 changes: 1 addition & 1 deletion docs/source/examples/elmodels/plot_2_elboxembeddings.py
@@ -50,7 +50,7 @@
# ":math:`p_1` interacts with :math:`p_2`" is encoded using GCI 2 as:
#
# .. math::
# p_1 \sqsubseteq interacts\_with. p_2
# p_1 \sqsubseteq \exists interacts\_with. p_2
#
# For that, we can use the class :class:`mowl.models.elembeddings.examples.model_ppi.ELBoxPPI` mode, which uses the :class:`mowl.datasets.builtin.PPIYeastSlimDataset` dataset.

101 changes: 9 additions & 92 deletions docs/source/examples/elmodels/plot_2_elboxembeddings.rst

Large diffs are not rendered by default.

25 changes: 13 additions & 12 deletions experiments/evaluators.py
@@ -56,6 +56,17 @@ def create_tuples(self, ontology):
def get_logits(self, batch):
raise NotImplementedError

def evaluate(self, *args, **kwargs):
model = args[0]
mode = kwargs.get("mode")

if mode == "valid":
eval_tuples = self.valid_tuples
else:
eval_tuples = self.test_tuples

return self.evaluate_base(model, eval_tuples, **kwargs)


def evaluate_base(self, model, eval_tuples, mode="test", **kwargs):
num_heads, num_tails = len(self.evaluation_heads), len(self.evaluation_tails)
@@ -70,8 +81,8 @@ def evaluate_base(self, model, eval_tuples, mode="test", **kwargs):
mask = mask1 | mask2
deductive_closure_tuples = self.deductive_closure_tuples[~mask]

# eval_tuples = th.cat([eval_tuples, deductive_closure_tuples], dim=0)
eval_tuples = deductive_closure_tuples
eval_tuples = th.cat([eval_tuples, deductive_closure_tuples], dim=0)
# eval_tuples = deductive_closure_tuples
dataloader = FastTensorDataLoader(eval_tuples, batch_size=self.batch_size, shuffle=False)

metrics = dict()
@@ -224,16 +235,6 @@ def evaluate_base(self, model, eval_tuples, mode="test", **kwargs):
return metrics


def evaluate(self, *args, **kwargs):
model = args[0]
mode = kwargs.get("mode")

if mode == "valid":
eval_tuples = self.valid_tuples
else:
eval_tuples = self.test_tuples

return self.evaluate_base(model, eval_tuples, **kwargs)
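The mask-then-concatenate change in `evaluate_base` (restoring `th.cat` of the evaluation tuples with the filtered deductive-closure tuples) can be sketched in plain Python without the torch dependency. The function name and tuple shapes here are hypothetical; the point is the deduplication-before-append pattern:

```python
def merge_with_deductive_closure(eval_tuples, closure_tuples):
    """Drop deductive-closure tuples already present in the evaluation
    set, then append the remainder -- a plain-Python sketch of the
    boolean-mask filtering followed by concatenation above."""
    seen = set(eval_tuples)
    extra = [t for t in closure_tuples if t not in seen]
    return eval_tuples + extra

merged = merge_with_deductive_closure([(0, 1), (2, 3)], [(2, 3), (4, 5)])
print(merged)  # [(0, 1), (2, 3), (4, 5)]
```

Filtering first keeps each tuple ranked exactly once, so the evaluation metrics are not skewed by duplicates.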


class SubsumptionEvaluator(Evaluator):
@@ -45,7 +45,7 @@ class OWL2VecStarProjector(
var objectPropertyAssertionAxiom = ListBuffer[OWLObjectPropertyAssertionAxiom]()
var subclassOfAxioms = ListBuffer[OWLSubClassOfAxiom]()
var equivalenceAxioms = ListBuffer[OWLEquivalentClassesAxiom]()
var annotationAxioms = ListBuffer[OWLAnnotationAssertionAxiom]()
var annotationAxioms = ListBuffer[(OWLAnnotation, OWLClass)]()
var domainAxioms = ListBuffer[OWLObjectPropertyDomainAxiom]()
var rangeAxioms = ListBuffer[OWLObjectPropertyRangeAxiom]()
var otherAxioms = ListBuffer[OWLAxiom]()
@@ -98,10 +98,6 @@ class OWL2VecStarProjector(

axiom.getAxiomType.getName match {
case "SubClassOf" => subclassOfAxioms += axiom.asInstanceOf[OWLSubClassOfAxiom]
case "AnnotationAssertion" => {
if (include_literals)
annotationAxioms += axiom.asInstanceOf[OWLAnnotationAssertionAxiom]
}
case "EquivalentClasses" => equivalenceAxioms += axiom.asInstanceOf[OWLEquivalentClassesAxiom]
case "ClassAssertion" => classAssertionAxiom += axiom.asInstanceOf[OWLClassAssertionAxiom]
case "ObjectPropertyAssertion" => objectPropertyAssertionAxiom += axiom.asInstanceOf[OWLObjectPropertyAssertionAxiom]
@@ -114,6 +110,23 @@
}
}


if (include_literals) {
for (owl_class <- ontology.getClassesInSignature(imports).asScala.toList){
val annotations = EntitySearcher.getAnnotations(owl_class, ontology).asScala.toList
for (annotation <- annotations){
annotationAxioms += ((annotation, owl_class))
}
}
}
// val annotAxioms = ontology.getAnnotations.asScala.toList
// println("Annotations: ", annotAxioms.size)
// for (axiom <- annotAxioms)
// annotationAxioms += axiom.asInstanceOf[OWLAnnotationAssertionAxiom]
// }



val subclassOfTriples = subclassOfAxioms.flatMap(x => processSubClassAxiom(x.getSubClass, x.getSuperClass, ontology))
val equivalenceTriples = equivalenceAxioms.flatMap(
x => {
@@ -126,10 +139,18 @@
}
}
)
val annotationTriples = annotationAxioms.map(processAnnotationAxiom(_)).flatten
val annotationTriples = annotationAxioms.map{ case (x,y) => processAnnotationAxiom(x,y)}.flatten
val classAssertionTriples = classAssertionAxiom.map(processClassAssertionAxiom(_)).flatten
val objectPropertyAssertionTriples = objectPropertyAssertionAxiom.map(processObjectPropertyAssertionAxiom(_, ontology)).flatten
val domainAndRangeTriples = processDomainAndRangeAxioms(domainAxioms, rangeAxioms, ontology)

println("OWL2VecStar projection done!")
println("\tSubclassOf triples: " + subclassOfTriples.size)
println("\tEquivalence triples: " + equivalenceTriples.size)
println("\tClass assertion triples: " + classAssertionTriples.size)
println("\tObject property assertion triples: " + objectPropertyAssertionTriples.size)
println("\tDomain and range triples: " + domainAndRangeTriples.size)
println("\tAnnotation triples: " + annotationTriples.size)
(subclassOfTriples.toList ::: equivalenceTriples.toList ::: annotationTriples.toList ::: classAssertionTriples.toList ::: objectPropertyAssertionTriples.toList ::: domainAndRangeTriples.toList).distinct.asJava
}
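The fix above switches from per-axiom `AnnotationAssertion` handling to iterating the classes in the signature and collecting `(annotation, class)` pairs via `EntitySearcher`, so the triple's subject is the class itself rather than the raw annotation subject. A hypothetical plain-Python mini-model of that projection step (the URI set and input shape are illustrative, not the OWLAPI types):

```python
# Illustrative subset of the lexical annotation properties the projector
# keeps; the real list lives in lexicalAnnotationURIs.
LEXICAL_ANNOTATION_URIS = {"rdfs:label", "rdfs:comment"}

def project_annotations(class_annotations):
    """class_annotations: dict mapping a class IRI to a list of
    (property, value) annotation pairs gathered per class. Emit one
    (class, property, value) triple per lexical annotation."""
    triples = []
    for owl_class, annotations in class_annotations.items():
        for prop, value in annotations:
            if prop in LEXICAL_ANNOTATION_URIS:
                triples.append((owl_class, prop, value))
    return triples

triples = project_annotations({"ex:Protein1": [("rdfs:label", "p53"),
                                               ("ex:custom", "skipped")]})
print(triples)  # [('ex:Protein1', 'rdfs:label', 'p53')]
```

Keying the triple on the class, not the annotation's own subject, is what makes literal projection line up with the class entities the embedding model actually sees.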

@@ -261,12 +282,12 @@



def processAnnotationAxiom(axiom: OWLAnnotationAssertionAxiom): Option[Triple]= {
def processAnnotationAxiom(axiom: OWLAnnotation, owl_class: OWLClass): Option[Triple]= {
val property = stripValue(axiom.getProperty.toString)

property match {
case m if (lexicalAnnotationURIs contains m) => {
val subject = axiom.getSubject.toString
// val subject = axiom.getSubject.toString
val value = axiom.getValue

val valueStr = value.isLiteral match {
Expand All @@ -282,7 +303,7 @@ class OWL2VecStarProjector(
}
case false => stripValue(axiom.getValue.toString)
}
Some(new Triple(subject, m, valueStr))
Some(new Triple(owl_class, m, valueStr))
}
case _ => {
//println("C ",property)
28 changes: 23 additions & 5 deletions mowl/base_models/elmodel.py
@@ -5,7 +5,7 @@
import torch as th
from torch.utils.data import DataLoader, default_collate

from deprecated.sphinx import versionadded
from deprecated.sphinx import versionadded, versionchanged

from org.semanticweb.owlapi.model import OWLClassExpression, OWLClass, OWLObjectSomeValuesFrom, OWLObjectIntersectionOf

@@ -14,17 +14,28 @@
import mowl.error.messages as msg
import os

@versionchanged(version="1.0.0", reason="Added the 'load_normalized' parameter.")
class EmbeddingELModel(Model):
"""Abstract class for :math:`\mathcal{EL}` embedding methods.

:param dataset: mOWL dataset to use for training and evaluation.
:type dataset: :class:`mowl.datasets.Dataset`
:param embed_dim: The embedding dimension.
:type embed_dim: int
:param batch_size: The batch size to use for training.
:type batch_size: int
:param extended: If `True`, the model works with the 7 EL normal forms. This will be \
reflected in the :class:`DataLoaders` that are generated, and the model must \
contain 7 loss functions. If `False`, the model works with 4 normal forms only, \
merging the 3 extra forms into their corresponding base normal forms. Defaults to True
:type extended: bool, optional
:param load_normalized: If `True`, the ontology is assumed to be normalized and GCIs are extracted directly. Defaults to False.
:type load_normalized: bool, optional
:param device: The device to use for training. Defaults to "cpu".
:type device: str, optional
"""

def __init__(self, dataset, embed_dim, batch_size, extended=True, model_filepath=None, device="cpu"):
def __init__(self, dataset, embed_dim, batch_size, extended=True, model_filepath=None, load_normalized=False, device="cpu"):
super().__init__(dataset, model_filepath=model_filepath)

if not isinstance(embed_dim, int):
@@ -36,6 +47,9 @@ def __init__(self, dataset, embed_dim, batch_size, extended=True, model_filepath
if not isinstance(extended, bool):
raise TypeError("Optional parameter extended must be of type bool.")

if not isinstance(load_normalized, bool):
raise TypeError("Optional parameter load_normalized must be of type bool.")

if not isinstance(device, str):
raise TypeError("Optional parameter device must be of type str.")

@@ -45,7 +59,8 @@ def __init__(self, dataset, embed_dim, batch_size, extended=True, model_filepath
self.embed_dim = embed_dim
self.batch_size = batch_size
self.device = device

self.load_normalized = load_normalized

self._training_datasets = None
self._validation_datasets = None
self._testing_datasets = None
@@ -62,9 +77,12 @@ def _load_datasets(self):
if self._datasets_loaded:
return

training_el_dataset = ELDataset(self.dataset.ontology, self.class_index_dict,
training_el_dataset = ELDataset(self.dataset.ontology,
self.class_index_dict,
self.object_property_index_dict,
extended=self._extended, device=self.device)
extended=self._extended,
load_normalized=self.load_normalized,
device=self.device)

self._training_datasets = training_el_dataset.get_gci_datasets()
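The new `load_normalized` flag changes how the training `ELDataset` is built: when `True`, the ontology is assumed to already be in EL normal form and GCIs are extracted directly; otherwise normalization runs first. A minimal sketch of that switch, with hypothetical names standing in for mOWL's internals:

```python
def build_gci_dataset(ontology_axioms, load_normalized=False):
    """Sketch of the load_normalized switch: validate the flag, then
    either trust the axioms as already-normalized GCIs or run a
    (stubbed) normalization pass first."""
    if not isinstance(load_normalized, bool):
        raise TypeError("Optional parameter load_normalized must be of type bool.")
    if load_normalized:
        # Ontology already normalized: extract GCIs directly.
        return list(ontology_axioms)
    # Otherwise normalize each axiom first (stubbed here).
    return [f"norm({ax})" for ax in ontology_axioms]

print(build_gci_dataset(["C subclassOf D"], load_normalized=True))
# ['C subclassOf D']
```

Skipping normalization for pre-normalized ontologies avoids a redundant, potentially expensive pass over large ontologies at dataset-loading time.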
