chanzuckerberg · nayib-jose-gloria · Mar 8, 2024 · Mar 6, 2024 · Mar 7, 2024 · Mar 7, 2024
diff --git a/api/python/src/cellxgene_ontology_guide/ontology_parser.py b/api/python/src/cellxgene_ontology_guide/ontology_parser.py
@@ -41,8 +41,8 @@ def _parse_ontology_name(self, term_id: str) -> str:
 
     def get_term_ancestors(self, term_id: str, include_self: bool = False) -> List[str]:
         """
-        Get the ancestor ontology terms for a given term. If include_self is True, the term itself will be included as an
-         ancestor.
+        Get the ancestor ontology terms for a given term. If include_self is True, the term itself will be included as
+        an ancestor.
 
          Example: get_term_ancestors("CL:0000005") -> ["CL:0000000", ...]
 
@@ -51,13 +51,13 @@ def get_term_ancestors(self, term_id: str, include_self: bool = False) -> List[s
         :return: flattened List[str] of ancestor terms
         """
         ontology_name = self._parse_ontology_name(term_id)
-        ancestors: List[str] = self.cxg_schema.ontology(ontology_name)[term_id]["ancestors"]
+        ancestors = list(self.cxg_schema.ontology(ontology_name)[term_id]["ancestors"].keys())
         return ancestors + [term_id] if include_self else ancestors
 
-    def get_term_list_ancestors(self, term_ids: str, include_self: bool = False) -> Dict[str, List[str]]:
+    def get_term_list_ancestors(self, term_ids: List[str], include_self: bool = False) -> Dict[str, List[str]]:
         """
-        Get the ancestor ontology terms for each term in a list. If include_self is True, the term itself will be included
-         as an ancestor.
+        Get the ancestor ontology terms for each term in a list. If include_self is True, the term itself will be
+        included as an ancestor.
 
          Example: get_term_list_ancestors(["CL:0000003", "CL:0000005"], include_self=True) -> {
             "CL:0000003": ["CL:0000003"],
@@ -71,10 +71,102 @@ def get_term_list_ancestors(self, term_ids: str, include_self: bool = False) ->
         """
         return {term_id: self.get_term_ancestors(term_id, include_self) for term_id in term_ids}
 
+    def map_high_level_terms(
+        self, term_ids: List[str], high_level_terms: List[str], include_self: bool = True
+    ) -> Dict[str, List[str]]:
+        """
+        Map each ontology term ID to the subset of its ancestors that appear in high_level_terms, e.g.
+
+        {"CL:0000003": ["CL:0000000", ...], "CL:0000005": ["CL:0000000", ...]}
+
+        Ancestors are looked up with get_term_list_ancestors; only those also present in high_level_terms are kept.
+
+        :param term_ids: list of str ontology terms to map high level terms for
+        :param high_level_terms: list of str ontology terms that can be mapped to descendant term_ids
+        :param include_self: bool forwarded to get_term_list_ancestors, so a term_id that is itself in
+        high_level_terms is mapped to itself
+        :return: Dictionary mapping each str term ID to the List[str] of its ancestors found in high_level_terms.
+        A key maps to an empty list when none of its ancestors are in high_level_terms.
+        """
+        # one ancestor lookup for the whole batch, then filter every list down to the requested high-level terms
+        ancestors_by_term = self.get_term_list_ancestors(term_ids, include_self)
+        return {
+            term_id: [ancestor for ancestor in term_ancestors if ancestor in high_level_terms]
+            for term_id, term_ancestors in ancestors_by_term.items()
+        }
+
+    def get_distance_between_terms(self, ontology: Ontology, term_id_1: str, term_id_2: str) -> int:
+        """
+        Get the distance between two ontology terms, defined as the number of edges on the shortest path between
+        them through a lowest common ancestor. Terms must be from the same ontology. Returns -1 if disjoint.
+
+        :param ontology: Ontology enum of the ontology to find distance for
+        :param term_id_1: str ontology term to find distance for
+        :param term_id_2: str ontology term to find distance for
+        :return: int number of edges between the two terms, or -1 if they share no common ancestor.
+        """
+        lcas = self.get_lowest_common_ancestors(ontology, term_id_1, term_id_2)
+        if not lcas:
+            return -1
+        terms = self.cxg_schema.ontology(ontology.name)
+        # a term is not a key of its own "ancestors" map, so when the LCA is one of the two inputs its distance
+        # to itself must default to 0 instead of raising KeyError; any LCA works, so the first one is used
+        return int(terms[term_id_1]["ancestors"].get(lcas[0], 0) + terms[term_id_2]["ancestors"].get(lcas[0], 0))
+
+    def get_lowest_common_ancestors(self, ontology: Ontology, term_id_1: str, term_id_2: str) -> List[str]:
+        """
+        Get the lowest common ancestors of two ontology terms from the given ontology: the shared ancestors with the
+        smallest summed distance to both terms. Ontologies are DAGs, so several terms may tie for lowest.
+
+        :param ontology: Ontology enum of the ontology both terms belong to
+        :param term_id_1: str ontology term to find LCA for
+        :param term_id_2: str ontology term to find LCA for
+        :return: List[str] of lowest common ancestor term IDs (may be a term itself); empty if none are shared
+        """
+        # copy each ancestor->distance map and add the term itself at distance 0 (dicts do not support "+")
+        ancestors_1 = {**self.cxg_schema.ontology(ontology.name)[term_id_1]["ancestors"], term_id_1: 0}
+        ancestors_2 = {**self.cxg_schema.ontology(ontology.name)[term_id_2]["ancestors"], term_id_2: 0}
+        common_ancestors = set(ancestors_1.keys()) & set(ancestors_2.keys())
+        min_sum_distances = min((ancestors_1[x] + ancestors_2[x] for x in common_ancestors), default=None)
+        return [term for term in common_ancestors if ancestors_1[term] + ancestors_2[term] == min_sum_distances]
+
+    def map_highest_level_term(
+        self, term_ids: List[str], high_level_terms: List[str], include_self: bool = True
+    ) -> Dict[str, Union[str, None]]:
+        """
+        Given a list of ontology term IDs and a list of high_level_terms to map them to, returns a dictionary with
+        format
+
+        {"CL:0000003": "CL:0000000", "CL:0000005": "CL:0000000"}
+
+        Where each term_id is mapped to the highest level term that it is a descendant of, from the list provided. Maps
+        to None if term_id does not map to any high level terms among the provided input.
+
+        :param term_ids: list of str ontology terms to map high level terms for
+        :param high_level_terms: list of str ontology terms that can be mapped to descendant term_ids
+        :param include_self: bool to map a term_id to itself if it is in high_level_terms
+        :return: Dictionary mapping each str term ID to the single high-level term farthest from it (by ancestor
+        distance), or to None if none of the provided high_level_terms is an ancestor of the term.
+        """
+        high_level_term_map = self.map_high_level_terms(term_ids, high_level_terms, include_self)
+        highest_level_term_map: Dict[str, Union[str, None]] = {}
+        for term_id in term_ids:
+            candidates = high_level_term_map[term_id]
+            if not candidates:
+                highest_level_term_map[term_id] = None
+                continue
+            ontology = self._parse_ontology_name(term_id)
+            distances = self.cxg_schema.ontology(ontology)[term_id]["ancestors"]
+            # map term_id to the candidate with the longest distance from it; with include_self the term itself
+            # can be a candidate, and it is not a key of its own "ancestors" map, so it is treated as distance 0
+            # rather than raising KeyError
+            highest_level_term_map[term_id] = max(candidates, key=lambda x: distances.get(x, 0))
+        return highest_level_term_map
+
     def get_terms_descendants(self, term_ids: List[str], include_self: bool = False) -> Dict[str, List[str]]:
         """
-        Get the descendant ontology terms for each term in a list. If include_self is True, the term itself will be included
-         as a descendant.
+        Get the descendant ontology terms for each term in a list. If include_self is True, the term itself will be
+        included as a descendant.
 
         Example: get_terms_descendants(["CL:0000003", "CL:0000005"], include_self=True) -> {
             "CL:0000003": ["CL:0000003", "CL:0000004", ...],
@@ -83,8 +175,8 @@ def get_terms_descendants(self, term_ids: List[str], include_self: bool = False)
 
         :param term_ids: list of str ontology terms to find descendants for
         :param include_self: boolean flag to include the term itself as an descendant
-        :return: Dictionary mapping str term IDs to their respective flattened List[str] of descendant terms. Maps to empty
-        list if there are no descendants.
+        :return: Dictionary mapping str term IDs to their respective flattened List[str] of descendant terms. Maps to
+        empty list if there are no descendants.
         """
         descendants_dict = dict()
         ontology_names = set()
@@ -96,7 +188,8 @@ def get_terms_descendants(self, term_ids: List[str], include_self: bool = False)
         for ontology in ontology_names:
             for candidate_descendant, candidate_metadata in self.cxg_schema.ontology(ontology).items():
                 for ancestor_id in descendants_dict:
-                    if ancestor_id in candidate_metadata["ancestors"]:
+                    ancestors = candidate_metadata["ancestors"].keys()
+                    if ancestor_id in ancestors:
                         descendants_dict[ancestor_id].append(candidate_descendant)
 
         return descendants_dict

diff --git a/api/python/tests/test_ontology_parser.py b/api/python/tests/test_ontology_parser.py
@@ -9,23 +9,23 @@
 @pytest.fixture
 def ontology_dict():
     return {
-        "CL:0000000": {"ancestors": [], "label": "cell A", "deprecated": False},
+        "CL:0000000": {"ancestors": {}, "label": "cell A", "deprecated": False},
         "CL:0000001": {
-            "ancestors": ["CL:0000000"],
+            "ancestors": {"CL:0000000": 1},
             "label": "cell B",
             "deprecated": False,
             "consider": ["CL:0000004"],
         },
-        "CL:0000002": {"ancestors": ["CL:0000000"], "label": "cell C", "deprecated": False},
+        "CL:0000002": {"ancestors": {"CL:0000000": 1}, "label": "cell C", "deprecated": False},
         "CL:0000003": {
-            "ancestors": ["CL:0000000"],
+            "ancestors": {"CL:0000000": 1},
             "label": "obsolete cell",
             "deprecated": True,
             "replaced_by": "CL:0000004",
             "comments": ["this term was deprecated in favor of a descendant term of CL:0000001"],
             "term_tracker": "http://example.com/issue/1234",
         },
-        "CL:0000004": {"ancestors": ["CL:0000001", "CL:0000000"], "label": "cell B2", "deprecated": False},
+        "CL:0000004": {"ancestors": {"CL:0000000": 1, "CL:0000001": 2}, "label": "cell B2", "deprecated": False},
     }
 
 
@@ -62,22 +62,22 @@ def test_parse_ontology_name__not_supported(ontology_parser):
 
 
 def test_get_term_ancestors(ontology_parser):
-    assert ontology_parser.get_term_ancestors("CL:0000004") == ["CL:0000001", "CL:0000000"]
+    assert ontology_parser.get_term_ancestors("CL:0000004") == ["CL:0000000", "CL:0000001"]
     assert ontology_parser.get_term_ancestors("CL:0000004", include_self=True) == [
-        "CL:0000001",
         "CL:0000000",
+        "CL:0000001",
         "CL:0000004",
     ]
 
 
 def test_get_term_list_ancestors(ontology_parser):
     assert ontology_parser.get_term_list_ancestors(["CL:0000000", "CL:0000004"]) == {
         "CL:0000000": [],
-        "CL:0000004": ["CL:0000001", "CL:0000000"],
+        "CL:0000004": ["CL:0000000", "CL:0000001"],
     }
     assert ontology_parser.get_term_list_ancestors(["CL:0000000", "CL:0000004"], include_self=True) == {
         "CL:0000000": ["CL:0000000"],
-        "CL:0000004": ["CL:0000001", "CL:0000000", "CL:0000004"],
+        "CL:0000004": ["CL:0000000", "CL:0000001", "CL:0000004"],
     }
 
 

diff --git a/artifact-schemas/all_ontology_schema.json b/artifact-schemas/all_ontology_schema.json
@@ -1,10 +1,10 @@
 {
   "$schema": "http://json-schema.org/draft-07/schema#",
   "title": "Valid Ontology Term JSON Schema",
   "description": "Schema for file containing metadata for Ontology Terms accepted in dataset submissions to CZ CellXGene Data Portal.",
   "type": "object",
   "patternProperties": {
-    "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7}$": {
+    "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7,}$": {
       "type": "object",
       "properties": {
         "label": {
@@ -16,13 +16,13 @@
           "description": "Indicates whether the ontology entry is deprecated."
         },
         "ancestors": {
-          "type": "array",
-          "items": {
-            "type": "string",
-            "pattern": "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7}$",
-            "description": "List of ancestor IDs for the ontology entry."
-          },
-          "description": "An array of ancestor ontology terms that this term is a subclass of."
+          "type": "object",
+          "description": "A map of ancestor ontology terms that this term is a subclass of, keyed to the distance from the term.",
+          "patternProperties": {
+            "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7,}$": {
+              "type": "integer"
+            }
+          }
         },
         "comments": {
           "type": "array",
@@ -47,7 +47,7 @@
         },
         "replaced_by": {
           "type": "string",
-          "pattern": "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7}$",
+          "pattern": "^(EFO|UBERON|CL|HANCESTRO|HsapDv|MmusDv|PATO|NCBITaxon|MONDO):[0-9]{7,}$",
           "description": "If deprecated, the ID of the ontology entry that should canonically replace this one."
         }
       },

diff --git a/artifact-schemas/cell_class_list_schema.json b/artifact-schemas/cell_class_list_schema.json
@@ -5,7 +5,7 @@
   "type": "array",
   "items": {
     "type": "string",
-    "pattern": "^CL:[0-9]{7}$"
+    "pattern": "^CL:[0-9]{7,}$"
   },
   "minItems": 1,
   "uniqueItems": true

diff --git a/artifact-schemas/cell_subclass_list_schema.json b/artifact-schemas/cell_subclass_list_schema.json
@@ -5,7 +5,7 @@
   "type": "array",
   "items": {
     "type": "string",
-    "pattern": "^CL:[0-9]{7}$"
+    "pattern": "^CL:[0-9]{7,}$"
   },
   "minItems": 1,
   "uniqueItems": true

diff --git a/artifact-schemas/cell_type_descendants_schema.json b/artifact-schemas/cell_type_descendants_schema.json
@@ -4,11 +4,11 @@
   "description": "A schema for mapping cell type ontology Terms to its descendant cell type ontology terms",
   "type": "object",
   "patternProperties": {
-    "^CL:[0-9]{7}$": {
+    "^CL:[0-9]{7,}$": {
       "type": "array",
       "items": {
         "type": "string",
-        "pattern": "^CL:[0-9]{7}$"
+        "pattern": "^CL:[0-9]{7,}$"
       },
       "description": "An array of CL ontology term IDs that are subclasses of the key CL ontology term ID."
     }

diff --git a/artifact-schemas/organ_list_schema.json b/artifact-schemas/organ_list_schema.json
@@ -5,7 +5,7 @@
   "type": "array",
   "items": {
     "type": "string",
-    "pattern": "^UBERON:[0-9]{7}$"
+    "pattern": "^UBERON:[0-9]{7,}$"
   },
   "minItems": 1,
   "uniqueItems": true

diff --git a/artifact-schemas/system_list_schema.json b/artifact-schemas/system_list_schema.json
@@ -5,7 +5,7 @@
   "type": "array",
   "items": {
     "type": "string",
-    "pattern": "^UBERON:[0-9]{7}$"
+    "pattern": "^UBERON:[0-9]{7,}$"
   },
   "minItems": 1,
   "uniqueItems": true

diff --git a/artifact-schemas/tissue_descendants_schema.json b/artifact-schemas/tissue_descendants_schema.json
@@ -4,11 +4,11 @@
   "description": "A schema for mapping Tissue ontology terms to its descendant Tissue ontology terms",
   "type": "object",
   "patternProperties": {
-    "^UBERON:[0-9]{7}$": {
+    "^UBERON:[0-9]{7,}$": {
       "type": "array",
       "items": {
         "type": "string",
-        "pattern": "^UBERON:[0-9]{7}$"
+        "pattern": "^UBERON:[0-9]{7,}$"
       },
       "description": "An array of UBERON ontology term IDs that are subclasses of the key UBERON ontology term ID."
     }

diff --git a/artifact-schemas/tissue_general_list_schema.json b/artifact-schemas/tissue_general_list_schema.json
@@ -5,7 +5,7 @@
   "type": "array",
   "items": {
     "type": "string",
-    "pattern": "^UBERON:[0-9]{7}$"
+    "pattern": "^UBERON:[0-9]{7,}$"
   },
   "minItems": 1,
   "uniqueItems": true

diff --git a/ontology-assets/CL-ontology-v2024-01-04.json.gz b/ontology-assets/CL-ontology-v2024-01-04.json.gz
diff --git a/ontology-assets/EFO-ontology-v3.62.0.json.gz b/ontology-assets/EFO-ontology-v3.62.0.json.gz
diff --git a/ontology-assets/HANCESTRO-ontology-3.0.json.gz b/ontology-assets/HANCESTRO-ontology-3.0.json.gz
diff --git a/ontology-assets/HsapDv-ontology-11.json.gz b/ontology-assets/HsapDv-ontology-11.json.gz
diff --git a/ontology-assets/MONDO-ontology-v2024-01-03.json.gz b/ontology-assets/MONDO-ontology-v2024-01-03.json.gz
diff --git a/ontology-assets/MmusDv-ontology-9.json.gz b/ontology-assets/MmusDv-ontology-9.json.gz
diff --git a/ontology-assets/NCBITaxon-ontology-v2023-06-20.json.gz b/ontology-assets/NCBITaxon-ontology-v2023-06-20.json.gz
diff --git a/ontology-assets/PATO-ontology-v2023-05-18.json.gz b/ontology-assets/PATO-ontology-v2023-05-18.json.gz
diff --git a/ontology-assets/UBERON-ontology-v2024-01-18.json.gz b/ontology-assets/UBERON-ontology-v2024-01-18.json.gz