Skip to content

Commit

Permalink
feat: Add API ontology querying module (#39)
Browse files Browse the repository at this point in the history
## Reason for Change

- #14

## Changes

- implement all_ontology querying functions to fetch metadata, for
single term IDs and bulk IDs
- reorganize file structure to store artifacts that will be packaged
with the API in the api/python directory, rather than in the
ontology-builder directory (which will not be packaged).
- In turn, reorganize the env.py constants + GHAs pointing to
directories to read from / output generated files to.

## Testing steps
- unit tests 

## Notes for Reviewer
  • Loading branch information
Bento007 authored Feb 23, 2024
2 parents bd6d842 + 2c88a50 commit 239ef2b
Show file tree
Hide file tree
Showing 11 changed files with 336 additions and 5 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/generate_all_ontology.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Updates to Ontology Files
on:
push:
paths:
- "**/tools/ontology-builder/src/ontology-references/ontology_info.yml"
- "**/api/python/src/cellxgene_ontology_guide/artifacts/ontology_info.yml"
branches-ignore:
- main

Expand Down Expand Up @@ -36,7 +36,7 @@ jobs:
- name: ontology-processing
run: |
python3 ./tools/ontology-builder/src/all_ontology_generator.py
git add ./tools/ontology-builder/src/ontology-references/all_ontology.json.gz
git add ./api/python/src/cellxgene_ontology_guide/artifacts/all_ontology.json.gz
- name: Commit
run: |
git commit -m "AUTO: update ontologies"
Expand Down
Empty file removed api/python/__init__.py
Empty file.
2 changes: 1 addition & 1 deletion api/python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ license = { text = "MIT" }
readme = "README.md"
requires-python = "~= 3.11"
dependencies = [
"owlready2"
"PyYAML"
]

[project.optional-dependencies]
Expand Down
19 changes: 19 additions & 0 deletions api/python/src/cellxgene_ontology_guide/artifact_download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os

from constants import ARTIFACT_DIR, CURRENT_SCHEMA_VERSION


def load_artifact_by_schema(schema_version: str, filename: str) -> str:
    """
    Resolve the path of a packaged ontology artifact for the requested schema version.

    Only CURRENT_SCHEMA_VERSION is handled: the returned path is ARTIFACT_DIR joined with the filename. The path is
    built, not checked, so the file is not guaranteed to exist.
    :param schema_version: str version of the schema to load ontology files for
    :param filename: str name of the file to load
    :return: str path to the ontology file
    :raises ValueError: if schema_version is anything other than CURRENT_SCHEMA_VERSION
    """
    if schema_version != CURRENT_SCHEMA_VERSION:
        # TODO: Add support for loading ontology files from different schema versions
        raise ValueError(f"Schema version {schema_version} is not supported in this package version.")

    return os.path.join(ARTIFACT_DIR, filename)
Binary file not shown.
7 changes: 7 additions & 0 deletions api/python/src/cellxgene_ontology_guide/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
import os

# Schema version whose ontology artifacts ship with this package.
CURRENT_SCHEMA_VERSION = "5.0.0"

# File names of the packaged artifacts.
ONTOLOGY_INFO_FILENAME = "ontology_info.yml"
ALL_ONTOLOGY_FILENAME = "all_ontology.json.gz"

# Directory holding this module (symlinks resolved), and the artifacts directory beneath it.
PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__))
ARTIFACT_DIR = os.path.join(PACKAGE_ROOT, "artifacts")
172 changes: 172 additions & 0 deletions api/python/src/cellxgene_ontology_guide/ontology_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import gzip
import json
import re
from typing import Any, Dict, List, Union

import yaml
from artifact_download import load_artifact_by_schema
from constants import ALL_ONTOLOGY_FILENAME, CURRENT_SCHEMA_VERSION, ONTOLOGY_INFO_FILENAME


class OntologyParser:
    """
    An object to parse ontology term metadata from ontologies corresponding to a given CellxGene Schema Version.

    Two attributes are populated on construction:
    - ontology_dict: parsed content of the all-ontology JSON artifact. The methods below index it as
      ontology_dict[ontology_name][term_id][field].
    - supported_ontologies: parsed content of the ontology info YAML artifact. Only membership of an ontology name
      in it is used by this class.
    """

    def __init__(self, schema_version: str = CURRENT_SCHEMA_VERSION):
        """
        Initialize an OntologyParser object with the ontology metadata corresponding to the given CellxGene schema
        version. By default, loads the ontology metadata for the current schema version from the packaged artifacts
        on disk. Artifact paths are resolved through load_artifact_by_schema, which raises ValueError for a schema
        version it does not support.
        :param schema_version: str version of the schema to load ontology metadata for
        :raises ValueError: propagated from load_artifact_by_schema for an unsupported schema version
        """
        all_ontology_filepath = load_artifact_by_schema(schema_version, ALL_ONTOLOGY_FILENAME)
        ontology_info_filepath = load_artifact_by_schema(schema_version, ONTOLOGY_INFO_FILENAME)

        # Decode explicitly as UTF-8 so loading does not depend on the platform's default locale encoding.
        with gzip.open(all_ontology_filepath, "rt", encoding="utf-8") as f:
            self.ontology_dict: Dict[str, Any] = json.load(f)

        with open(ontology_info_filepath, "rt", encoding="utf-8") as f:
            self.supported_ontologies: Dict[str, Any] = yaml.safe_load(f)

    def _parse_ontology_name(self, term_id: str) -> str:
        """
        Parse the ontology name from a given term ID. If the term ID does not conform to the expected term format or
        is not from an ontology supported by cellxgene-ontology-guide, raise a ValueError.
        :param term_id: str ontology term to parse
        :return: str name of ontology that term belongs to
        :raises ValueError: if term_id is malformed or its ontology prefix is not in supported_ontologies
        """
        pattern = r"[A-Za-z]+:\d+"
        # fullmatch, not match: the whole ID must fit the pattern, so trailing text such as "CL:0000001xyz" is rejected.
        if not re.fullmatch(pattern, term_id):
            raise ValueError(f"{term_id} does not conform to expected regex pattern {pattern} and cannot be queried.")

        ontology_name = term_id.split(":")[0]
        if ontology_name not in self.supported_ontologies:
            raise ValueError(f"{term_id} is not part of a supported ontology, its metadata cannot be fetched.")

        return ontology_name

    def get_term_ancestors(self, term_id: str, include_self: bool = False) -> List[str]:
        """
        Get the ancestor ontology terms for a given term. If include_self is True, the term itself is appended after
        its ancestors.
        Example: get_term_ancestors("CL:0000005") -> ["CL:0000000", ...]
        :param term_id: str ontology term to find ancestors for
        :param include_self: boolean flag to include the term itself as an ancestor
        :return: flattened List[str] of ancestor terms (a new list on every call)
        :raises ValueError: if term_id is malformed or from an unsupported ontology
        :raises KeyError: if term_id is not present in the loaded ontology data
        """
        ontology_name = self._parse_ontology_name(term_id)
        ancestors: List[str] = self.ontology_dict[ontology_name][term_id]["ancestors"]
        # Always hand back a fresh list so callers cannot mutate the parser's internal data by accident.
        return ancestors + [term_id] if include_self else list(ancestors)

    def get_term_list_ancestors(self, term_ids: List[str], include_self: bool = False) -> Dict[str, List[str]]:
        """
        Get the ancestor ontology terms for each term in a list. If include_self is True, the term itself is appended
        after its ancestors.
        Example: get_term_list_ancestors(["CL:0000003", "CL:0000005"], include_self=True) -> {
        "CL:0000003": ["CL:0000003"],
        "CL:0000005": ["CL:0000000", ..., "CL:0000005"]
        }
        :param term_ids: list of str ontology terms to find ancestors for
        :param include_self: boolean flag to include the term itself as an ancestor
        :return: Dictionary mapping str term IDs to their respective flattened List[str] of ancestor terms. Maps to
        empty list if there are no ancestors.
        :raises ValueError: if any term ID is malformed or from an unsupported ontology
        :raises KeyError: if any term ID is not present in the loaded ontology data
        """
        return {term_id: self.get_term_ancestors(term_id, include_self) for term_id in term_ids}

    def get_terms_descendants(self, term_ids: List[str], include_self: bool = False) -> Dict[str, List[str]]:
        """
        Get the descendant ontology terms for each term in a list. If include_self is True, the term itself is placed
        first in its descendant list.
        Example: get_terms_descendants(["CL:0000003", "CL:0000005"], include_self=True) -> {
        "CL:0000003": ["CL:0000003", "CL:0000004", ...],
        "CL:0000005": ["CL:0000005", "CL:0002363", ...]
        }
        :param term_ids: list of str ontology terms to find descendants for
        :param include_self: boolean flag to include the term itself as a descendant
        :return: Dictionary mapping str term IDs to their respective flattened List[str] of descendant terms. Maps to
        empty list if there are no descendants.
        :raises ValueError: if any term ID is malformed or from an unsupported ontology
        """
        descendants_dict: Dict[str, List[str]] = {}
        # A dict used as an ordered set: ontologies are scanned in first-seen order, keeping results deterministic.
        ontology_names: Dict[str, None] = {}
        for term_id in term_ids:
            ontology_name = self._parse_ontology_name(term_id)
            descendants_dict[term_id] = [term_id] if include_self else []
            ontology_names[ontology_name] = None

        for ontology in ontology_names:
            for candidate_descendant, candidate_metadata in self.ontology_dict[ontology].items():
                # Intersect the queried terms with this candidate's ancestors in one step instead of scanning the
                # ancestor list once per queried term.
                matched_ancestors = descendants_dict.keys() & candidate_metadata["ancestors"]
                for ancestor_id in matched_ancestors:
                    descendants_dict[ancestor_id].append(candidate_descendant)

        return descendants_dict

    def is_term_deprecated(self, term_id: str) -> bool:
        """
        Check if an ontology term is deprecated. A term without a "deprecated" entry is reported as not deprecated.
        Example: is_term_deprecated("CL:0000003") -> True
        :param term_id: str ontology term to check for deprecation
        :return: boolean flag indicating whether the term is deprecated
        :raises ValueError: if term_id is malformed or from an unsupported ontology
        :raises KeyError: if term_id is not present in the loaded ontology data
        """
        ontology_name = self._parse_ontology_name(term_id)
        # Coerce to bool so a missing key yields False rather than None, matching the annotated return type.
        return bool(self.ontology_dict[ontology_name][term_id].get("deprecated", False))

    def get_term_replacement(self, term_id: str) -> Union[str, None]:
        """
        Fetch the replacement term for a deprecated ontology term, if a replacement exists. Return None otherwise.
        Example: get_term_replacement("CL:0000003") -> "CL:0000000"
        :param term_id: str ontology term to check a replacement term for
        :return: replacement str term ID if it exists, None otherwise
        :raises ValueError: if term_id is malformed or from an unsupported ontology
        :raises KeyError: if term_id is not present in the loaded ontology data
        """
        ontology_name = self._parse_ontology_name(term_id)
        replaced_by: Union[str, None] = self.ontology_dict[ontology_name][term_id].get("replaced_by")
        # A missing or empty value both mean "no replacement".
        return replaced_by or None

    def get_term_metadata(self, term_id: str) -> Dict[str, Any]:
        """
        Fetch metadata for a given ontology term. Returns a dict with format
        {"comments": ["...", ...], "term_tracker": "...", "consider": ["...", ...]}
        "comments" maps to List[str] of ontology curator comments
        "term_tracker" maps to a str url where there is discussion around this term's curation (or deprecation).
        "consider" maps to List[str] of alternate ontology terms to consider using instead of this term
        All keys map to None if no metadata of that type is present.
        :param term_id: str ontology term to fetch metadata for
        :return: Dict with keys "comments", "term_tracker", and "consider" containing associated metadata.
        :raises ValueError: if term_id is malformed or from an unsupported ontology
        :raises KeyError: if term_id is not present in the loaded ontology data
        """
        ontology_name = self._parse_ontology_name(term_id)
        term_entry = self.ontology_dict[ontology_name][term_id]
        # A tuple (not a set) keeps the key order of the returned dict stable between runs.
        return {key: term_entry.get(key, None) for key in ("comments", "term_tracker", "consider")}

    def get_term_label(self, term_id: str) -> str:
        """
        Fetch the human-readable label for a given ontology term.
        Example: get_term_label("CL:0000005") -> "fibroblast neural crest derived"
        :param term_id: str ontology term to fetch label for
        :return: str human-readable label for the term
        :raises ValueError: if term_id is malformed or from an unsupported ontology
        :raises KeyError: if term_id, or its "label" entry, is not present in the loaded ontology data
        """
        ontology_name = self._parse_ontology_name(term_id)
        label: str = self.ontology_dict[ontology_name][term_id]["label"]
        return label
19 changes: 19 additions & 0 deletions api/python/tests/test_artifact_download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import os

import pytest
from cellxgene_ontology_guide.artifact_download import load_artifact_by_schema
from cellxgene_ontology_guide.constants import ARTIFACT_DIR, CURRENT_SCHEMA_VERSION


def test_load_artifact_by_schema():
    """For the current schema version, each artifact resolves to a path directly under ARTIFACT_DIR."""
    for artifact_name in ("ontology_info.yml", "all_ontology.json.gz"):
        expected_path = os.path.join(ARTIFACT_DIR, artifact_name)
        resolved_path = load_artifact_by_schema(CURRENT_SCHEMA_VERSION, artifact_name)
        assert resolved_path == expected_path


def test_load_artifact_by_schema_raises_value_error():
    """A schema version other than the current one is rejected with ValueError."""
    unsupported_version = "0.0.0"
    with pytest.raises(ValueError):
        load_artifact_by_schema(unsupported_version, "ontology_info.yml")
112 changes: 112 additions & 0 deletions api/python/tests/test_ontology_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import pytest
from cellxgene_ontology_guide.ontology_parser import OntologyParser


@pytest.fixture(scope="module")
def ontology_dict():
    """
    Small in-memory ontology data set with a single "CL" ontology of five terms.

    CL:0000000 is the root, CL:0000003 is deprecated with a replacement, and CL:0000004 sits two levels deep.
    Insertion order matters: the descendant tests compare against lists in this order.
    """
    cl_terms = {}
    cl_terms["CL:0000000"] = {"ancestors": [], "label": "cell A", "deprecated": False}
    cl_terms["CL:0000001"] = {
        "ancestors": ["CL:0000000"],
        "label": "cell B",
        "deprecated": False,
        "consider": ["CL:0000004"],
    }
    cl_terms["CL:0000002"] = {"ancestors": ["CL:0000000"], "label": "cell C", "deprecated": False}
    cl_terms["CL:0000003"] = {
        "ancestors": ["CL:0000000"],
        "label": "obsolete cell",
        "deprecated": True,
        "replaced_by": "CL:0000004",
        "comments": ["this term was deprecated in favor of a descendant term of CL:0000001"],
        "term_tracker": "http://example.com/issue/1234",
    }
    cl_terms["CL:0000004"] = {"ancestors": ["CL:0000001", "CL:0000000"], "label": "cell B2", "deprecated": False}
    return {"CL": cl_terms}


@pytest.fixture(scope="module")
def supported_ontologies():
    """Ontology info mapping that declares "CL" as the only supported ontology."""
    cl_info = {"version": "2024-01-01", "source": "http://example.com", "filetype": "owl"}
    return {"CL": cl_info}


@pytest.fixture(scope="module")
def ontology_parser(ontology_dict, supported_ontologies):
    """OntologyParser built with defaults, then pointed at the small fixture data instead of what it loaded."""
    instance = OntologyParser()
    # Replace the loaded data so the tests run against the deterministic fixtures above.
    instance.supported_ontologies = supported_ontologies
    instance.ontology_dict = ontology_dict
    return instance


def test_parse_ontology_name(ontology_parser):
    """A well-formed term ID from a supported ontology yields its ontology prefix."""
    parsed_name = ontology_parser._parse_ontology_name("CL:0000001")
    assert parsed_name == "CL"


def test_parse_ontology_name__wrong_format(ontology_parser):
    """A term ID using "_" instead of ":" does not match the expected format and is rejected."""
    malformed_term_id = "CL_0000001"
    with pytest.raises(ValueError):
        ontology_parser._parse_ontology_name(malformed_term_id)


def test_parse_ontology_name__not_supported(ontology_parser):
    """A well-formed term ID whose ontology is not in supported_ontologies is rejected."""
    unsupported_term_id = "GO:0000001"
    with pytest.raises(ValueError):
        ontology_parser._parse_ontology_name(unsupported_term_id)


def test_get_term_ancestors(ontology_parser):
    """Ancestors are returned as stored; include_self appends the term after them."""
    without_self = ontology_parser.get_term_ancestors("CL:0000004")
    assert without_self == ["CL:0000001", "CL:0000000"]

    with_self = ontology_parser.get_term_ancestors("CL:0000004", include_self=True)
    assert with_self == ["CL:0000001", "CL:0000000", "CL:0000004"]


def test_get_term_list_ancestors(ontology_parser):
    """Bulk ancestor lookup maps each queried term to its ancestors, with and without the term itself."""
    queried_terms = ["CL:0000000", "CL:0000004"]

    without_self = ontology_parser.get_term_list_ancestors(queried_terms)
    assert without_self == {
        "CL:0000000": [],
        "CL:0000004": ["CL:0000001", "CL:0000000"],
    }

    with_self = ontology_parser.get_term_list_ancestors(queried_terms, include_self=True)
    assert with_self == {
        "CL:0000000": ["CL:0000000"],
        "CL:0000004": ["CL:0000001", "CL:0000000", "CL:0000004"],
    }


def test_get_terms_descendants(ontology_parser):
    """Bulk descendant lookup lists every term naming the queried term as an ancestor, in ontology order."""
    queried_terms = ["CL:0000000", "CL:0000004"]

    without_self = ontology_parser.get_terms_descendants(queried_terms)
    assert without_self == {
        "CL:0000000": ["CL:0000001", "CL:0000002", "CL:0000003", "CL:0000004"],
        "CL:0000004": [],
    }

    with_self = ontology_parser.get_terms_descendants(queried_terms, include_self=True)
    assert with_self == {
        "CL:0000000": ["CL:0000000", "CL:0000001", "CL:0000002", "CL:0000003", "CL:0000004"],
        "CL:0000004": ["CL:0000004"],
    }


def test_is_term_deprecated(ontology_parser):
    """The deprecated fixture term is reported as deprecated; an active term is not."""
    obsolete_result = ontology_parser.is_term_deprecated("CL:0000003")
    active_result = ontology_parser.is_term_deprecated("CL:0000004")
    assert obsolete_result
    assert not active_result


def test_get_term_replacement(ontology_parser):
    """A term with a "replaced_by" entry returns it; a term without one returns None."""
    replacement = ontology_parser.get_term_replacement("CL:0000003")
    assert replacement == "CL:0000004"

    no_replacement = ontology_parser.get_term_replacement("CL:0000004")
    assert no_replacement is None


def test_get_term_metadata(ontology_parser):
    """Metadata always carries the three expected keys, with None for any that the term lacks."""
    deprecated_term_metadata = ontology_parser.get_term_metadata("CL:0000003")
    assert deprecated_term_metadata == {
        "comments": ["this term was deprecated in favor of a descendant term of CL:0000001"],
        "term_tracker": "http://example.com/issue/1234",
        "consider": None,
    }

    active_term_metadata = ontology_parser.get_term_metadata("CL:0000001")
    assert active_term_metadata == {
        "comments": None,
        "term_tracker": None,
        "consider": ["CL:0000004"],
    }


def test_get_term_label(ontology_parser):
    """The label stored for a term is returned unchanged."""
    label = ontology_parser.get_term_label("CL:0000004")
    assert label == "cell B2"
6 changes: 4 additions & 2 deletions tools/ontology-builder/src/env.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import os

# Directory containing this module, with symlinks resolved.
PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__))
# Three directory levels above PACKAGE_ROOT; every repository-wide path below is derived from it.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(PACKAGE_ROOT)))
# Artifacts directory of the Python API package. Joined component-wise so no path separator is hard-coded.
FIXTURES_ROOT = os.path.join(ROOT_DIR, "api", "python", "src", "cellxgene_ontology_guide", "artifacts")
ONTOLOGY_REF_DIR = os.path.join(PACKAGE_ROOT, "ontology-references")
RAW_ONTOLOGY_DIR = os.path.join(ONTOLOGY_REF_DIR, "raw-files")
# Both artifacts live under FIXTURES_ROOT. Each name is assigned exactly once; the earlier assignments that
# pointed at ONTOLOGY_REF_DIR were immediately overwritten and have been dropped.
ONTO_INFO_YAML = os.path.join(FIXTURES_ROOT, "ontology_info.yml")
PARSED_ONTOLOGIES_FILE = os.path.join(FIXTURES_ROOT, "all_ontology.json.gz")
# Derived from ROOT_DIR rather than by splitting the file path on "/", which depends on the separator and can
# produce a relative path when the file sits fewer than four levels deep.
SCHEMA_DIR = os.path.join(ROOT_DIR, "artifact-schemas")

0 comments on commit 239ef2b

Please sign in to comment.