-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add API ontology querying module (#39)
## Reason for Change
- #14

## Changes
- Implement all_ontology querying functions to fetch metadata for single term IDs and for bulk IDs.
- Reorganize the file structure so that artifacts packaged with the API are stored in the api/python directory rather than in the ontology-builder directory (which will not be packaged).
- In turn, update the env.py constants and the GitHub Actions (GHAs) that point to the directories from which files are read and to which generated files are written.

## Testing steps
- Unit tests

## Notes for Reviewer
- Loading branch information
Showing
11 changed files
with
336 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
19 changes: 19 additions & 0 deletions
19
api/python/src/cellxgene_ontology_guide/artifact_download.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import os | ||
|
||
from constants import ARTIFACT_DIR, CURRENT_SCHEMA_VERSION | ||
|
||
|
||
def load_artifact_by_schema(schema_version: str, filename: str) -> str:
    """
    Resolve the path of a packaged ontology artifact for the requested schema version.

    Only the schema version bundled with this package (CURRENT_SCHEMA_VERSION) is handled; for it, the
    artifact is expected to live in ARTIFACT_DIR and no existence check is performed on the resulting path.
    :param schema_version: str version of the schema to load ontology files for
    :param filename: str name of the file to load
    :return: str path to the ontology file inside ARTIFACT_DIR
    :raises ValueError: if schema_version is not the schema version supported by this package version
    """
    if schema_version != CURRENT_SCHEMA_VERSION:
        # TODO: Add support for loading ontology files from different schema versions
        raise ValueError(f"Schema version {schema_version} is not supported in this package version.")

    return os.path.join(ARTIFACT_DIR, filename)
Binary file renamed
BIN
+14.1 MB
.../ontology-references/all_ontology.json.gz → ...logy_guide/artifacts/all_ontology.json.gz
Binary file not shown.
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
import os | ||
|
||
# Directory containing this module; symlinks are resolved before taking the parent.
PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__))

# Directory (inside the package) that holds the artifact files.
ARTIFACT_DIR = os.path.join(PACKAGE_ROOT, "artifacts")

# File names of the individual artifacts.
ALL_ONTOLOGY_FILENAME = "all_ontology.json.gz"
ONTOLOGY_INFO_FILENAME = "ontology_info.yml"

# Schema version this package treats as current.
CURRENT_SCHEMA_VERSION = "5.0.0"
172 changes: 172 additions & 0 deletions
172
api/python/src/cellxgene_ontology_guide/ontology_parser.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,172 @@ | ||
import gzip | ||
import json | ||
import re | ||
from typing import Any, Dict, List, Union | ||
|
||
import yaml | ||
from artifact_download import load_artifact_by_schema | ||
from constants import ALL_ONTOLOGY_FILENAME, CURRENT_SCHEMA_VERSION, ONTOLOGY_INFO_FILENAME | ||
|
||
|
||
class OntologyParser:
    """
    An object to parse ontology term metadata from ontologies corresponding to a given CellxGene Schema Version.

    Instances expose two attributes populated by __init__:
    ontology_dict maps ontology name -> term ID -> term metadata dict, and supported_ontologies is the parsed
    content of the ontology info YAML file (its keys are used as the set of supported ontology names).
    """

    def __init__(self, schema_version: str = CURRENT_SCHEMA_VERSION):
        """
        Initialize an OntologyParser object with the ontology metadata corresponding to the given CellxGene schema
        version. By default, loads the ontology metadata for the latest compatible schema version from disk.

        Artifact paths are resolved via load_artifact_by_schema; any exception it raises (e.g. for a schema version
        it does not support) propagates to the caller.
        :param schema_version: str version of the schema to load ontology metadata for
        """
        all_ontology_filepath = load_artifact_by_schema(schema_version, ALL_ONTOLOGY_FILENAME)
        ontology_info_filepath = load_artifact_by_schema(schema_version, ONTOLOGY_INFO_FILENAME)

        # Decode explicitly as UTF-8 so parsing does not depend on the platform's default locale encoding.
        with gzip.open(all_ontology_filepath, "rt", encoding="utf-8") as f:
            self.ontology_dict = json.load(f)

        with open(ontology_info_filepath, "rt", encoding="utf-8") as f:
            self.supported_ontologies = yaml.safe_load(f)

    def _parse_ontology_name(self, term_id: str) -> str:
        """
        Parse the ontology name from a given term ID. If the term ID does not conform to the expected term format or
        is not from an ontology supported by cellxgene-ontology-guide, raise a ValueError.
        :param term_id: str ontology term to parse
        :return: str name of ontology that term belongs to
        :raises ValueError: if term_id is malformed or its ontology prefix is not in supported_ontologies
        """
        pattern = r"[A-Za-z]+:\d+"
        # fullmatch (not match) so that trailing characters after the numeric part are rejected as well.
        if not re.fullmatch(pattern, term_id):
            raise ValueError(f"{term_id} does not conform to expected regex pattern {pattern} and cannot be queried.")

        ontology_name = term_id.split(":")[0]
        if ontology_name not in self.supported_ontologies:
            raise ValueError(f"{term_id} is not part of a supported ontology, its metadata cannot be fetched.")

        return ontology_name

    def get_term_ancestors(self, term_id: str, include_self: bool = False) -> List[str]:
        """
        Get the ancestor ontology terms for a given term. If include_self is True, the term itself is appended after
        its ancestors.
        Example: get_term_ancestors("CL:0000005") -> ["CL:0000000", ...]
        :param term_id: str ontology term to find ancestors for
        :param include_self: boolean flag to include the term itself as an ancestor
        :return: flattened List[str] of ancestor terms (a new list; safe for the caller to mutate)
        :raises ValueError: if term_id is malformed or from an unsupported ontology
        :raises KeyError: if term_id is not present in the loaded ontology data
        """
        ontology_name = self._parse_ontology_name(term_id)
        # Copy so that callers mutating the result cannot corrupt the parser's loaded ontology data.
        ancestors: List[str] = list(self.ontology_dict[ontology_name][term_id]["ancestors"])
        if include_self:
            ancestors.append(term_id)
        return ancestors

    def get_term_list_ancestors(self, term_ids: List[str], include_self: bool = False) -> Dict[str, List[str]]:
        """
        Get the ancestor ontology terms for each term in a list. If include_self is True, each term itself is
        appended after its ancestors.
        Example: get_term_list_ancestors(["CL:0000003", "CL:0000005"], include_self=True) -> {
            "CL:0000003": ["CL:0000003"],
            "CL:0000005": ["CL:0000000", ..., "CL:0000005"]
        }
        :param term_ids: list of str ontology terms to find ancestors for
        :param include_self: boolean flag to include the term itself as an ancestor
        :return: Dictionary mapping str term IDs to their respective flattened List[str] of ancestor terms. Maps to
        empty list if there are no ancestors.
        :raises ValueError: if any term ID is malformed or from an unsupported ontology
        :raises KeyError: if any term ID is not present in the loaded ontology data
        """
        return {term_id: self.get_term_ancestors(term_id, include_self) for term_id in term_ids}

    def get_terms_descendants(self, term_ids: List[str], include_self: bool = False) -> Dict[str, List[str]]:
        """
        Get the descendant ontology terms for each term in a list. If include_self is True, the term itself will be
        included as the first descendant.
        Example: get_terms_descendants(["CL:0000003", "CL:0000005"], include_self=True) -> {
            "CL:0000003": ["CL:0000003", "CL:0000004", ...],
            "CL:0000005": ["CL:0000005", "CL:0002363", ...]
        }
        :param term_ids: list of str ontology terms to find descendants for
        :param include_self: boolean flag to include the term itself as a descendant
        :return: Dictionary mapping str term IDs to their respective flattened List[str] of descendant terms. Maps to
        empty list if there are no descendants.
        :raises ValueError: if any term ID is malformed or from an unsupported ontology
        """
        descendants_dict: Dict[str, List[str]] = {}
        # A dict is used as an ordered set so ontologies are scanned in first-seen order (deterministic output).
        ontology_names: Dict[str, None] = {}
        for term_id in term_ids:
            ontology_name = self._parse_ontology_name(term_id)
            descendants_dict[term_id] = [term_id] if include_self else []
            ontology_names[ontology_name] = None

        requested_terms = descendants_dict.keys()
        for ontology in ontology_names:
            for candidate_descendant, candidate_metadata in self.ontology_dict[ontology].items():
                # Intersect the requested terms with this candidate's ancestors instead of testing every requested
                # term against the ancestor list one by one.
                for ancestor_id in requested_terms & candidate_metadata["ancestors"]:
                    descendants_dict[ancestor_id].append(candidate_descendant)

        return descendants_dict

    def is_term_deprecated(self, term_id: str) -> bool:
        """
        Check if an ontology term is deprecated. A term without a "deprecated" entry is reported as not deprecated.
        Example: is_term_deprecated("CL:0000003") -> True
        :param term_id: str ontology term to check for deprecation
        :return: boolean flag indicating whether the term is deprecated
        :raises ValueError: if term_id is malformed or from an unsupported ontology
        :raises KeyError: if term_id is not present in the loaded ontology data
        """
        ontology_name = self._parse_ontology_name(term_id)
        # Coerce to bool so a missing key yields False rather than None.
        return bool(self.ontology_dict[ontology_name][term_id].get("deprecated", False))

    def get_term_replacement(self, term_id: str) -> Union[str, None]:
        """
        Fetch the replacement term for a deprecated ontology term, if a replacement exists. Return None otherwise.
        Example: get_term_replacement("CL:0000003") -> "CL:0000000"
        :param term_id: str ontology term to check a replacement term for
        :return: replacement str term ID if it exists, None otherwise
        :raises ValueError: if term_id is malformed or from an unsupported ontology
        :raises KeyError: if term_id is not present in the loaded ontology data
        """
        ontology_name = self._parse_ontology_name(term_id)
        replaced_by: Union[str, None] = self.ontology_dict[ontology_name][term_id].get("replaced_by")
        # Falsy values (missing key, empty string) are normalized to None.
        return replaced_by or None

    def get_term_metadata(self, term_id: str) -> Dict[str, Any]:
        """
        Fetch metadata for a given ontology term. Returns a dict with format
        {"comments": ["...", ...], "term_tracker": "...", "consider": ["...", ...]}
        "comments" maps to List[str] of ontology curator comments
        "term_tracker" maps to a str url where there is discussion around this term's curation (or deprecation).
        "consider" maps to List[str] of alternate ontology terms to consider using instead of this term
        All keys map to None if no metadata of that type is present.
        :param term_id: str ontology term to fetch metadata for
        :return: Dict with keys 'comments', 'term_tracker', and 'consider' containing associated metadata.
        :raises ValueError: if term_id is malformed or from an unsupported ontology
        :raises KeyError: if term_id is not present in the loaded ontology data
        """
        ontology_name = self._parse_ontology_name(term_id)
        term_entry = self.ontology_dict[ontology_name][term_id]
        # A tuple (rather than a set) keeps the key order of the returned dict stable across runs.
        return {key: term_entry.get(key, None) for key in ("comments", "term_tracker", "consider")}

    def get_term_label(self, term_id: str) -> str:
        """
        Fetch the human-readable label for a given ontology term.
        Example: get_term_label("CL:0000005") -> "fibroblast neural crest derived"
        :param term_id: str ontology term to fetch label for
        :return: str human-readable label for the term
        :raises ValueError: if term_id is malformed or from an unsupported ontology
        :raises KeyError: if term_id is not present in the loaded ontology data
        """
        ontology_name = self._parse_ontology_name(term_id)
        label: str = self.ontology_dict[ontology_name][term_id]["label"]
        return label
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
import os | ||
|
||
import pytest | ||
from cellxgene_ontology_guide.artifact_download import load_artifact_by_schema | ||
from cellxgene_ontology_guide.constants import ARTIFACT_DIR, CURRENT_SCHEMA_VERSION | ||
|
||
|
||
def test_load_artifact_by_schema():
    """For the current schema version, each artifact name resolves to a path inside ARTIFACT_DIR."""
    for artifact_name in ("ontology_info.yml", "all_ontology.json.gz"):
        expected_path = os.path.join(ARTIFACT_DIR, artifact_name)
        assert load_artifact_by_schema(CURRENT_SCHEMA_VERSION, artifact_name) == expected_path
|
||
|
||
def test_load_artifact_by_schema_raises_value_error():
    """A schema version other than the current one is rejected with ValueError."""
    unsupported_version = "0.0.0"
    with pytest.raises(ValueError):
        load_artifact_by_schema(unsupported_version, "ontology_info.yml")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
import pytest | ||
from cellxgene_ontology_guide.ontology_parser import OntologyParser | ||
|
||
|
||
@pytest.fixture(scope="module")
def ontology_dict():
    """
    Minimal in-memory ontology data: one "CL" ontology with five terms.

    CL:0000000 is the root; CL:0000001-CL:0000003 are its direct children (CL:0000003 is deprecated and replaced by
    CL:0000004); CL:0000004 descends from CL:0000001.
    """
    root_term = {"ancestors": [], "label": "cell A", "deprecated": False}
    term_with_consider = {
        "ancestors": ["CL:0000000"],
        "label": "cell B",
        "deprecated": False,
        "consider": ["CL:0000004"],
    }
    plain_child_term = {"ancestors": ["CL:0000000"], "label": "cell C", "deprecated": False}
    deprecated_term = {
        "ancestors": ["CL:0000000"],
        "label": "obsolete cell",
        "deprecated": True,
        "replaced_by": "CL:0000004",
        "comments": ["this term was deprecated in favor of a descendant term of CL:0000001"],
        "term_tracker": "http://example.com/issue/1234",
    }
    grandchild_term = {"ancestors": ["CL:0000001", "CL:0000000"], "label": "cell B2", "deprecated": False}

    # Insertion order matters: descendant lists are asserted in this order by the tests.
    cl_terms = {
        "CL:0000000": root_term,
        "CL:0000001": term_with_consider,
        "CL:0000002": plain_child_term,
        "CL:0000003": deprecated_term,
        "CL:0000004": grandchild_term,
    }
    return {"CL": cl_terms}
|
||
|
||
@pytest.fixture(scope="module")
def supported_ontologies():
    """Ontology info stub declaring "CL" as the only supported ontology."""
    cl_info = {"version": "2024-01-01", "source": "http://example.com", "filetype": "owl"}
    return {"CL": cl_info}
|
||
|
||
@pytest.fixture(scope="module")
def ontology_parser(ontology_dict, supported_ontologies):
    """OntologyParser built with its default arguments, then pointed at the small in-memory fixtures."""
    stubbed_parser = OntologyParser()
    # Replace whatever __init__ loaded with the fixture data used by the assertions in this module.
    stubbed_parser.ontology_dict, stubbed_parser.supported_ontologies = ontology_dict, supported_ontologies
    return stubbed_parser
|
||
|
||
def test_parse_ontology_name(ontology_parser):
    """A well-formed ID from a supported ontology yields its prefix."""
    parsed_name = ontology_parser._parse_ontology_name("CL:0000001")
    assert parsed_name == "CL"
|
||
|
||
def test_parse_ontology_name__wrong_format(ontology_parser):
    """An ID using "_" instead of ":" as separator is rejected."""
    malformed_id = "CL_0000001"
    with pytest.raises(ValueError):
        ontology_parser._parse_ontology_name(malformed_id)
|
||
|
||
def test_parse_ontology_name__not_supported(ontology_parser):
    """A well-formed ID whose ontology is absent from supported_ontologies is rejected."""
    unsupported_id = "GO:0000001"
    with pytest.raises(ValueError):
        ontology_parser._parse_ontology_name(unsupported_id)
|
||
|
||
def test_get_term_ancestors(ontology_parser):
    """Ancestors come back in stored order; include_self appends the term after them."""
    expected_ancestors = ["CL:0000001", "CL:0000000"]

    without_self = ontology_parser.get_term_ancestors("CL:0000004")
    assert without_self == expected_ancestors

    with_self = ontology_parser.get_term_ancestors("CL:0000004", include_self=True)
    assert with_self == ["CL:0000001", "CL:0000000", "CL:0000004"]
|
||
|
||
def test_get_term_list_ancestors(ontology_parser):
    """Bulk ancestor lookup maps every requested term to its ancestors, optionally followed by the term itself."""
    queried_terms = ["CL:0000000", "CL:0000004"]

    expected_without_self = {
        "CL:0000000": [],
        "CL:0000004": ["CL:0000001", "CL:0000000"],
    }
    assert ontology_parser.get_term_list_ancestors(queried_terms) == expected_without_self

    expected_with_self = {
        "CL:0000000": ["CL:0000000"],
        "CL:0000004": ["CL:0000001", "CL:0000000", "CL:0000004"],
    }
    assert ontology_parser.get_term_list_ancestors(queried_terms, include_self=True) == expected_with_self
|
||
|
||
def test_get_terms_descendants(ontology_parser):
    """Bulk descendant lookup lists descendants in ontology order, optionally preceded by the term itself."""
    queried_terms = ["CL:0000000", "CL:0000004"]

    expected_without_self = {
        "CL:0000000": ["CL:0000001", "CL:0000002", "CL:0000003", "CL:0000004"],
        "CL:0000004": [],
    }
    assert ontology_parser.get_terms_descendants(queried_terms) == expected_without_self

    expected_with_self = {
        "CL:0000000": ["CL:0000000", "CL:0000001", "CL:0000002", "CL:0000003", "CL:0000004"],
        "CL:0000004": ["CL:0000004"],
    }
    assert ontology_parser.get_terms_descendants(queried_terms, include_self=True) == expected_with_self
|
||
|
||
def test_is_term_deprecated(ontology_parser):
    """The deprecated fixture term is flagged; an active term is not."""
    deprecated_flag = ontology_parser.is_term_deprecated("CL:0000003")
    assert deprecated_flag

    active_flag = ontology_parser.is_term_deprecated("CL:0000004")
    assert not active_flag
|
||
|
||
def test_get_term_replacement(ontology_parser):
    """A term with "replaced_by" returns that ID; a term without it returns None."""
    replacement = ontology_parser.get_term_replacement("CL:0000003")
    assert replacement == "CL:0000004"

    no_replacement = ontology_parser.get_term_replacement("CL:0000004")
    assert no_replacement is None
|
||
|
||
def test_get_term_metadata(ontology_parser):
    """All three metadata keys are always present; absent metadata maps to None."""
    expected_for_deprecated_term = {
        "comments": ["this term was deprecated in favor of a descendant term of CL:0000001"],
        "term_tracker": "http://example.com/issue/1234",
        "consider": None,
    }
    assert ontology_parser.get_term_metadata("CL:0000003") == expected_for_deprecated_term

    expected_for_consider_term = {
        "comments": None,
        "term_tracker": None,
        "consider": ["CL:0000004"],
    }
    assert ontology_parser.get_term_metadata("CL:0000001") == expected_for_consider_term
|
||
|
||
def test_get_term_label(ontology_parser):
    """The label stored for a term is returned verbatim."""
    label = ontology_parser.get_term_label("CL:0000004")
    assert label == "cell B2"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,10 @@ | ||
import os | ||
|
||
# Directory containing this module (symlinks resolved).
PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__))
# Three directory levels above PACKAGE_ROOT.
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(PACKAGE_ROOT)))
# Artifact directory of the Python API package, addressed relative to ROOT_DIR.
FIXTURES_ROOT = os.path.join(ROOT_DIR, "api/python/src/cellxgene_ontology_guide/artifacts")

# Raw ontology files stay next to this module, under ontology-references/raw-files.
ONTOLOGY_REF_DIR = os.path.join(PACKAGE_ROOT, "ontology-references")
RAW_ONTOLOGY_DIR = os.path.join(ONTOLOGY_REF_DIR, "raw-files")

# Ontology info and parsed-ontology files live in FIXTURES_ROOT. Each name is assigned exactly once:
# the earlier assignments that pointed into ONTOLOGY_REF_DIR were immediately overwritten and are dropped.
ONTO_INFO_YAML = os.path.join(FIXTURES_ROOT, "ontology_info.yml")
PARSED_ONTOLOGIES_FILE = os.path.join(FIXTURES_ROOT, "all_ontology.json.gz")

# "artifact-schemas" directory, located by stripping the last four "/"-separated components of this file's path.
SCHEMA_DIR = os.path.join(os.path.realpath(__file__).rsplit("/", maxsplit=4)[0], "artifact-schemas")