From dbcdd297e2f06ca0f833cb293a7bc584d06ae738 Mon Sep 17 00:00:00 2001 From: Trent Smith <1429913+Bento007@users.noreply.github.com> Date: Mon, 21 Oct 2024 11:28:36 -0700 Subject: [PATCH] feat: prototype to support multiple prefixes (#225) --- .github/workflows/push-tests.yml | 2 + asset-schemas/ontology_info_schema.json | 7 + ontology-assets/ontology_info.json | 2 +- .../src/all_ontology_generator.py | 19 +-- .../tests/test_all_ontology_generator.py | 129 ++++++++++++++++++ 5 files changed, 149 insertions(+), 10 deletions(-) diff --git a/.github/workflows/push-tests.yml b/.github/workflows/push-tests.yml index 0cfda347..6cfd3f5e 100644 --- a/.github/workflows/push-tests.yml +++ b/.github/workflows/push-tests.yml @@ -59,6 +59,7 @@ jobs: name: coverage-builder path: /home/runner/work/cellxgene-ontology-guide/cellxgene-ontology-guide/.coverage* retention-days: 3 + include-hidden-files: true unit-test-python-api: runs-on: ubuntu-latest @@ -92,6 +93,7 @@ jobs: name: coverage-api path: /home/runner/work/cellxgene-ontology-guide/cellxgene-ontology-guide/.coverage* retention-days: 3 + include-hidden-files: true submit-codecoverage: needs: diff --git a/asset-schemas/ontology_info_schema.json b/asset-schemas/ontology_info_schema.json index cded49f3..10cf7818 100644 --- a/asset-schemas/ontology_info_schema.json +++ b/asset-schemas/ontology_info_schema.json @@ -67,6 +67,13 @@ "filename": { "type": "string", "description": "name of ontology file used to build generated artifacts for this ontology data release" + }, + "additional_ontologies": { + "type": "array", + "items": { + "type": "string" + }, + "description": "List of additional term id prefixes to be extracted from the source ontology file." 
} }, "required": [ diff --git a/ontology-assets/ontology_info.json b/ontology-assets/ontology_info.json index 1031c2f2..217658b4 100644 --- a/ontology-assets/ontology_info.json +++ b/ontology-assets/ontology_info.json @@ -148,4 +148,4 @@ }, "deprecated_on": "2024-05-10" } -} \ No newline at end of file +} diff --git a/tools/ontology-builder/src/all_ontology_generator.py b/tools/ontology-builder/src/all_ontology_generator.py index 2882faa3..064c65d6 100755 --- a/tools/ontology-builder/src/all_ontology_generator.py +++ b/tools/ontology-builder/src/all_ontology_generator.py @@ -116,14 +116,14 @@ def _load_ontology_object(onto_file: str) -> owlready2.entity.ThingClass: return onto -def _get_ancestors(onto_class: owlready2.entity.ThingClass, onto_name: str) -> Dict[str, int]: +def _get_ancestors(onto_class: owlready2.entity.ThingClass, allowed_ontologies: list[str]) -> Dict[str, int]: """ Returns a list of unique ancestor ontology term ids of the given onto class. Only returns those belonging to ontology_name, it will format the id from the form CL_xxxx to CL:xxxx. Ancestors are returned in ascending order of distance from the given term. 
:param owlready2.entity.ThingClass onto_class: the class for which ancestors will be retrieved - :param str onto_name: only ancestors from this ontology will be kept + :param list[str] allowed_ontologies: only ancestors from these ontologies will be kept :rtype List[str] :return list of ancestors (term ids), it could be empty @@ -144,7 +144,7 @@ def _get_ancestors(onto_class: owlready2.entity.ThingClass, onto_name: str) -> D ancestors[branch_ancestor_name] = min(ancestors[branch_ancestor_name], distance) else: queue.append((parent.value, distance + 1)) - if branch_ancestor_name.split(":")[0] == onto_name: + if branch_ancestor_name.split(":")[0] in allowed_ontologies: ancestors[branch_ancestor_name] = distance elif hasattr(parent, "name") and not hasattr(parent, "Classes"): parent_name = parent.name.replace("_", ":") @@ -158,15 +158,16 @@ def _get_ancestors(onto_class: owlready2.entity.ThingClass, onto_name: str) -> D return { ancestor: distance for ancestor, distance in sorted(ancestors.items(), key=lambda item: item[1]) - if ancestor.split(":")[0] == onto_name + if ancestor.split(":")[0] in allowed_ontologies } -def _extract_ontology_term_metadata(onto: owlready2.entity.ThingClass) -> Dict[str, Any]: +def _extract_ontology_term_metadata(onto: owlready2.entity.ThingClass, allowed_ontologies: list[str]) -> Dict[str, Any]: """ Extract relevant metadata from ontology object and save into a dictionary following our JSON Schema :param: onto: Ontology Object to Process + :param: allowed_ontologies: List of term prefixes to filter out terms that are not direct children from this ontology :return: Dict[str, Any] map of ontology term IDs to pertinent metadata from ontology files """ term_dict: Dict[str, Any] = dict() @@ -174,10 +175,10 @@ def _extract_ontology_term_metadata(onto: owlready2.entity.ThingClass) -> Dict[s term_id = onto_term.name.replace("_", ":") # Skip terms that are not direct children from this ontology - if onto.name != term_id.split(":")[0]: + if 
term_id.split(":")[0] not in allowed_ontologies: continue # Gets ancestors - ancestors = _get_ancestors(onto_term, onto.name) + ancestors = _get_ancestors(onto_term, allowed_ontologies) # Special Case: skip the current term if it is an NCBI Term, but not a descendant of 'NCBITaxon:33208'. if onto.name == "NCBITaxon" and "NCBITaxon:33208" not in ancestors: @@ -266,8 +267,8 @@ def _parse_ontologies( version = ontology_info[onto.name]["version"] output_file = os.path.join(output_path, get_ontology_file_name(onto.name, version)) logging.info(f"Processing {output_file}") - - onto_dict = _extract_ontology_term_metadata(onto) + allowed_ontologies = [onto.name] + ontology_info[onto.name].get("additional_ontologies", []) + onto_dict = _extract_ontology_term_metadata(onto, allowed_ontologies) with gzip.GzipFile(output_file, mode="wb", mtime=0) as fp: fp.write(json.dumps(onto_dict, indent=2).encode("utf-8")) diff --git a/tools/ontology-builder/tests/test_all_ontology_generator.py b/tools/ontology-builder/tests/test_all_ontology_generator.py index f1b547b1..090cb4d9 100644 --- a/tools/ontology-builder/tests/test_all_ontology_generator.py +++ b/tools/ontology-builder/tests/test_all_ontology_generator.py @@ -7,6 +7,7 @@ import pytest from all_ontology_generator import ( _download_ontologies, + _extract_ontology_term_metadata, _parse_ontologies, deprecate_previous_cellxgene_schema_versions, get_ontology_info_file, @@ -45,6 +46,16 @@ def mock_raw_ontology_dir(tmpdir): return str(sub_dir) +@pytest.fixture +def mock_owl(tmpdir): + import owlready2 + + onto = owlready2.get_ontology("http://example.com/ontology_name.owl") + onto.name = "FAKE" + + return onto + + def test_get_ontology_info_file_default(mock_ontology_info_file): # Call the function ontology_info = get_ontology_info_file(ontology_info_file=mock_ontology_info_file) @@ -224,3 +235,121 @@ def test_deprecate_previous_cellxgene_schema_versions(mock_datetime): deprecate_previous_cellxgene_schema_versions(ontology_info, "v1") 
assert ontology_info == expected_ontology_info + + +@pytest.fixture +def sample_ontology(tmp_path): + # Create a new ontology + import owlready2 + + onto = owlready2.get_ontology("http://test.org/onto.owl") + onto.name = "FOO" + + with onto: + + class FOO_000001(owlready2.Thing): + label = ["Test Root Term"] + + class FOO_000002(FOO_000001): + label = ["Test Deprecated Descendant Term"] + IAO_0000115 = ["Test description"] + hasExactSynonym = ["Test synonym"] + deprecated = [True] + comment = ["Deprecated term", "See Links for more details"] + IAO_0000233 = ["http://example.org/term_tracker"] + IAO_0100001 = ["http://ontology.org/FOO_000003"] + + class FOO_000003(FOO_000001): + label = ["Test Non-Deprecated Descendant Term"] + + class OOF_000001(owlready2.Thing): + label = ["Test Unrelated Different Ontology Term"] + + class OOF_000002(FOO_000001): + label = ["Test Descendant Different Ontology Term"] + + class FOO_000004(OOF_000002, FOO_000003): + label = ["Test Ontology Term With Different Ontology Ancestors"] + + onto.save(file=str(tmp_path.joinpath("test_ontology.owl"))) + return onto + + +def test_extract_ontology_term_metadata(sample_ontology): + allowed_ontologies = ["FOO"] + result = _extract_ontology_term_metadata(sample_ontology, allowed_ontologies) + + expected_result = { + "FOO:000001": { + "ancestors": {}, + "label": "Test Root Term", + "deprecated": False, + }, + "FOO:000002": { + "ancestors": {"FOO:000001": 1}, + "label": "Test Deprecated Descendant Term", + "description": "Test description", + "synonyms": ["Test synonym"], + "deprecated": True, + "comments": ["Deprecated term", "See Links for more details"], + "term_tracker": "http://example.org/term_tracker", + "replaced_by": "FOO:000003", + }, + "FOO:000003": { + "ancestors": {"FOO:000001": 1}, + "label": "Test Non-Deprecated Descendant Term", + "deprecated": False, + }, + "FOO:000004": { + "ancestors": {"FOO:000001": 2, "FOO:000003": 1}, + "label": "Test Ontology Term With Different Ontology 
Ancestors", + "deprecated": False, + }, + } + + assert result == expected_result + + +def test_extract_ontology_term_metadata_multiple_allowed_ontologies(sample_ontology): + allowed_ontologies = ["FOO", "OOF"] + result = _extract_ontology_term_metadata(sample_ontology, allowed_ontologies) + + expected_result = { + "FOO:000001": { + "ancestors": {}, + "label": "Test Root Term", + "deprecated": False, + }, + "FOO:000002": { + "ancestors": {"FOO:000001": 1}, + "label": "Test Deprecated Descendant Term", + "description": "Test description", + "synonyms": ["Test synonym"], + "deprecated": True, + "comments": ["Deprecated term", "See Links for more details"], + "term_tracker": "http://example.org/term_tracker", + "replaced_by": "FOO:000003", + }, + "FOO:000003": { + "ancestors": {"FOO:000001": 1}, + "label": "Test Non-Deprecated Descendant Term", + "deprecated": False, + }, + "FOO:000004": { + "ancestors": {"FOO:000001": 2, "FOO:000003": 1, "OOF:000002": 1}, + "label": "Test Ontology Term With Different Ontology Ancestors", + "deprecated": False, + }, + "OOF:000001": { + "ancestors": {}, + "label": "Test Unrelated Different Ontology Term", + "deprecated": False, + }, + "OOF:000002": { + "ancestors": {"FOO:000001": 1}, + "label": "Test Descendant Different Ontology Term", + "deprecated": False, + }, + } + + assert result == expected_result