-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Updates to existing tables: Added header, including missing columns, but did not yet populate data in those columns. - Added new tables for the rest of the ontologies. - Added Python script to work w/ these tables: term_range_expansion.py (WIP) - Added SPARQL scripts to be called by Python script. - Added make goal to call Python script.
- Loading branch information
Showing
11 changed files
with
1,462 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
term_id term_label exclusion_reason exclude_children |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
term_id term_label exclusion_reason exclude_children |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
term_id term_label exclusion_reason exclude_children |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
term_id term_label exclusion_reason exclude_children |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
"""Takes in the full ontology and the exclusions tsv and extracts a simple list of terms from it. | ||
Outputs two files: | ||
1. a simple list of terms (config/ONTO_NAME_term_exclusions.txt) | ||
2. a simple two column file with the term_id and the exclusion reason | ||
# Resources | ||
- GitHub issue: https://github.com/monarch-initiative/mondo-ingest/issues/22 | ||
TODO's | ||
- repurpose this: (i) to be generalized, (ii) to do the job of consuming and producing exclusion table as described in issue | ||
- possible bug: ICD10CM:C7A-C7A reported as term, but should be ICD10CM:C7A | ||
""" | ||
import os | ||
import subprocess | ||
from argparse import ArgumentParser | ||
from copy import copy | ||
from typing import Any, Dict, List | ||
|
||
from jinja2 import Template | ||
import pandas as pd | ||
|
||
|
||
# Vars
# # Config
# When True, re-use previously saved ROBOT query results instead of re-querying.
USE_CACHE = False


# # Static
# Project root: parent of the directory containing this script.
PROJECT_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
SPARQL_DIR = os.path.join(PROJECT_DIR, 'sparql')
# Cached ROBOT queries/results are written beneath this directory.
CACHE_DIR = os.path.join(PROJECT_DIR, 'cache')
# NOTE(review): naming looks inverted — the "children" template is bound to the
# *inclusion* path and the plain "get-terms" template to the *exclusion* path.
# Verify against intended semantics in run().
sparql_child_inclusion_path = os.path.join(PROJECT_DIR, 'sparql', 'get-terms-children.sparql.jinja2')
sparql_child_exclusion_path = os.path.join(PROJECT_DIR, 'sparql', 'get-terms.sparql.jinja2')
input_exclusion_term_ranges_csv_path = os.path.join(
    PROJECT_DIR, 'preparation', 'icd10cm_excluded_term_ranges_intensional.csv')
# Default ontology queried by sparql_jinja2_file_query__via_robot().
input_icd10cm_ttl_path = os.path.join(PROJECT_DIR, 'preparation', 'icd10cm.ttl')
# Upper bound for numeric term-range expansion (e.g. ICD10CM code ranges).
TERM_RANGE_MAX_VAL = 99
|
||
|
||
# Functions | ||
# todo: pd.errors.EmptyDataError: I think this happens because Python process needs to stop before results saved? | ||
def sparql_jinja2_file_query__via_robot(
    terms: List[str], query_template_path: str,
    path_in_data_dir=os.path.basename(input_icd10cm_ttl_path), onto_path=input_icd10cm_ttl_path,
    use_cache=USE_CACHE, include_superclass=False
) -> pd.DataFrame:
    """Query ontology using SPARQL query file.

    Renders the Jinja2 template at `query_template_path` with `terms`, saves the
    instantiated query and the shell command under `CACHE_DIR`, runs the query via the
    ROBOT CLI, and returns the results as a DataFrame.

    :param terms: Term IDs substituted into the template's `{{values}}` VALUES clause.
    :param query_template_path: Path to a `.sparql.jinja2` template file.
    :param path_in_data_dir: Name combined with the template filename to build the
        cache directory name.
    :param onto_path: Ontology file passed to `robot query --input`.
    :param use_cache: If True and a cached results file exists, skip re-running ROBOT.
    :param include_superclass: If True the template's property path qualifier is `*`
        (matches the term itself as well); otherwise `+` (strict descendants only).
    :return: Query results with NaN replaced by ''.
    :raises subprocess.CalledProcessError: If the `robot` command exits non-zero.
    :raises pd.errors.EmptyDataError: If ROBOT produced an empty results file; the bad
        file is deleted first so a rerun will not read it from cache.
    """
    # Basic vars
    query_template_filename = os.path.basename(query_template_path)
    results_dirname = path_in_data_dir.replace('/', '-').replace('.', '-') + \
        '__' + query_template_filename.replace('.', '-')
    results_dirpath = os.path.join(CACHE_DIR, 'robot', results_dirname)
    results_path = os.path.join(results_dirpath, 'results.csv')
    command_save_path = os.path.join(results_dirpath, 'command.sh')
    instantiated_query_path = os.path.join(results_dirpath, 'query.sparql')
    command_str = f'robot query --input {onto_path} --query {instantiated_query_path} {results_path}'

    # Instantiate template
    with open(query_template_path, 'r') as f:
        template_obj = Template(f.read())
    # https://www.w3.org/TR/sparql11-query/#propertypaths
    instantiated_str = template_obj.render(
        property_path_qualifier='*' if include_superclass else '+',
        values=' '.join(terms))

    # Cache and run
    os.makedirs(results_dirpath, exist_ok=True)
    if not (os.path.exists(results_path) and use_cache):
        with open(instantiated_query_path, 'w') as f:
            f.write(instantiated_str)
        with open(command_save_path, 'w') as f:
            f.write(command_str)
        # check=True: fail loudly if ROBOT errors, instead of failing later with a
        # confusing missing/empty results file (see EmptyDataError TODO above).
        subprocess.run(command_str.split(), check=True)

    # Read results and return
    try:
        df = pd.read_csv(results_path).fillna('')
    except pd.errors.EmptyDataError:
        os.remove(results_path)  # done so that it doesn't read this from cache
        raise

    return df
|
||
|
||
def run(onto_path: str, exclusions_path: str) -> pd.DataFrame:
    """Expand the exclusions table by querying the ontology for child terms.

    :param onto_path: Path to the ontology file to query.
        NOTE(review): currently unused — queries fall back to the module-level default
        ontology path; TODO wire this through to the query function.
    :param exclusions_path: Path to a TSV with fields: `term_id` (str), `term_label` (str),
        `exclusion_reason` (str), and `exclude_children` (bool).
    :return: The joined/restructured DataFrame (also written to a CSV; path is a todo).
    """
    # TODO
    # 1. query the exact terms listed in the exclusion table
    # 2. regex
    # Lastly the table should support the following trick: If a value in term_label starts with REGEX:, the script
    # should search for all labels in the ontology and exclude them. For example:
    # example table:
    # term_id term_label exclusion_reason exclude_children
    # | REGEX:^.*phenotype$ | phenotype | yes
    # Should match all labels then end in the word phenotype and then exclude all their children.

    # TODO: Need output TSV:
    # - Fields: `Term ID`, `Exclusion Reason`
    # - First row only: `ID`, `AI rdfs:seeAlso`
    # Prepare
    # Was an empty-DataFrame placeholder, which raised KeyError on the column accesses
    # below; load the exclusions TSV described in the CLI help instead.
    df = pd.read_csv(exclusions_path, sep='\t').fillna('')

    df_included_kids = df[df['exclude_children'] == False]  # noqa: E712 — column may not be strictly bool
    df_excluded_kids = df[df['exclude_children'] == True]  # noqa: E712
    # Remove duplicates using set()
    terms_included_kids: List[str] = list(set(df_included_kids['term_id']))
    terms_excluded_kids: List[str] = list(set(df_excluded_kids['term_id']))

    # Query
    # TODO: see if these jinja files are necessary. also, do they capture direct kids or all kids?
    results_df_excluded_kids: pd.DataFrame = sparql_jinja2_file_query__via_robot(
        terms_excluded_kids, sparql_child_exclusion_path)
    results_df_included_kids: pd.DataFrame = sparql_jinja2_file_query__via_robot(
        terms_included_kids, sparql_child_inclusion_path)

    # Massage
    # # Convert URI back to prefix
    uri_prefix = 'http://purl.bioontology.org/ontology/ICD10CM/'

    def _to_curie(uri: str) -> str:
        """Collapse a full ICD10CM URI into its CURIE form."""
        return uri.replace(uri_prefix, 'ICD10CM:')

    results_df_excluded_kids['term_id'] = results_df_excluded_kids['term_id'].apply(_to_curie)
    results_df_included_kids['term_id'] = results_df_included_kids['term_id'].apply(_to_curie)
    results_df_included_kids['child_id'] = results_df_included_kids['child_id'].apply(_to_curie)
    # # Capture source information
    results_df_all = pd.concat([results_df_included_kids, results_df_excluded_kids]).fillna('')
    results_df_all['term_info_source'] = 'icd10cm.ttl'
    # TODO: fix x/y (*Can I re-use `left_join_update()` from valueSetTools?)
    # # Determine which terms were not in source
    terms_in_icd10cm_ttl = set(results_df_all['term_id'])
    df['term_in_source_icd10cm.ttl'] = df['term_id'].apply(
        lambda term_id: term_id in terms_in_icd10cm_ttl)

    # Join
    df3 = pd.merge(df, results_df_all, how='left', on='term_id').fillna('')
    print(list(df3.columns))  # TODO temp

    # Restructure
    # NOTE(review): 'term_range_id'/'term_range_label' are not produced by the load or
    # queries above — this will KeyError until those columns are sourced (WIP).
    df4 = df3
    df4['PK_unique_path'] = \
        df4.apply(lambda row: row['term_range_id'].replace('ICD10CM:', '') + '_' + row['term_id'].replace(
            'ICD10CM:', '') + '_' + row['child_id'].replace('ICD10CM:', ''), axis=1)
    # Strip a single trailing '_' left when child_id was empty.
    df4['PK_unique_path'] = df4['PK_unique_path'].apply(lambda x: x[:-1] if x.endswith('_') else x)
    df4 = df4.rename(columns={
        'exclusion_reason': 'child_exclude_reason',
        'exclude_children': 'child_exclude'})
    df4 = df4[[
        'PK_unique_path', 'term_range_id', 'term_range_label', 'term_id', 'term_label', 'child_id',
        'child_label', 'term_info_source', 'term_in_source_icd10cm.ttl', 'child_exclude', 'child_exclude_reason']]
    df4 = df4.sort_values(['PK_unique_path', 'term_range_id', 'term_id', 'child_id'])

    # todo: change path
    df4.to_csv('~/Desktop/results.csv', index=False)

    return df4
    # Removed: an unreachable `return results` that referenced an undefined name.
|
||
|
||
def get_parser() -> ArgumentParser:
    """Build and return the argument parser for this script's CLI."""
    description = (
        'Takes in a full ontology and an exclusions TSV and extracts a simple list of terms.\n'
        'Outputs two files:\n'
        '1. a simple list of terms (config/ONTO_NAME_term_exclusions.txt)\n'
        '2. a simple two column file with the term_id and the exclusion reason\n')
    parser = ArgumentParser(description=description)
    parser.add_argument(
        '-o', '--onto-path',
        help='Path to the ontology file to query.')
    parser.add_argument(
        '-e', '--exclusions-path',
        help='Path to a TSV which should have the following fields: `term_id` (str), `term_label` (str), '
             '`exclusion_reason` (str), and `exclude_children` (bool).')
    return parser
|
||
|
||
def cli(): | ||
"""Command line interface.""" | ||
parser = get_parser() | ||
kwargs = parser.parse_args() | ||
kwargs_dict: Dict = vars(kwargs) | ||
|
||
|
||
# TODO: temporary hack; need to get .owl versions of these eventually | ||
onto_file_remappings = { | ||
'mirror/icd10cm.owl': 'mirror/icd10cm.ttl', | ||
'mirror/icd10who.owl': 'mirror/icd10who.ttl' | ||
} | ||
op = kwargs_dict['onto_path'] | ||
kwargs_dict['onto_path'] = onto_file_remappings.get(op, op) | ||
|
||
run(**kwargs_dict) | ||
|
||
|
||
# Execution
if __name__ == '__main__':
    cli()  # run the CLI only when executed as a script, not on import
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Jinja2-templated SPARQL query: for each parent term bound via {{values}}, select
# every class reachable through rdfs:subClassOf, together with its skos:prefLabel.
# {{property_path_qualifier}} is rendered by the calling Python script as '+'
# (strict descendants) or '*' (descendants plus the term itself).
prefix ICD10CM: <http://purl.bioontology.org/ontology/ICD10CM/>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>

select ?term_id ?child_id ?child_label where {
VALUES ?term_id { {{values}} }
?child_id rdfs:subClassOf{{property_path_qualifier}} ?term_id ;
skos:prefLabel ?child_label .
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Jinja2-templated SPARQL query: for each term bound via {{values}}, confirm it is
# declared as an owl:Class and return it with its skos:prefLabel.
# Terms lacking a prefLabel or an owl:Class typing produce no row (dropped silently).
prefix ICD10CM: <http://purl.bioontology.org/ontology/ICD10CM/>
prefix owl: <http://www.w3.org/2002/07/owl#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>

select ?term_id ?term_label where {
VALUES ?term_id { {{values}} }
?term_id a owl:Class;
skos:prefLabel ?term_label .
}