Feature: Exclusion tables
- Updates to existing tables: Added a header row, including previously missing columns, but did not yet populate data in those columns.
- Added new tables for the rest of the ontologies.
- Added a Python script to work with these tables: term_range_expansion.py (WIP)
- Added SPARQL scripts to be called by the Python script.
- Added a make goal to call the Python script.
joeflack4 committed Jun 22, 2022
1 parent 1d862bc commit d96a4d2
Showing 11 changed files with 1,462 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/ontology/config/doid_exclusions.tsv
@@ -1,3 +1,4 @@
term_id term_label exclusion_reason exclude_children
DOID:0040001 shrimp allergy
DOID:0040002 aspirin allergy
DOID:0040003 benzylpenicillin allergy
1,226 changes: 1,226 additions & 0 deletions src/ontology/config/icd10cm_exclusions.tsv

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions src/ontology/config/icd10who_exclusions.tsv
@@ -0,0 +1 @@
term_id term_label exclusion_reason exclude_children
1 change: 1 addition & 0 deletions src/ontology/config/ncit_exclusions.tsv
@@ -0,0 +1 @@
term_id term_label exclusion_reason exclude_children
1 change: 1 addition & 0 deletions src/ontology/config/omim_exclusions.tsv
@@ -1,3 +1,4 @@
term_id term_label exclusion_reason exclude_children
OMIM:108340 aryl hydrocarbon hydroxylase inducibility
OMIM:113721 breast cancer-related regulator of tp53
OMIM:123270 creatine kinase, brain type, ectopic expression of
1 change: 1 addition & 0 deletions src/ontology/config/ordo_exclusions.tsv
@@ -0,0 +1 @@
term_id term_label exclusion_reason exclude_children
1 change: 1 addition & 0 deletions src/ontology/config/snomed_exclusions.tsv
@@ -0,0 +1 @@
term_id term_label exclusion_reason exclude_children
6 changes: 4 additions & 2 deletions src/ontology/mondo-ingest.Makefile
@@ -160,15 +160,17 @@ mappings: sssom $(ALL_MAPPINGS)
#################
# Utils #########
#################
# Documentation for the commands in this section is in: `docs/developer/ordo.md`

# Documentation for `report-mapping-annotations` and `update-jinja-sparql-queries`: `docs/developer/ordo.md`
report-mapping-annotations:
	python3 $(SCRIPTSDIR)/ordo_mapping_annotations/report_mapping_annotations.py

update-jinja-sparql-queries:
	python3 $(SCRIPTSDIR)/ordo_mapping_annotations/create_sparql__ordo_replace_annotation_based_mappings.py
	python3 $(SCRIPTSDIR)/ordo_mapping_annotations/create_sparql__ordo_mapping_annotations_violation.py

config/%_term_exclusions.txt: config/%_term_exclusions.tsv mirror/%.owl
	python3 ../scripts/term_range_expansion.py --exclusions-path $(word 1,$^) --onto-path $(word 2,$^)
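# Example expansion (a sketch; the DOID target name just follows the pattern above and assumes mirror/doid.owl exists):
#   `make config/doid_term_exclusions.txt` would run:
#   python3 ../scripts/term_range_expansion.py --exclusions-path config/doid_term_exclusions.tsv --onto-path mirror/doid.owl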

#################
# Documentation #
#################
207 changes: 207 additions & 0 deletions src/scripts/term_range_expansion.py
@@ -0,0 +1,207 @@
"""Takes in the full ontology and the exclusions tsv and extracts a simple list of terms from it.
Outputs two files:
1. a simple list of terms (config/ONTO_NAME_term_exclusions.txt)
2. a simple two-column file with the term_id and the exclusion reason

# Resources
- GitHub issue: https://github.com/monarch-initiative/mondo-ingest/issues/22

TODOs
- repurpose this: (i) to be generalized, (ii) to do the job of consuming and producing the exclusion table as described in the issue
- possible bug: ICD10CM:C7A-C7A reported as a term, but should be ICD10CM:C7A
"""
import os
import subprocess
from argparse import ArgumentParser
from copy import copy
from typing import Any, Dict, List

from jinja2 import Template
import pandas as pd


# Vars
# # Config
USE_CACHE = False

# # Static
PROJECT_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
SPARQL_DIR = os.path.join(PROJECT_DIR, 'sparql')
CACHE_DIR = os.path.join(PROJECT_DIR, 'cache')
sparql_child_inclusion_path = os.path.join(PROJECT_DIR, 'sparql', 'get-terms-children.sparql.jinja2')
sparql_child_exclusion_path = os.path.join(PROJECT_DIR, 'sparql', 'get-terms.sparql.jinja2')
input_exclusion_term_ranges_csv_path = os.path.join(
    PROJECT_DIR, 'preparation', 'icd10cm_excluded_term_ranges_intensional.csv')
input_icd10cm_ttl_path = os.path.join(PROJECT_DIR, 'preparation', 'icd10cm.ttl')
TERM_RANGE_MAX_VAL = 99


# Functions
# todo: pd.errors.EmptyDataError: I think this happens because Python process needs to stop before results saved?
def sparql_jinja2_file_query__via_robot(
        terms: List[str], query_template_path: str,
        path_in_data_dir=os.path.basename(input_icd10cm_ttl_path), onto_path=input_icd10cm_ttl_path,
        use_cache=USE_CACHE, include_superclass=False
) -> pd.DataFrame:
    """Query ontology using SPARQL query file"""
    # Basic vars
    query_template_filename = os.path.basename(query_template_path)
    results_dirname = path_in_data_dir.replace('/', '-').replace('.', '-') + \
        '__' + query_template_filename.replace('.', '-')
    results_dirpath = os.path.join(CACHE_DIR, 'robot', results_dirname)
    results_filename = 'results.csv'
    command_save_filename = 'command.sh'
    results_path = os.path.join(results_dirpath, results_filename)
    command_save_path = os.path.join(results_dirpath, command_save_filename)
    instantiated_query_path = os.path.join(results_dirpath, 'query.sparql')
    command_str = f'robot query --input {onto_path} --query {instantiated_query_path} {results_path}'

    # Instantiate template
    with open(query_template_path, 'r') as f:
        template_str = f.read()
    template_obj = Template(template_str)
    # https://www.w3.org/TR/sparql11-query/#propertypaths
    instantiated_str = template_obj.render(
        property_path_qualifier='*' if include_superclass else '+',
        values=' '.join(terms))

    # Cache and run
    os.makedirs(results_dirpath, exist_ok=True)
    if not (os.path.exists(results_path) and use_cache):
        with open(instantiated_query_path, 'w') as f:
            f.write(instantiated_str)
        with open(command_save_path, 'w') as f:
            f.write(command_str)
        subprocess.run(command_str.split())

    # Read results and return
    try:
        df = pd.read_csv(results_path).fillna('')
    except pd.errors.EmptyDataError as err:
        os.remove(results_path)  # done so that it doesn't read this from cache
        raise err

    return df
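# Example call (a sketch; the term CURIE is illustrative, borrowed from the module docstring's TODO note):
#   kids_df = sparql_jinja2_file_query__via_robot(['ICD10CM:C7A'], sparql_child_inclusion_path)
#   # -> DataFrame parsed from robot's cached results.csv, e.g. columns term_id / child_id / child_label
#   #    for the children template, or term_id / term_label for get-terms.sparql.jinja2.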


def run(onto_path: str, exclusions_path: str) -> pd.DataFrame:
    """Run"""
    # TODO
    # 1. query the exact terms listed in the exclusion table
    # 2. regex
    # Lastly, the table should support the following trick: If a value in term_label starts with REGEX:, the script
    # should search for all labels in the ontology and exclude them. For example:
    # example table:
    # term_id term_label exclusion_reason exclude_children
    # | REGEX:^.*phenotype$ | phenotype | yes
    # Should match all labels that end in the word phenotype and then exclude all their children.
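    # A minimal sketch of that REGEX trick (not implemented yet; `labels_to_ids` is a hypothetical
    # dict of label -> term_id pulled from the ontology):
    #   import re
    #   pattern = re.compile(term_label.replace('REGEX:', '', 1))
    #   matched_ids = [tid for lbl, tid in labels_to_ids.items() if pattern.match(lbl)]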

    # TODO: Need output TSV:
    # - Fields: `Term ID`, `Exclusion Reason`
    # - First row only: `ID`, `AI rdfs:seeAlso`
    # Prepare
    df = pd.DataFrame()  # todo

    df_included_kids = df[df['exclude_children'] == False]
    df_excluded_kids = df[df['exclude_children'] == True]
    # Remove duplicates using set()
    terms_included_kids: List[str] = list(set(df_included_kids['term_id']))
    terms_excluded_kids: List[str] = list(set(df_excluded_kids['term_id']))

    # Query
    # TODO: see if these jinja files are necessary. also, do they capture direct kids or all kids?
    results_df_excluded_kids: pd.DataFrame = sparql_jinja2_file_query__via_robot(
        terms_excluded_kids, sparql_child_exclusion_path)
    results_df_included_kids: pd.DataFrame = sparql_jinja2_file_query__via_robot(
        terms_included_kids, sparql_child_inclusion_path)

    # Massage
    # # Convert URI back to prefix
    results_df_excluded_kids['term_id'] = results_df_excluded_kids['term_id'].apply(
        lambda x: x.replace('http://purl.bioontology.org/ontology/ICD10CM/', 'ICD10CM:'))
    results_df_included_kids['term_id'] = results_df_included_kids['term_id'].apply(
        lambda x: x.replace('http://purl.bioontology.org/ontology/ICD10CM/', 'ICD10CM:'))
    results_df_included_kids['child_id'] = results_df_included_kids['child_id'].apply(
        lambda x: x.replace('http://purl.bioontology.org/ontology/ICD10CM/', 'ICD10CM:'))
    # # Capture source information
    # TODO attempt 2
    xxx = pd.concat([results_df_included_kids, results_df_excluded_kids]).fillna('')
    xxx['term_info_source'] = 'icd10cm.ttl'
    # TODO: fix x/y (*Can I re-use `left_join_update()` from valueSetTools?)
    # df['term_info_source'] = ''
    # results_df_excluded_kids['term_info_source'] = 'icd10cm.ttl'
    # results_df_included_kids['term_info_source'] = 'icd10cm.ttl'
    # # Determine which factors were not in source
    term_in_icd10cm_ttl = set(xxx['term_id'])
    df['term_in_source_icd10cm.ttl'] = df['term_id'].apply(
        lambda term_id: term_id in term_in_icd10cm_ttl)

    # Join
    # df2 = pd.merge(df, results_df_excluded_kids, how='left', on='term_id')
    df3 = pd.merge(df, xxx, how='left', on='term_id').fillna('')
    print(list(df3.columns))  # TODO temp

    # Restructure
    df4 = df3
    df4['PK_unique_path'] = \
        df4.apply(lambda row: row['term_range_id'].replace('ICD10CM:', '') + '_' + row['term_id'].replace(
            'ICD10CM:', '') + '_' + row['child_id'].replace('ICD10CM:', ''), axis=1)
    df4['PK_unique_path'] = df4['PK_unique_path'].apply(lambda x: x[:len(x) - 1] if x.endswith('_') else x)
    df4 = df4.rename(columns={
        'exclusion_reason': 'child_exclude_reason',
        'exclude_children': 'child_exclude'})
    df4 = df4[[
        'PK_unique_path', 'term_range_id', 'term_range_label', 'term_id', 'term_label', 'child_id',
        'child_label', 'term_info_source', 'term_in_source_icd10cm.ttl', 'child_exclude', 'child_exclude_reason']]
    df4 = df4.sort_values(['PK_unique_path', 'term_range_id', 'term_id', 'child_id'])

    # todo: change path
    df4.to_csv('~/Desktop/results.csv', index=False)

    return df4


def get_parser() -> ArgumentParser:
    """Add required fields to parser."""
    package_description = \
        'Takes in a full ontology and an exclusions TSV and extracts a simple list of terms.\n' \
        'Outputs two files:\n' \
        '1. a simple list of terms (config/ONTO_NAME_term_exclusions.txt)\n' \
        '2. a simple two column file with the term_id and the exclusion reason\n'
    parser = ArgumentParser(description=package_description)

    parser.add_argument(
        '-o', '--onto-path',
        help='Path to the ontology file to query.')

    parser.add_argument(
        '-e', '--exclusions-path',
        help='Path to a TSV which should have the following fields: `term_id` (str), `term_label` (str), '
             '`exclusion_reason` (str), and `exclude_children` (bool).')

    return parser


def cli():
    """Command line interface."""
    parser = get_parser()
    kwargs = parser.parse_args()
    kwargs_dict: Dict = vars(kwargs)

    # TODO: temporary hack; need to get .owl versions of these eventually
    onto_file_remappings = {
        'mirror/icd10cm.owl': 'mirror/icd10cm.ttl',
        'mirror/icd10who.owl': 'mirror/icd10who.ttl'
    }
    op = kwargs_dict['onto_path']
    kwargs_dict['onto_path'] = onto_file_remappings.get(op, op)

    run(**kwargs_dict)


# Execution
if __name__ == '__main__':
    cli()
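The new make goal above drives this script, but it can also be exercised directly. A minimal usage sketch follows (paths are illustrative and assume the relevant mirror and exclusions files exist; cli() remaps the ICD10CM/ICD10WHO .owl paths to .ttl):

# Run from src/ontology/, mirroring the Makefile recipe (illustrative paths):
#   python3 ../scripts/term_range_expansion.py \
#       --exclusions-path config/icd10cm_exclusions.tsv --onto-path mirror/icd10cm.owl
# Programmatic equivalent (assumes src/scripts is on PYTHONPATH):
from term_range_expansion import get_parser, run

args = get_parser().parse_args(
    ['--exclusions-path', 'config/icd10cm_exclusions.tsv', '--onto-path', 'mirror/icd10cm.ttl'])
run(onto_path=args.onto_path, exclusions_path=args.exclusions_path)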
9 changes: 9 additions & 0 deletions src/sparql/get-terms-children.sparql.jinja2
@@ -0,0 +1,9 @@
prefix ICD10CM: <http://purl.bioontology.org/ontology/ICD10CM/>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>

select ?term_id ?child_id ?child_label where {
    VALUES ?term_id { {{values}} }
    ?child_id rdfs:subClassOf{{property_path_qualifier}} ?term_id ;
        skos:prefLabel ?child_label .
}
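In this template, `{{values}}` is filled with a space-separated list of term CURIEs, and `{{property_path_qualifier}}` is rendered by term_range_expansion.py as `+` (proper descendants only) or, when include_superclass is set, `*` (the term itself plus its descendants).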
10 changes: 10 additions & 0 deletions src/sparql/get-terms.sparql.jinja2
@@ -0,0 +1,10 @@
prefix ICD10CM: <http://purl.bioontology.org/ontology/ICD10CM/>
prefix owl: <http://www.w3.org/2002/07/owl#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>

select ?term_id ?term_label where {
    VALUES ?term_id { {{values}} }
    ?term_id a owl:Class ;
        skos:prefLabel ?term_label .
}
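As a concrete illustration of how these templates become plain SPARQL, here is a sketch of the Jinja2 rendering step as term_range_expansion.py performs it (the term CURIE is illustrative, taken from the script's TODO note; robot then executes the rendered query):

# Sketch: render a query template the way sparql_jinja2_file_query__via_robot() does.
from jinja2 import Template

with open('src/sparql/get-terms.sparql.jinja2') as f:
    template = Template(f.read())

# '+' = descendants only; '*' = the term itself plus descendants (this particular template does not use the qualifier).
query = template.render(property_path_qualifier='+', values='ICD10CM:C7A')
print(query)  # a runnable SPARQL query with VALUES ?term_id { ICD10CM:C7A }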
