-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Updates to existing tables: Added header, including missing columns, but did not yet populate data in those columns. - Added new tables for the rest of the ontologies. - Added Python script to work w/ these tables: term_range_expansion.py (WIP) - Added SPARQL scripts to be called by Python script. - Added make goal to call Python script.
- Loading branch information
Showing
11 changed files
with
1,462 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
term_id term_label exclusion_reason exclude_children |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
term_id term_label exclusion_reason exclude_children |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
term_id term_label exclusion_reason exclude_children |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
term_id term_label exclusion_reason exclude_children |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
"""Takes in the full ontology and the exclusions tsv and extracts a simple list of terms from it. | ||
Outputs two files: | ||
1. a simple list of terms (config/ONTO_NAME_term_exclusions.txt) | ||
2. a simple two column file with the term_id and the exclusion reason | ||
# Resources | ||
- GitHub issue: https://github.com/monarch-initiative/mondo-ingest/issues/22 | ||
TODO's | ||
- repurpose this: (i) to be generalized, (ii) to do the job of consuming and producing exclusion table as described in issue | ||
- possible bug: ICD10CM:C7A-C7A reported as term, but should be ICD10CM:C7A | ||
""" | ||
import os | ||
import subprocess | ||
from argparse import ArgumentParser | ||
from copy import copy | ||
from typing import Any, Dict, List | ||
|
||
from jinja2 import Template | ||
import pandas as pd | ||
|
||
|
||
# Vars
# # Config
# When True, re-use previously saved ROBOT query results instead of re-querying.
USE_CACHE = False


# # Static
# Project root: parent of the directory containing this script.
PROJECT_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
SPARQL_DIR = os.path.join(PROJECT_DIR, 'sparql')
# Cached ROBOT queries/results are written beneath this directory.
CACHE_DIR = os.path.join(PROJECT_DIR, 'cache')
# NOTE(review): naming looks inverted — the "children" template is bound to the
# *inclusion* path and the plain "get-terms" template to the *exclusion* path.
# Verify against intended semantics in run().
sparql_child_inclusion_path = os.path.join(PROJECT_DIR, 'sparql', 'get-terms-children.sparql.jinja2')
sparql_child_exclusion_path = os.path.join(PROJECT_DIR, 'sparql', 'get-terms.sparql.jinja2')
input_exclusion_term_ranges_csv_path = os.path.join(
    PROJECT_DIR, 'preparation', 'icd10cm_excluded_term_ranges_intensional.csv')
# Default ontology queried by sparql_jinja2_file_query__via_robot().
input_icd10cm_ttl_path = os.path.join(PROJECT_DIR, 'preparation', 'icd10cm.ttl')
# Upper bound for numeric term-range expansion (e.g. ICD10CM code ranges).
TERM_RANGE_MAX_VAL = 99
|
||
|
||
# Functions | ||
# todo: pd.errors.EmptyDataError: I think this happens because Python process needs to stop before results saved? | ||
def sparql_jinja2_file_query__via_robot(
    terms: List[str], query_template_path: str,
    path_in_data_dir=os.path.basename(input_icd10cm_ttl_path), onto_path=input_icd10cm_ttl_path,
    use_cache=USE_CACHE, include_superclass=False
) -> pd.DataFrame:
    """Query ontology using SPARQL query file.

    Renders the Jinja2 template at `query_template_path` with `terms`, saves the
    instantiated query and the shell command under `CACHE_DIR`, runs the query via the
    ROBOT CLI, and returns the results as a DataFrame.

    :param terms: Term IDs substituted into the template's `{{values}}` VALUES clause.
    :param query_template_path: Path to a `.sparql.jinja2` template file.
    :param path_in_data_dir: Name combined with the template filename to build the
        cache directory name.
    :param onto_path: Ontology file passed to `robot query --input`.
    :param use_cache: If True and a cached results file exists, skip re-running ROBOT.
    :param include_superclass: If True the template's property path qualifier is `*`
        (matches the term itself as well); otherwise `+` (strict descendants only).
    :return: Query results with NaN replaced by ''.
    :raises subprocess.CalledProcessError: If the `robot` command exits non-zero.
    :raises pd.errors.EmptyDataError: If ROBOT produced an empty results file; the bad
        file is deleted first so a rerun will not read it from cache.
    """
    # Basic vars
    query_template_filename = os.path.basename(query_template_path)
    results_dirname = path_in_data_dir.replace('/', '-').replace('.', '-') + \
        '__' + query_template_filename.replace('.', '-')
    results_dirpath = os.path.join(CACHE_DIR, 'robot', results_dirname)
    results_path = os.path.join(results_dirpath, 'results.csv')
    command_save_path = os.path.join(results_dirpath, 'command.sh')
    instantiated_query_path = os.path.join(results_dirpath, 'query.sparql')
    command_str = f'robot query --input {onto_path} --query {instantiated_query_path} {results_path}'

    # Instantiate template
    with open(query_template_path, 'r') as f:
        template_obj = Template(f.read())
    # https://www.w3.org/TR/sparql11-query/#propertypaths
    instantiated_str = template_obj.render(
        property_path_qualifier='*' if include_superclass else '+',
        values=' '.join(terms))

    # Cache and run
    os.makedirs(results_dirpath, exist_ok=True)
    if not (os.path.exists(results_path) and use_cache):
        with open(instantiated_query_path, 'w') as f:
            f.write(instantiated_str)
        with open(command_save_path, 'w') as f:
            f.write(command_str)
        # check=True: fail loudly if ROBOT errors, instead of failing later with a
        # confusing missing/empty results file (see EmptyDataError TODO above).
        subprocess.run(command_str.split(), check=True)

    # Read results and return
    try:
        df = pd.read_csv(results_path).fillna('')
    except pd.errors.EmptyDataError:
        os.remove(results_path)  # done so that it doesn't read this from cache
        raise

    return df
|
||
|
||
def run(onto_path: str, exclusions_path: str) -> pd.DataFrame:
    """Expand the exclusions table by querying the ontology for child terms.

    :param onto_path: Path to the ontology file to query.
        NOTE(review): currently unused — queries fall back to the module-level default
        ontology path; TODO wire this through to the query function.
    :param exclusions_path: Path to a TSV with fields: `term_id` (str), `term_label` (str),
        `exclusion_reason` (str), and `exclude_children` (bool).
    :return: The joined/restructured DataFrame (also written to a CSV; path is a todo).
    """
    # TODO
    # 1. query the exact terms listed in the exclusion table
    # 2. regex
    # Lastly the table should support the following trick: If a value in term_label starts with REGEX:, the script
    # should search for all labels in the ontology and exclude them. For example:
    # example table:
    # term_id term_label exclusion_reason exclude_children
    # | REGEX:^.*phenotype$ | phenotype | yes
    # Should match all labels then end in the word phenotype and then exclude all their children.

    # TODO: Need output TSV:
    # - Fields: `Term ID`, `Exclusion Reason`
    # - First row only: `ID`, `AI rdfs:seeAlso`
    # Prepare
    # Was an empty-DataFrame placeholder, which raised KeyError on the column accesses
    # below; load the exclusions TSV described in the CLI help instead.
    df = pd.read_csv(exclusions_path, sep='\t').fillna('')

    df_included_kids = df[df['exclude_children'] == False]  # noqa: E712 — column may not be strictly bool
    df_excluded_kids = df[df['exclude_children'] == True]  # noqa: E712
    # Remove duplicates using set()
    terms_included_kids: List[str] = list(set(df_included_kids['term_id']))
    terms_excluded_kids: List[str] = list(set(df_excluded_kids['term_id']))

    # Query
    # TODO: see if these jinja files are necessary. also, do they capture direct kids or all kids?
    results_df_excluded_kids: pd.DataFrame = sparql_jinja2_file_query__via_robot(
        terms_excluded_kids, sparql_child_exclusion_path)
    results_df_included_kids: pd.DataFrame = sparql_jinja2_file_query__via_robot(
        terms_included_kids, sparql_child_inclusion_path)

    # Massage
    # # Convert URI back to prefix
    uri_prefix = 'http://purl.bioontology.org/ontology/ICD10CM/'

    def _to_curie(uri: str) -> str:
        """Collapse a full ICD10CM URI into its CURIE form."""
        return uri.replace(uri_prefix, 'ICD10CM:')

    results_df_excluded_kids['term_id'] = results_df_excluded_kids['term_id'].apply(_to_curie)
    results_df_included_kids['term_id'] = results_df_included_kids['term_id'].apply(_to_curie)
    results_df_included_kids['child_id'] = results_df_included_kids['child_id'].apply(_to_curie)
    # # Capture source information
    results_df_all = pd.concat([results_df_included_kids, results_df_excluded_kids]).fillna('')
    results_df_all['term_info_source'] = 'icd10cm.ttl'
    # TODO: fix x/y (*Can I re-use `left_join_update()` from valueSetTools?)
    # # Determine which terms were not in source
    terms_in_icd10cm_ttl = set(results_df_all['term_id'])
    df['term_in_source_icd10cm.ttl'] = df['term_id'].apply(
        lambda term_id: term_id in terms_in_icd10cm_ttl)

    # Join
    df3 = pd.merge(df, results_df_all, how='left', on='term_id').fillna('')
    print(list(df3.columns))  # TODO temp

    # Restructure
    # NOTE(review): 'term_range_id'/'term_range_label' are not produced by the load or
    # queries above — this will KeyError until those columns are sourced (WIP).
    df4 = df3
    df4['PK_unique_path'] = \
        df4.apply(lambda row: row['term_range_id'].replace('ICD10CM:', '') + '_' + row['term_id'].replace(
            'ICD10CM:', '') + '_' + row['child_id'].replace('ICD10CM:', ''), axis=1)
    # Strip a single trailing '_' left when child_id was empty.
    df4['PK_unique_path'] = df4['PK_unique_path'].apply(lambda x: x[:-1] if x.endswith('_') else x)
    df4 = df4.rename(columns={
        'exclusion_reason': 'child_exclude_reason',
        'exclude_children': 'child_exclude'})
    df4 = df4[[
        'PK_unique_path', 'term_range_id', 'term_range_label', 'term_id', 'term_label', 'child_id',
        'child_label', 'term_info_source', 'term_in_source_icd10cm.ttl', 'child_exclude', 'child_exclude_reason']]
    df4 = df4.sort_values(['PK_unique_path', 'term_range_id', 'term_id', 'child_id'])

    # todo: change path
    df4.to_csv('~/Desktop/results.csv', index=False)

    return df4
    # Removed: an unreachable `return results` that referenced an undefined name.
|
||
|
||
def get_parser() -> ArgumentParser:
    """Build and return the argument parser for this script's CLI."""
    description = (
        'Takes in a full ontology and an exclusions TSV and extracts a simple list of terms.\n'
        'Outputs two files:\n'
        '1. a simple list of terms (config/ONTO_NAME_term_exclusions.txt)\n'
        '2. a simple two column file with the term_id and the exclusion reason\n')
    parser = ArgumentParser(description=description)
    parser.add_argument(
        '-o', '--onto-path',
        help='Path to the ontology file to query.')
    parser.add_argument(
        '-e', '--exclusions-path',
        help='Path to a TSV which should have the following fields: `term_id` (str), `term_label` (str), '
             '`exclusion_reason` (str), and `exclude_children` (bool).')
    return parser
|
||
|
||
def cli(): | ||
"""Command line interface.""" | ||
parser = get_parser() | ||
kwargs = parser.parse_args() | ||
kwargs_dict: Dict = vars(kwargs) | ||
|
||
|
||
# TODO: temporary hack; need to get .owl versions of these eventually | ||
onto_file_remappings = { | ||
'mirror/icd10cm.owl': 'mirror/icd10cm.ttl', | ||
'mirror/icd10who.owl': 'mirror/icd10who.ttl' | ||
} | ||
op = kwargs_dict['onto_path'] | ||
kwargs_dict['onto_path'] = onto_file_remappings.get(op, op) | ||
|
||
run(**kwargs_dict) | ||
|
||
|
||
# Execution
if __name__ == '__main__':
    cli()  # run the CLI only when executed as a script, not on import
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
# Jinja2-templated SPARQL query: for each parent term bound via {{values}}, select
# every class reachable through rdfs:subClassOf, together with its skos:prefLabel.
# {{property_path_qualifier}} is rendered by the calling Python script as '+'
# (strict descendants) or '*' (descendants plus the term itself).
prefix ICD10CM: <http://purl.bioontology.org/ontology/ICD10CM/>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>

select ?term_id ?child_id ?child_label where {
VALUES ?term_id { {{values}} }
?child_id rdfs:subClassOf{{property_path_qualifier}} ?term_id ;
skos:prefLabel ?child_label .
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
# Jinja2-templated SPARQL query: for each term bound via {{values}}, confirm it is
# declared as an owl:Class and return it with its skos:prefLabel.
# Terms lacking a prefLabel or an owl:Class typing produce no row (dropped silently).
prefix ICD10CM: <http://purl.bioontology.org/ontology/ICD10CM/>
prefix owl: <http://www.w3.org/2002/07/owl#>
prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
prefix skos: <http://www.w3.org/2004/02/skos/core#>

select ?term_id ?term_label where {
VALUES ?term_id { {{values}} }
?term_id a owl:Class;
skos:prefLabel ?term_label .
}