From 2b6535052ca27cd1336d206681df3165791ae9a9 Mon Sep 17 00:00:00 2001 From: suhana13 Date: Tue, 3 Aug 2021 14:53:16 -0700 Subject: [PATCH 01/28] feat: add disease_ontology.tmcf --- .../diseaseOntology/disease_ontology.tmcf | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 scripts/biomedical/diseaseOntology/disease_ontology.tmcf diff --git a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf new file mode 100644 index 0000000000..0a22eda7dd --- /dev/null +++ b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf @@ -0,0 +1,24 @@ +Node: E:DiseaseOntology->E1 +typeOf: dcs:Disease +dcid: C:DiseaseOntology->dcid +parent: C:DiseaseOntology->subClassOf +diseaseDescription: C:DiseaseOntology->description +alternativeDOIDs : C:DiseaseOntology->hasAlternativeId +diseaseSynonym: C:DiseaseOntology->hasExactSynonym +commonName: C:DiseaseOntology->label +icdoID: C:DiseaseOntology->ICDO +meshID: C:DiseaseOntology->MESH +nciID: C:DiseaseOntology->NCI +snowmedctusID: C:DiseaseOntology->SNOMEDCTUS20200901 +snowmedctusID: C:DiseaseOntology->SNOMEDCTUS20200301 +snowmedctusID: C:DiseaseOntology->SNOMEDCTUS20180301 +snowmedctusID: C:DiseaseOntology->SNOMEDCTUS20190901 +umlscuiID: C:DiseaseOntology->UMLSCUI +icd10CMID: C:DiseaseOntology->ICD10CM +icd9CMID: C:DiseaseOntology->ICD9CM +orDOID: C:DiseaseOntology->ORDO +gardID: C:DiseaseOntology->GARD +omimID: C:DiseaseOntology->OMIM +efoID: C:DiseaseOntology->EFO +keggDiseaseID: C:DiseaseOntology->KEGG +medDraID: C:DiseaseOntology->MEDDRA From e952017daac39ee65ac43ab9485662d292d7fea5 Mon Sep 17 00:00:00 2001 From: suhana13 Date: Tue, 3 Aug 2021 14:53:35 -0700 Subject: [PATCH 02/28] feat: add format_disease_ontology.py --- .../format_disease_ontology.py | 267 ++++++++++++++++++ 1 file changed, 267 insertions(+) create mode 100644 scripts/biomedical/diseaseOntology/format_disease_ontology.py diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py new file mode 100644 index 0000000000..480faeaee3 --- /dev/null +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -0,0 +1,267 @@ +# Copyright 2021 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Author: Suhana Bedi +Date: 08/03/2021 +Name: format_disease_ontology +Description: converts a .owl disease ontology file into +a csv format, creates dcids for each disease and links the dcids +of current MeSH and ICD10 codes to the corresponding properties in +the dataset. +@file_input: input .owl from Human DO database +@file_output: formatted .csv with disease ontology +""" + +from xml.etree import ElementTree +from collections import defaultdict +import pandas as pd +import re +import numpy as np +import datacommons as dc +import sys + + +def format_tag(tag: str) -> str: + """Extract human-readable tag from xml tag + Args: + tag: tag of an element in xml file, + containg human-readable string after '}' + Returns: + tag_readable: human-readble string after '}' + + """ + tag_readable = tag.split("}")[1] + return tag_readable + + +def format_attrib(attrib: dict) -> str: + """Extract text from xml attributes dictionary + Args: + attrib: attribute of an xml element + Returns: + text: extracted text from attribute values, + either after '#' or after the final '/' + if '#' does not exist + """ + attrib = list(attrib.values())[0] + text = None + if "#" in attrib: + text = attrib.split("#")[-1] + else: + text = attrib.split("/")[-1] + return text + + +def parse_do_info(info: list) -> dict: + """Parse owl class childrens + to human-readble dictionary + Args: + info: list of owl class children + Returns: + info_dict: human_readable dictionary + containing information of owl class children + """ + info_dict = defaultdict(list) + for element in info: + tag = format_tag(element.tag) + if element.text == None: + text = format_attrib(element.attrib) + info_dict[tag].append(text) + else: + info_dict[tag].append(element.text) + return info_dict + + +def format_cols(df): + """ + Converts all columns to string type and + replaces all special characters + Args: + df = dataframe to change + Returns: + none + """ + for i, col in enumerate(df.columns): + df[col] = df[col].astype(str) + df[col] = df[col].map(lambda x: re.sub(r'[\([{})\]]', '', x)) + df.iloc[:, i] = df.iloc[:, i].str.replace("'", '') + df.iloc[:, i] = df.iloc[:, i].str.replace('"', '') + df[col] = df[col].replace('nan', np.nan) + df['id'] = df['id'].str.replace(':', '_') + + +def col_explode(df): + """ + Splits the hasDbXref column into multiple columns + based on the prefix identifying the database from which + the ID originates + Args: + df = dataframe to change + Returns + df = modified dataframe + """ + df = df.assign(hasDbXref=df.hasDbXref.str.split(",")).explode('hasDbXref') + df[['A', 'B']] = df['hasDbXref'].str.split(':', 1, expand=True) + df['A'] = df['A'].astype(str).map(lambda x: re.sub('[^A-Za-z0-9]+', '', x)) + col_add = list(df['A'].unique()) + for newcol in col_add: + df[newcol] = np.nan + df[newcol] = np.where(df['A'] == newcol, df['B'], np.nan) + df[newcol] = df[newcol].astype(str).replace("nan", np.nan) + return df + + +def shard(list_to_shard, shard_size): + """ + Breaks down a list into smaller + sublists, converts it into an array + and appends the array to the master + list + Args: + list_to_shard = original list + shard_size = size of subist + Returns: + sharded_list = master list with + smaller sublists + """ + sharded_list = [] + for i in range(0, len(list_to_shard), shard_size): + shard = list_to_shard[i:i + shard_size] + arr = np.array(shard) + sharded_list.append(arr) + return sharded_list + + +def col_string(df): + """ + Adds string quotes to columns in a dataframe + Args: + df = dataframe whose columns are modified + Returns: + None + """ + col_add = list(df['A'].unique()) + for newcol in col_add: + df[newcol] = str(newcol) + ":" + df[newcol].astype(str) + col_rep = str(newcol) + ":" + "nan" + df[newcol] = df[newcol].replace(col_rep, np.nan) + col_names = [ + 'hasAlternativeId', 'IAO_0000115', 'hasExactSynonym', 'label', 'ICDO', + 'MESH', 'NCI', 'SNOMEDCTUS20200901', 'UMLSCUI', 'ICD10CM', 'ICD9CM', + 'SNOMEDCTUS20200301', 'ORDO', 'SNOMEDCTUS20180301', 'GARD', 'OMIM', + 'EFO', 'KEGG', 'MEDDRA', 'SNOMEDCTUS20190901' + ] + for col in col_names: + df.update('"' + df[[col]].astype(str) + '"') + + +def mesh_query(df): + """ + Queries the MESH ids present in the dataframe, + on datacommons, fetches their dcids and adds + it to the same column. + Args: + df = dataframe to change + Returns + df = modified dataframe with MESH dcid added + """ + df_temp = df[df.MESH.notnull()] + list_mesh = list(df_temp['MESH']) + arr_mesh = shard(list_mesh, 1000) + for i in range(len(arr_mesh)): + query_str = """ + SELECT DISTINCT ?id ?element_name + WHERE {{ + ?element typeOf MeSHDescriptor . + ?element dcid ?id . + ?element name ?element_name . + ?element name {0} . + }} + """.format(arr_mesh[i]) + result = dc.query(query_str) + result_df = pd.DataFrame(result) + result_df.columns = ['id', 'element_name'] + df.MESH.update(df.MESH.map(result_df.set_index('element_name').id)) + return df + + +def icd10_query(df): + """ + Queries the ICD10 ids present in the dataframe, + on datacommons, fetches their dcids and adds + it to the same column. + Args: + df = dataframe to change + Returns + df = modified dataframe with ICD dcid added + """ + df_temp = df[df.ICD10CM.notnull()] + list_icd10 = "ICD10/" + df_temp['ICD10CM'].astype(str) + arr_icd10 = shard(list_icd10, 1000) + for i in range(len(arr_icd10)): + query_str = """ + SELECT DISTINCT ?id + WHERE {{ + ?element typeOf ICD10Code . + ?element dcid ?id . + ?element dcid {0} . + }} + """.format(arr_icd10[i]) + result1 = dc.query(query_str) + result1_df = pd.DataFrame(result1) + result1_df['element'] = result1_df['?id'].str.split(pat="/").str[1] + result1_df.columns = ['id', 'element'] + df.ICD10CM.update(df.ICD10CM.map(result1_df.set_index('element').id)) + return df + + +def main(): + file_input = sys.argv[1] + file_output = sys.argv[2] + # Read disease ontology .owl file + tree = ElementTree.parse(file_input) + # Get file root + root = tree.getroot() + # Find owl classes elements + all_classes = root.findall('{http://www.w3.org/2002/07/owl#}Class') + # Parse owl classes to human-readble dictionary format + parsed_owl_classes = [] + for owl_class in all_classes: + info = list(owl_class.getiterator()) + parsed_owl_classes.append(parse_do_info(info)) + # Convert to pandas Dataframe + df_do = pd.DataFrame(parsed_owl_classes) + format_cols(df_do) + df_do = df_do.drop([ + 'Class', 'exactMatch', 'deprecated', 'hasRelatedSynonym', 'comment', + 'OBI_9991118', 'narrowMatch', 'hasBroadSynonym', 'disjointWith', + 'hasNarrowSynonym', 'broadMatch', 'created_by', 'creation_date', + 'inSubset', 'hasOBONamespace' + ], + axis=1) + df_do = col_explode(df_do) + df_do = mesh_query(df_do) + df_do = icd10_query(df_do) + col_string(df_do) + df_do = df_do.drop(['A', 'B', 'nan', 'hasDbXref', 'KEGG'], axis=1) + df_do = df_do.drop_duplicates(subset='id', keep="last") + df_do = df_do.reset_index(drop=True) + df_do = df_do.replace('"nan"', np.nan) + #generate dcids + df_do['id'] = "bio/DOID_" + df_do['id'] + df_do.to_csv(file_output) + + +if __name__ == '__main__': + main() From bc28ccac02181a10c3f0d77a0b07ad31e346f3f7 Mon Sep 17 00:00:00 2001 From: suhana13 Date: Tue, 3 Aug 2021 14:53:49 -0700 Subject: [PATCH 03/28] feat: add README --- scripts/biomedical/diseaseOntology/README.md | 50 ++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 scripts/biomedical/diseaseOntology/README.md diff --git a/scripts/biomedical/diseaseOntology/README.md b/scripts/biomedical/diseaseOntology/README.md new file mode 100644 index 0000000000..ef90167e14 --- /dev/null +++ b/scripts/biomedical/diseaseOntology/README.md @@ -0,0 +1,50 @@ +# Importing the Disease Ontology (DO) data + +## Table of Contents + +- [Importing the Disease Ontology (DO) data](#importing-the-disease-ontology-do-data) + - [Table of Contents](#table-of-contents) + - [About the Dataset](#about-the-dataset) + - [About the import](#about-the-import) + +## About the Dataset + +- ### Download URL + +The human disease ontology data can be downloaded from their official github repository [here](https://www.vmh.life/#human/all). The data is in `.owl` format and had to be parsed into a `.csv` format (see [Notes and Caveats](#notes-and-caveats) for additional information on formatting). + +- ### Overview + +This directory stores the script used to convert the dataset obtained from DO into a modified version, for effective ingestion of data into the Data Commons knowledge graph. + +The Disease Ontology database provides a standardized ontology for human diseases, for the purposes of consistency and reusability. It has contains extensive cross mapping of DO terms to other databases, namely, MeSH, ICD, NCI’s thesaurus, SNOMED and OMIM. More information on the database can be found [here](https://disease-ontology.org). + +- ### Schema Overview + +The schema representing reaction, metabolite and microbiome data from VMH is defined in [DO.mcf](https://raw.githubusercontent.com/suhana13/ISB-project/main/combined_list.mcf) and [DO.mcf](https://raw.githubusercontent.com/suhana13/ISB-project/main/combined_list_enum.mcf). + +This dataset contains several instances of the class `Disease` and it has multiple properties namely, "parent", "diseaseDescription", "alternativeDOIDs", "diseaseSynonym", "commonName", "icdoID", "meshID", "nciID", "snowmedctusID", "umlscuiID", "icd10CMID", "icd9CMID", "orDOID", "gardID", "omimID", "efoID", "keggDiseaseID", and "medDraID" + +- ### Notes and Caveats + +The data was present in a `.owl` format. So, it had to be carefully parsed into a `.csv` format for its easy ingestion in the data commons knowledge graph. The parsing might have added a little to the total runtime of the data processing python script. + +- ### License + +This data is under a Creative Commons Public Domain Dedication [CC0 1.0 Universal license](https://disease-ontology.org/resources/do-resources). + +## About the import + +- ### Artifacts + +- #### Scripts + +`format_disease_ontology.py` + +- ## Examples + +To generate the formatted csv file from owl: + +``` +python format_disease_ontology.py humanDO.owl humanDO.csv +``` From f8efa54ff00d6f6366684367c47be4d4b50dac38 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Thu, 5 Aug 2021 17:07:25 -0700 Subject: [PATCH 04/28] Update README.md --- scripts/biomedical/diseaseOntology/README.md | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/README.md b/scripts/biomedical/diseaseOntology/README.md index ef90167e14..3b0219a40c 100644 --- a/scripts/biomedical/diseaseOntology/README.md +++ b/scripts/biomedical/diseaseOntology/README.md @@ -5,29 +5,29 @@ - [Importing the Disease Ontology (DO) data](#importing-the-disease-ontology-do-data) - [Table of Contents](#table-of-contents) - [About the Dataset](#about-the-dataset) - - [About the import](#about-the-import) + - [About the Import](#about-the-import) ## About the Dataset -- ### Download URL +### Download URL The human disease ontology data can be downloaded from their official github repository [here](https://www.vmh.life/#human/all). The data is in `.owl` format and had to be parsed into a `.csv` format (see [Notes and Caveats](#notes-and-caveats) for additional information on formatting). -- ### Overview - -This directory stores the script used to convert the dataset obtained from DO into a modified version, for effective ingestion of data into the Data Commons knowledge graph. +### Overview The Disease Ontology database provides a standardized ontology for human diseases, for the purposes of consistency and reusability. It has contains extensive cross mapping of DO terms to other databases, namely, MeSH, ICD, NCI’s thesaurus, SNOMED and OMIM. More information on the database can be found [here](https://disease-ontology.org). -- ### Schema Overview +This directory stores the script used to convert the dataset obtained from DO into a modified version, for effective ingestion of data into the Data Commons knowledge graph. + +### Schema Overview The schema representing reaction, metabolite and microbiome data from VMH is defined in [DO.mcf](https://raw.githubusercontent.com/suhana13/ISB-project/main/combined_list.mcf) and [DO.mcf](https://raw.githubusercontent.com/suhana13/ISB-project/main/combined_list_enum.mcf). This dataset contains several instances of the class `Disease` and it has multiple properties namely, "parent", "diseaseDescription", "alternativeDOIDs", "diseaseSynonym", "commonName", "icdoID", "meshID", "nciID", "snowmedctusID", "umlscuiID", "icd10CMID", "icd9CMID", "orDOID", "gardID", "omimID", "efoID", "keggDiseaseID", and "medDraID" -- ### Notes and Caveats +### Notes and Caveats -The data was present in a `.owl` format. So, it had to be carefully parsed into a `.csv` format for its easy ingestion in the data commons knowledge graph. The parsing might have added a little to the total runtime of the data processing python script. +The original format of the data was `.owl` and it was converted to a `.csv` file prior to ingestion into Data Commons. - ### License @@ -35,13 +35,13 @@ This data is under a Creative Commons Public Domain Dedication [CC0 1.0 Universa ## About the import -- ### Artifacts +### Artifacts -- #### Scripts +#### Scripts `format_disease_ontology.py` -- ## Examples +## Examples To generate the formatted csv file from owl: From 9c12a2d6e88d5bffe9f4ecd5d57b7ff4f1ed1f47 Mon Sep 17 00:00:00 2001 From: suhana13 Date: Fri, 6 Aug 2021 07:15:59 -0700 Subject: [PATCH 05/28] feat: add helper function --- .../format_disease_ontology.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index 480faeaee3..9e91320bba 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -225,15 +225,31 @@ def icd10_query(df): df.ICD10CM.update(df.ICD10CM.map(result1_df.set_index('element').id)) return df +def col_drop(df_do): + """ + Drops required columns + Args: + df_do = dataframe to change + Returns + df_do = modified dataframe + """ + df_do = df_do.drop([ + 'Class', 'exactMatch', 'deprecated', 'hasRelatedSynonym', 'comment', + 'OBI_9991118', 'narrowMatch', 'hasBroadSynonym', 'disjointWith', + 'hasNarrowSynonym', 'broadMatch', 'created_by', 'creation_date', + 'inSubset', 'hasOBONamespace' +], + axis=1) + return df_do + + def main(): file_input = sys.argv[1] file_output = sys.argv[2] # Read disease ontology .owl file tree = ElementTree.parse(file_input) - # Get file root root = tree.getroot() - # Find owl classes elements all_classes = root.findall('{http://www.w3.org/2002/07/owl#}Class') # Parse owl classes to human-readble dictionary format parsed_owl_classes = [] @@ -243,13 +259,7 @@ def main(): # Convert to pandas Dataframe df_do = pd.DataFrame(parsed_owl_classes) format_cols(df_do) - df_do = df_do.drop([ - 'Class', 'exactMatch', 'deprecated', 'hasRelatedSynonym', 'comment', - 'OBI_9991118', 'narrowMatch', 'hasBroadSynonym', 'disjointWith', - 'hasNarrowSynonym', 'broadMatch', 'created_by', 'creation_date', - 'inSubset', 'hasOBONamespace' - ], - axis=1) + df_do = col_drop(df_do) df_do = col_explode(df_do) df_do = mesh_query(df_do) df_do = icd10_query(df_do) From 8d4f7f26196c67e6268d4298b6e364bbeaea4d20 Mon Sep 17 00:00:00 2001 From: suhana13 Date: Fri, 6 Aug 2021 07:21:41 -0700 Subject: [PATCH 06/28] fix: nits --- scripts/biomedical/diseaseOntology/README.md | 11 +++++++++-- .../diseaseOntology/format_disease_ontology.py | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/README.md b/scripts/biomedical/diseaseOntology/README.md index 3b0219a40c..8df20c6d07 100644 --- a/scripts/biomedical/diseaseOntology/README.md +++ b/scripts/biomedical/diseaseOntology/README.md @@ -5,7 +5,14 @@ - [Importing the Disease Ontology (DO) data](#importing-the-disease-ontology-do-data) - [Table of Contents](#table-of-contents) - [About the Dataset](#about-the-dataset) - - [About the Import](#about-the-import) + - [Download URL](#download-url) + - [Overview](#overview) + - [Schema Overview](#schema-overview) + - [Notes and Caveats](#notes-and-caveats) + - [About the import](#about-the-import) + - [Artifacts](#artifacts) + - [Scripts](#scripts) + - [Examples](#examples) ## About the Dataset @@ -15,7 +22,7 @@ The human disease ontology data can be downloaded from their official github rep ### Overview -The Disease Ontology database provides a standardized ontology for human diseases, for the purposes of consistency and reusability. It has contains extensive cross mapping of DO terms to other databases, namely, MeSH, ICD, NCI’s thesaurus, SNOMED and OMIM. More information on the database can be found [here](https://disease-ontology.org). +The Disease Ontology database provides a standardized ontology for human diseases, for the purposes of consistency and reusability. It contains extensive cross mapping of DO terms to other databases, namely, MeSH, ICD, NCI’s thesaurus, SNOMED and OMIM. More information on the database can be found [here](https://disease-ontology.org). This directory stores the script used to convert the dataset obtained from DO into a modified version, for effective ingestion of data into the Data Commons knowledge graph. diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index 9e91320bba..99eac20bc3 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -38,7 +38,7 @@ def format_tag(tag: str) -> str: tag: tag of an element in xml file, containg human-readable string after '}' Returns: - tag_readable: human-readble string after '}' + tag_readable: human-readable string after '}' """ tag_readable = tag.split("}")[1] @@ -92,8 +92,8 @@ def format_cols(df): Returns: none """ + df = df.astype(str) for i, col in enumerate(df.columns): - df[col] = df[col].astype(str) df[col] = df[col].map(lambda x: re.sub(r'[\([{})\]]', '', x)) df.iloc[:, i] = df.iloc[:, i].str.replace("'", '') df.iloc[:, i] = df.iloc[:, i].str.replace('"', '') From 6a5cc0c63f1d73d665a613650861afd499b00009 Mon Sep 17 00:00:00 2001 From: suhana13 Date: Mon, 27 Sep 2021 12:26:30 -0500 Subject: [PATCH 07/28] fix: property in tmcf --- scripts/biomedical/diseaseOntology/disease_ontology.tmcf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf index 0a22eda7dd..c86accf1d1 100644 --- a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf +++ b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf @@ -2,7 +2,7 @@ Node: E:DiseaseOntology->E1 typeOf: dcs:Disease dcid: C:DiseaseOntology->dcid parent: C:DiseaseOntology->subClassOf -diseaseDescription: C:DiseaseOntology->description +diseaseDescription: C:DiseaseOntology->IAO_0000115 alternativeDOIDs : C:DiseaseOntology->hasAlternativeId diseaseSynonym: C:DiseaseOntology->hasExactSynonym commonName: C:DiseaseOntology->label @@ -20,5 +20,4 @@ orDOID: C:DiseaseOntology->ORDO gardID: C:DiseaseOntology->GARD omimID: C:DiseaseOntology->OMIM efoID: C:DiseaseOntology->EFO -keggDiseaseID: C:DiseaseOntology->KEGG medDraID: C:DiseaseOntology->MEDDRA From 2aef466732c252a753152fc732c6e93f16c7d00a Mon Sep 17 00:00:00 2001 From: suhana13 Date: Fri, 8 Oct 2021 14:01:50 -0500 Subject: [PATCH 08/28] feat: format cols --- .../format_disease_ontology.py | 56 +++++++++++-------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index 99eac20bc3..f4be675a7f 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -38,7 +38,7 @@ def format_tag(tag: str) -> str: tag: tag of an element in xml file, containg human-readable string after '}' Returns: - tag_readable: human-readable string after '}' + tag_readable: human-readble string after '}' """ tag_readable = tag.split("}")[1] @@ -92,8 +92,8 @@ def format_cols(df): Returns: none """ - df = df.astype(str) for i, col in enumerate(df.columns): + df[col] = df[col].astype(str) df[col] = df[col].map(lambda x: re.sub(r'[\([{})\]]', '', x)) df.iloc[:, i] = df.iloc[:, i].str.replace("'", '') df.iloc[:, i] = df.iloc[:, i].str.replace('"', '') @@ -157,8 +157,8 @@ def col_string(df): col_rep = str(newcol) + ":" + "nan" df[newcol] = df[newcol].replace(col_rep, np.nan) col_names = [ - 'hasAlternativeId', 'IAO_0000115', 'hasExactSynonym', 'label', 'ICDO', - 'MESH', 'NCI', 'SNOMEDCTUS20200901', 'UMLSCUI', 'ICD10CM', 'ICD9CM', + 'hasAlternativeId', 'hasExactSynonym', 'label', 'ICDO', 'MESH', 'NCI', + 'SNOMEDCTUS20200901', 'UMLSCUI', 'ICD10CM', 'ICD9CM', 'SNOMEDCTUS20200301', 'ORDO', 'SNOMEDCTUS20180301', 'GARD', 'OMIM', 'EFO', 'KEGG', 'MEDDRA', 'SNOMEDCTUS20190901' ] @@ -225,31 +225,26 @@ def icd10_query(df): df.ICD10CM.update(df.ICD10CM.map(result1_df.set_index('element').id)) return df -def col_drop(df_do): - """ - Drops required columns - Args: - df_do = dataframe to change - Returns - df_do = modified dataframe - """ - df_do = df_do.drop([ - 'Class', 'exactMatch', 'deprecated', 'hasRelatedSynonym', 'comment', - 'OBI_9991118', 'narrowMatch', 'hasBroadSynonym', 'disjointWith', - 'hasNarrowSynonym', 'broadMatch', 'created_by', 'creation_date', - 'inSubset', 'hasOBONamespace' -], - axis=1) - return df_do +def remove_newline(df): + df.loc[2505, 'IAO_0000115'] = df.loc[2505, 'IAO_0000115'].replace("\\n", "") + df.loc[2860, 'IAO_0000115'] = df.loc[2860, 'IAO_0000115'].replace("\\n", "") + df.loc[2895, 'IAO_0000115'] = df.loc[2895, 'IAO_0000115'].replace("\\n", "") + df.loc[2934, 'IAO_0000115'] = df.loc[2934, 'IAO_0000115'].replace("\\n", "") + df.loc[3036, 'IAO_0000115'] = df.loc[3036, 'IAO_0000115'].replace("\\n", "") + df.loc[11305, 'IAO_0000115'] = df.loc[11305, + 'IAO_0000115'].replace("\\n", "") + return df -def main(): +def wrapper_fun(file_input, file_output): file_input = sys.argv[1] file_output = sys.argv[2] # Read disease ontology .owl file tree = ElementTree.parse(file_input) + # Get file root root = tree.getroot() + # Find owl classes elements all_classes = root.findall('{http://www.w3.org/2002/07/owl#}Class') # Parse owl classes to human-readble dictionary format parsed_owl_classes = [] @@ -259,7 +254,13 @@ def main(): # Convert to pandas Dataframe df_do = pd.DataFrame(parsed_owl_classes) format_cols(df_do) - df_do = col_drop(df_do) + df_do = df_do.drop([ + 'Class', 'exactMatch', 'deprecated', 'hasRelatedSynonym', 'comment', + 'OBI_9991118', 'narrowMatch', 'hasBroadSynonym', 'disjointWith', + 'hasNarrowSynonym', 'broadMatch', 'created_by', 'creation_date', + 'inSubset', 'hasOBONamespace' + ], + axis=1) df_do = col_explode(df_do) df_do = mesh_query(df_do) df_do = icd10_query(df_do) @@ -269,9 +270,18 @@ def main(): df_do = df_do.reset_index(drop=True) df_do = df_do.replace('"nan"', np.nan) #generate dcids - df_do['id'] = "bio/DOID_" + df_do['id'] + df_do['id'] = "bio/" + df_do['id'] + ##df_do.loc[2505, 'IAO_0000115'] = df_do.loc[2505, 'IAO_0000115'].replace("\\n", "") + df_do = remove_newline(df_do) + df_do['IAO_0000115'] = df_do['IAO_0000115'].str.replace("_", " ") df_do.to_csv(file_output) +def main(): + file_input = sys.argv[1] + file_output = sys.argv[2] + wrapper_fun(file_input, file_output) + + if __name__ == '__main__': main() From 4832d53df4a7a8ebf7707f9daea368f83306ba9a Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Fri, 8 Jul 2022 15:08:54 -0500 Subject: [PATCH 09/28] add unittests --- .../diseaseOntology/disease_ontology_test.py | 39 + .../format_disease_ontology.py | 56 +- .../diseaseOntology/unit-tests/test-do.xml | 1758 +++++++++++++++++ .../unit-tests/test-output.csv | 25 + 4 files changed, 1853 insertions(+), 25 deletions(-) create mode 100644 scripts/biomedical/diseaseOntology/disease_ontology_test.py create mode 100644 scripts/biomedical/diseaseOntology/unit-tests/test-do.xml create mode 100644 scripts/biomedical/diseaseOntology/unit-tests/test-output.csv diff --git a/scripts/biomedical/diseaseOntology/disease_ontology_test.py b/scripts/biomedical/diseaseOntology/disease_ontology_test.py new file mode 100644 index 0000000000..45bc4eb9d0 --- /dev/null +++ b/scripts/biomedical/diseaseOntology/disease_ontology_test.py @@ -0,0 +1,39 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Author: Suhana Bedi +Date: 07/08/2022 +Name: disease_ontology_test.py +Description: runs unit tests for format_disease_ontology.py +Run: python3 disease_ontology_test.py +''' + +import unittest +from pandas.testing import assert_frame_equal +from format_disease_ontology import * + +class TestParseMesh(unittest.TestCase): + """Test the functions in format_disease_ontology""" + + def test_main(self): + """Test in the main function""" + # Read in the expected output files into pandas dataframes + df1_expected = pd.read_csv('unit-tests/test-output.csv') + df_actual = wrapper_fun('unit-tests/test-do.xml') + # Run all the functions in format_mesh.py + # Compare expected and actual output files + assert_frame_equal(df1_expected.reset_index(drop=True), df_actual.reset_index(drop=True)) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index f4be675a7f..3e966bf0f5 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -29,6 +29,7 @@ import re import numpy as np import datacommons as dc +import csv import sys @@ -157,13 +158,17 @@ def col_string(df): col_rep = str(newcol) + ":" + "nan" df[newcol] = df[newcol].replace(col_rep, np.nan) col_names = [ - 'hasAlternativeId', 'hasExactSynonym', 'label', 'ICDO', 'MESH', 'NCI', - 'SNOMEDCTUS20200901', 'UMLSCUI', 'ICD10CM', 'ICD9CM', + 'hasAlternativeId', 'hasExactSynonym', 'label', 'ICDO', 'MESH', 'NCI', + 'SNOMEDCTUS20200901', 'ICD10CM', 'ICD9CM', 'SNOMEDCTUS20200301', 'ORDO', 'SNOMEDCTUS20180301', 'GARD', 'OMIM', 'EFO', 'KEGG', 'MEDDRA', 'SNOMEDCTUS20190901' ] for col in col_names: df.update('"' + df[[col]].astype(str) + '"') + df['UMLSCUI'] = df['UMLSCUI'].astype(str) + df['UMLSCUI'] = df['UMLSCUI'].str.split(':').str[-1] + df['ICD9CM'] = df['ICD9CM'].astype(str) + df['ICD9CM'] = df['ICD9CM'].str.split(':').str[-1] def mesh_query(df): @@ -185,14 +190,14 @@ def mesh_query(df): WHERE {{ ?element typeOf MeSHDescriptor . ?element dcid ?id . - ?element name ?element_name . - ?element name {0} . + ?element descriptorID ?element_name . + ?element descriptorID {0} . }} """.format(arr_mesh[i]) - result = dc.query(query_str) - result_df = pd.DataFrame(result) - result_df.columns = ['id', 'element_name'] - df.MESH.update(df.MESH.map(result_df.set_index('element_name').id)) + result = dc.query(query_str) + result_df = pd.DataFrame(result) + result_df.columns = ['MESH_id', 'element_name'] + df = pd.merge(result_df, df, left_on='element_name', right_on='MESH', how = "right") return df @@ -218,11 +223,11 @@ def icd10_query(df): ?element dcid {0} . }} """.format(arr_icd10[i]) - result1 = dc.query(query_str) - result1_df = pd.DataFrame(result1) - result1_df['element'] = result1_df['?id'].str.split(pat="/").str[1] - result1_df.columns = ['id', 'element'] - df.ICD10CM.update(df.ICD10CM.map(result1_df.set_index('element').id)) + result1 = dc.query(query_str) + result1_df = pd.DataFrame(result1) + result1_df['element'] = result1_df['?id'].str.split(pat="/").str[1] + result1_df.columns = ['ICD10_id', 'element'] + df = pd.merge(result1_df, df, left_on='element', right_on='ICD10CM', how = "right") return df @@ -232,14 +237,10 @@ def remove_newline(df): df.loc[2895, 'IAO_0000115'] = df.loc[2895, 'IAO_0000115'].replace("\\n", "") df.loc[2934, 'IAO_0000115'] = df.loc[2934, 'IAO_0000115'].replace("\\n", "") df.loc[3036, 'IAO_0000115'] = df.loc[3036, 'IAO_0000115'].replace("\\n", "") - df.loc[11305, 'IAO_0000115'] = df.loc[11305, - 'IAO_0000115'].replace("\\n", "") return df -def wrapper_fun(file_input, file_output): - file_input = sys.argv[1] - file_output = sys.argv[2] +def wrapper_fun(file_input): # Read disease ontology .owl file tree = ElementTree.parse(file_input) # Get file root @@ -249,7 +250,7 @@ def wrapper_fun(file_input, file_output): # Parse owl classes to human-readble dictionary format parsed_owl_classes = [] for owl_class in all_classes: - info = list(owl_class.getiterator()) + info = list(owl_class.iter()) parsed_owl_classes.append(parse_do_info(info)) # Convert to pandas Dataframe df_do = pd.DataFrame(parsed_owl_classes) @@ -269,19 +270,24 @@ def wrapper_fun(file_input, file_output): df_do = df_do.drop_duplicates(subset='id', keep="last") df_do = df_do.reset_index(drop=True) df_do = df_do.replace('"nan"', np.nan) + df_do = df_do.replace("nan", np.nan) #generate dcids df_do['id'] = "bio/" + df_do['id'] - ##df_do.loc[2505, 'IAO_0000115'] = df_do.loc[2505, 'IAO_0000115'].replace("\\n", "") - df_do = remove_newline(df_do) - df_do['IAO_0000115'] = df_do['IAO_0000115'].str.replace("_", " ") - df_do.to_csv(file_output) - + df_do['subClassOf'] = "bio/" + df_do['subClassOf'] + df_do['ICD9CM'] = "ICD10/" + df_do['ICD9CM'].apply(str) + return df_do def main(): file_input = sys.argv[1] file_output = sys.argv[2] - wrapper_fun(file_input, file_output) + df = wrapper_fun(file_input) + df = remove_newline(df) + df['IAO_0000115'] = df['IAO_0000115'].str.replace("_", " ") + df.to_csv(file_output) if __name__ == '__main__': main() + + + diff --git a/scripts/biomedical/diseaseOntology/unit-tests/test-do.xml b/scripts/biomedical/diseaseOntology/unit-tests/test-do.xml new file mode 100644 index 0000000000..244b920bc9 --- /dev/null +++ b/scripts/biomedical/diseaseOntology/unit-tests/test-do.xml @@ -0,0 +1,1758 @@ + + + + + + The Disease Ontology has been developed as a standardized ontology for human disease with the purpose of providing the biomedical community with consistent, reusable and sustainable descriptions of human disease terms, phenotype characteristics and related medical vocabulary disease concepts. + Human Disease Ontology + + disease_ontology + 1.2 + lschriml + The Disease Ontology content is available via the Creative Commons Public Domain Dedication CC0 1.0 Universal license (https://creativecommons.org/publicdomain/zero/1.0/). + + + + + + + + + + + + + + + + + Formal citation, e.g. identifier in external database to indicate / attribute source(s) for the definition. Free text indicate / attribute source(s) for the definition. EXAMPLE: Author Name, URI, MeSH Term C04, PUBMED ID, Wiki uri on 31.01.2007. + + definition source + + + + + Formal citation, e.g. identifier in external database to indicate / attribute source(s) for the definition. Free text indicate / attribute source(s) for the definition. EXAMPLE: Author Name, URI, MeSH Term C04, PUBMED ID, Wiki uri on 31.01.2007. + url:http://purl.obolibrary.org/obo/iao.owl + + + + + + + + Has ontology root term. + has_ontology_root_term + + + + + Has ontology root term. + url:http://purl.obolibrary.org/obo/IAO_0000700 + + + + + + + + + + + + + + DO_AGR_slim + DO_AGR_slim + + + + + + + + + DO_FlyBase_slim + DO_FlyBase_slim + + + + + + + + + DO_GXD_slim + DO_GXD_slim + + + + + + + + + DO_IEDB_slim + DO_IEDB_slim + + + + + + + + + DO_MGI_slim + DO_MGI_slim + + + + + + + + + DO_RAD_slim + DO_RAD_slim + + + + + + + + + DO_cancer_slim + DO_cancer_slim + + + + + + + + + DO_rare_slim + DO_rare_slim + + + + + + + + + GOLD + GOLD + + + + + + + + + NCIthesaurus + NCIthesaurus + + + + + + + + + TopNodes_DOcancerslim + TopNodes_DOcancerslim + + + + + + + + + gram-negative_bacterial_infectious_disease + gram-negative_bacterial_infectious_disease + + + + + + + + + gram-positive_bacterial_infectious_disease + gram-positive_bacterial_infectious_disease + + + + + + + + + sexually_transmitted_infectious_disease + sexually_transmitted_infectious_disease + + + + + + + + + tick-borne_infectious_disease + tick-borne_infectious_disease + + + + + + + + + zoonotic_infectious_disease + zoonotic_infectious_disease + + + + + + + + + dc:date + + + + + + + + Description. + description + + + + + Description. + url:http://purl.org/dc/elements/1.1/description + + + + + + + + Title. + title + + + + + Title. + url:http://purl.org/dc/elements/1.1/title + + + + + + + + The dc:type. + dc:type + + + + + The dc:type. + url:http://purl.org/dc/elements/1.1/type + + + + + + + + License. + license + + + + + License. + url:http://purl.org/dc/terms/license + + + + + + + + Subset property, name of subet. + subset_property + + + + + Subset property, name of subet. + url:http://www.geneontology.org/formats/oboInOwl#SubsetProperty + + + + + + + + auto-generated-by + + + + + + + + Author of the class. + created_by + + + + + Author of the class. + url:http://www.geneontology.org/formats/oboInOwl#created_by + + + + + + + + Date class was created. + creation_date + + + + + Date class was created. + url:http://www.geneontology.org/formats/oboInOwl#creation_date + + + + + + + + Date. + date + + + + + Date. + url:http://purl.org/dc/elements/1.1/date + + + + + + + + Default namespace. + default-namespace + + + + + Default namespace. + url:http://www.geneontology.org/formats/oboInOwl#default-namespace + + + + + + + + ID of merged class. + has_alternative_id + + + + + ID of merged class. + url:http://www.geneontology.org/formats/oboInOwl#hasAlternativeId + + + + + + + + Broad synonym. + has_broad_synonym + + + + + Broad synonym. + url:http://www.geneontology.org/formats/oboInOwl#hasBroadSynonym + + + + + + + + Reference database or publication source. + database_cross_reference + + + + + Reference database or publication source. + url:http://www.geneontology.org/formats/oboInOwl#hasDbXref + + + + + + + + Exact synonym. + has_exact_synonym + + + + + Exact synonym. + url:http://www.geneontology.org/formats/oboInOwl#hasExactSynonym + + + + + + + + Narrow synonym. + has_narrow_synonym + + + + + Narrow synonym. + url:http://www.geneontology.org/formats/oboInOwl#hasNarrowSynonym + + + + + + + + Has OBO format version. + has_obo_format_version + + + + + Has OBO format version. + url:http://www.geneontology.org/formats/oboInOwl#hasOBOFormatVersion + + + + + + + + Name space of the ontology. + disease_ontology + has_obo_namespace + + + + + Name space of the ontology. + url:http://www.geneontology.org/formats/oboInOwl#hasOBONamespace + + + + + + + + Has related synonym. + has_related_synonym + + + + + Has related synonym. + url:http://www.geneontology.org/formats/oboInOwl#hasRelatedSynonym + + + + + + + + An identifier is an information content entity that is the outcome of a dubbing process and is used to refer to one instance of entity shared by a group of people to refer to that individual entity. + id + + + + + An identifier is an information content entity that is the outcome of a dubbing process and is used to refer to one instance of entity shared by a group of people to refer to that individual entity. + url:http://purl.obolibrary.org/obo/IAO_0020000 + + + + + + + + In subset. + in_subset + + + + + In subset. + url:http://www.geneontology.org/formats/oboInOwl#inSubset + + + + + + + + Saved by. + saved-by + + + + + Saved by. + url:http://www.geneontology.org/formats/oboInOwl#saved-by + + + + + + + + Comment. + comment + + + + + Comment. + url:http://www.w3.org/2000/01/rdf-schema#comment + + + + + + + + Is defined by. + rdfs:isDefinedBy + + + + + Is defined by. + url:http://www.w3.org/2000/01/rdf-schema#isDefinedBy + + + + + + + + A human readable name for this class. + + + + + A human readable name for this class. + url:http://www.w3.org/2000/01/rdf-schema#label + url:https://www.w3.org/TR/owl-guide/ + + + + + + + + OWL deprecated. + owl:deprecated + + + + + OWL deprecated. + url:http://www.w3.org/2002/07/owl#deprecated + + + + + + + + Max qualified cardinality. + owl:maxQualifiedCardinality + + + + + Max qualified cardinality. + url:http://www.w3.org/2002/07/owl#maxQualifiedCardinality + + + + + + + + Min qualified cardinality. + owl:minQualifiedCardinality + + + + + Min qualified cardinality. + url:http://www.w3.org/2002/07/owl#minQualifiedCardinality + + + + + + + + Qualified cardinality. + owl:qualifiedCardinality + + + + + Qualified cardinality. + url:http://www.w3.org/2002/07/owl#qualifiedCardinality + + + + + + + + A skos concept mapping used to state a hierarchical mapping link between two concepts where the DO concept is broad and the xref is narrow (represents subtypes). + has broader match + + + + + A skos concept mapping used to state a hierarchical mapping link between two concepts where the DO concept is broad and the xref is narrow (represents subtypes). + url:https://www.w3.org/2009/08/skos-reference/skos.html#broadMatch + + + + + + + + A skos concept mapping used to link two concepts that are sufficiently similar that they can be used interchangeably. + has close match + + + + + A skos concept mapping used to link two concepts that are sufficiently similar that they can be used interchangeably. + url:https://www.w3.org/2009/08/skos-reference/skos.html#closeMatch + + + + + + + + A skos concept mapping used to link two concepts, indicating a high degree of confidence that the concepts can be used interchangeably. + has exact match + + + + + A skos concept mapping used to link two concepts, indicating a high degree of confidence that the concepts can be used interchangeably. + url:https://www.w3.org/2009/08/skos-reference/skos.html#exactMatch + + + + + + + + A skos concept mapping used to state a hierarchical mapping link between two concepts where the DO concept is narrow (DO represents subtypes) and the xref is broad. + has narrower match + + + + + A skos concept mapping used to state a hierarchical mapping link between two concepts where the DO concept is narrow (DO represents subtypes) and the xref is broad. + url:https://www.w3.org/2009/08/skos-reference/skos.html#narrowMatch + + + + + + + + A skos concept mapping used to state an associative mapping link between two concepts. + has related match + + + + + A skos concept mapping used to state an associative mapping link between two concepts. + https://www.w3.org/2009/08/skos-reference/skos.html#relatedMatch + + + + + + + + + + + + + has material basis in + + + + + + + + Type of tissue or cell/the source of the material. + disease_ontology + RO:0001000 + derives_from + + + + + Type of tissue or cell/the source of the material. + DO:lh + + + + + + + + + A spatial quality inhering in a bearer by virtue of the bearer's being located near in space in relation to another entity. + adjacent_to + + + + + + + + A relation that holds between a disease or an organism and a symptom. Symptom(s) associated with a disease. + disease_ontology + RO:0002452 + has_symptom + + + + + + + + Relation defining child to partent inheritance type. + disease_ontology + is_a + + + + + Relation defining child to partent inheritance type. + url:http://geneontology.org/docs/ontology-relations/ + url:http://purl.obolibrary.org/obo/rex#is_a + + + + + + + + + + + + A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels. + DOID:267 + DOID:4508 + ICDO:9120/3 + MESH:D006394 + NCI:C3088 + NCI:C9275 + SNOMEDCT_US_2020_09_01:39000009 + UMLS_CUI:C0018923 + UMLS_CUI:C0854893 + hemangiosarcoma + disease_ontology + DOID:0001816 + + + angiosarcoma + + + + + A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels. + + + + url:http://en.wikipedia.org/wiki/Hemangiosarcoma + url:https://en.wikipedia.org/wiki/Angiosarcoma + url:https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&ns=ncit&code=C3088 + url:https://www.ncbi.nlm.nih.gov/pubmed/23327728 + + + + + + + + + An autoimmune disease of endocrine system that is located_in the pancreas. + MESH:D000081012 + SNOMEDCT_US_2020_09_01:448542008 + UMLS_CUI:C2609129 + disease_ontology + DOID:0040091 + + autoimmune pancreatitis + MESH:D000081012 + + + + + An autoimmune disease of endocrine system that is located_in the pancreas. + + url:https://www.ncbi.nlm.nih.gov/pubmed/19940298 + IEDB:RV + + + + + + disease_ontology + DOID:0050001 + + obsolete Actinomadura madurae infectious disease + true + + + + + + + MESH:D002292 + NCI:C27893 + SNOMEDCT_US_2020_09_01:128668003 + UMLS_CUI:C1266043 + renal cell carcinoma, spindle cell + disease_ontology + DOID:4473 + + sarcomatoid renal cell carcinoma + + + + + + + A bacterial infectious disease has_material_basis_in Bacteria. + ICD9CM:995.91 + SNOMEDCT_US_2020_03_01:10001005 + disease_ontology + DOID:0040085 + + bacterial sepsis + + + + + A bacterial infectious disease has_material_basis_in Bacteria. + + url:https://www.ncbi.nlm.nih.gov/pubmed/20421654 + IEDB:RV + + + + + + + A pemphigus that is characterized by blistered skin as a result of self-reactive T and B cells that target BP180. + ICD10CM:O26.4 + MESH:D006559 + SNOMEDCT_US_2018_03_01:86081009 + disease_ontology + DOID:0040098 + + pemphigus gestationis + + + + + A pemphigus that is characterized by blistered skin as a result of self-reactive T and B cells that target BP180. + + url:https://www.ncbi.nlm.nih.gov/pubmed/16552711 + IEDB:RV + + + + + + + A cephalosporin allergy that has_allergic_trigger ceftriaxone. + SNOMEDCT_US_2020_09_01:294551009 + UMLS_CUI:C0571463 + rocephin allergy + disease_ontology + DOID:0040005 + + ceftriaxone allergy + + + + + A cephalosporin allergy that has_allergic_trigger ceftriaxone. + + url:https://www.ncbi.nlm.nih.gov/pubmed/12833570 + IEDB:RV + + + + + + + A spotted fever that has_material_basis_in Rickettsia conorii subsp israelensis, which is transmitted_by ticks (Rhipicephalus sanguineus). The infection has_symptom fever, has_symptom eschar, has_symptom regional adenopathy, and has_symptom maculopapular rash on extremities. + disease_ontology + Israeli spotted fever + DOID:0050043 + + + + Israeli tick typhus + + + + + A spotted fever that has_material_basis_in Rickettsia conorii subsp israelensis, which is transmitted_by ticks (Rhipicephalus sanguineus). The infection has_symptom fever, has_symptom eschar, has_symptom regional adenopathy, and has_symptom maculopapular rash on extremities. + + + url:http://www.biomedcentral.com/1471-2180/5/11 + url:http://www.cdc.gov/otherspottedfever/index.html + + + + + + + A drug allergy that has_allergic_trigger carbamazepine. + SNOMEDCT_US_2020_09_01:293867002 + UMLS_CUI:C0570787 + Tegretol allergy + carbamazepen allergy + disease_ontology + DOID:0040006 + + carbamazepine allergy + + + + + A drug allergy that has_allergic_trigger carbamazepine. + + url:https://www.ncbi.nlm.nih.gov/pubmed/7602118 + IEDB:RV + + + + + + + An immune system disease that is an exaggerated immune response to allergens, such as insect venom, dust mites, pollen, pet dander, drugs or some foods. + allergic disease + ICD10CM:T78.40 + MESH:D006967 + NCI:C3114 + SNOMEDCT_US_2020_09_01:257550005 + UMLS_CUI:C0020517 + allergic hypersensitivity disease + hypersensitivity + hypersensitivity reaction type I disease + disease_ontology + DOID:1205 + + allergic disease + + + + + An immune system disease that is an exaggerated immune response to allergens, such as insect venom, dust mites, pollen, pet dander, drugs or some foods. + + url:http://en.wikipedia.org/wiki/Allergy + ls:IEDB + + + + + + + A myopathy that is characterized by the presence of tubular aggregates in myofibrils and has_material_basis_in heterozygous mutation in the STIM1 gene on chromosome 11p15. + lschriml + 2015-11-10T16:53:39Z + GARD:3884 + OMIM:160565 + disease_ontology + DOID:0080089 + tubular aggregate myopathy 1 + ORDO:2593 + + + + + A myopathy that is characterized by the presence of tubular aggregates in myofibrils and has_material_basis_in heterozygous mutation in the STIM1 gene on chromosome 11p15. + url:https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5133946/ + + + + + + + A primary ciliary dyskinesia that is characterized by sinusitis, bronchiectasis and situs inversus with dextrocardia resulting from dysfunction of the cilia during embryologic development. + GARD:6815 + MESH:D007619 + NCI:C84797 + ORDO:98861 + SNOMEDCT_US_2020_03_01:42402006 + Kartagener's syndrome + disease_ontology + DOID:0050144 + + + Kartagener syndrome + + + + + A primary ciliary dyskinesia that is characterized by sinusitis, bronchiectasis and situs inversus with dextrocardia resulting from dysfunction of the cilia during embryologic development. + + + + url:http://en.wikipedia.org/wiki/Situs_inversus#Kartagener_syndrome + url:http://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=14.10d&code=C84797 + url:http://rarediseases.info.nih.gov/gard/6815/kartagener-syndrome/resources/1 + url:https://www.ncbi.nlm.nih.gov/pubmed/19529061 + url:https://www.ncbi.nlm.nih.gov/pubmed/23243352 + url:https://www.ncbi.nlm.nih.gov/pubmed/24019633 + url:https://www.ncbi.nlm.nih.gov/pubmed/25633235 + + + + + + + A small intestine cancer that has_material_basis_in cells of the neuroendocrine system. + lschriml + 2014-10-21T12:58:08Z + 'intestinal carcinoid tumour' + OMIM:114900 + disease_ontology + DOID:0050925 + + small intestine carcinoid neuroendocrine tumor + + + + + A small intestine cancer that has_material_basis_in cells of the neuroendocrine system. + + url:http://en.wikipedia.org/wiki/Carcinoid + + + + + + + A pulmonary fibrosis that is characterized by scarring of the lung. + EFO:0000768 + GARD:8609 + ICD10CM:J84.112 + ICD9CM:516.31 + MESH:D054990 + NCI:C35716 + OMIM:178500 + SNOMEDCT_US_2020_09_01:28168000 + UMLS_CUI:C1800706 + FIBROCYSTIC PULMONARY DYSPLASIA + IDIOPATHIC PULMONARY FIBROSIS, FAMILIAL + cryptogenic fibrosing alveolitis + disease_ontology + DOID:0050156 + + OMIM mapping confirmed by DO. [SN]. + idiopathic pulmonary fibrosis + + + + + A pulmonary fibrosis that is characterized by scarring of the lung. + url:https://www.pulmonaryfibrosis.org/life-with-pf/about-ipf + + + + + + + A diabetes mellitus that has_material_basis_in autosomal dominant inheritance of mutations in the MODY genes impacting beta-cell function, typically occurring before 25 years of age and caused by primary insulin secretion defects. + GARD:3697 + ICD10CM:E11.8 + KEGG:04950 + OMIM:606391 + ORDO:552 + MODY + Mason-type diabetes + disease_ontology + DOID:0050524 + Xref MGI. +OMIM mapping confirmed by DO. [SN]. + maturity-onset diabetes of the young + + + + + A diabetes mellitus that has_material_basis_in autosomal dominant inheritance of mutations in the MODY genes impacting beta-cell function, typically occurring before 25 years of age and caused by primary insulin secretion defects. + + url:http://en.wikipedia.org/wiki/Maturity_onset_diabetes_of_the_young + + + + + + + An inflammatory bowel disease characterized by inflammation located_in ileum, has_symptom diarrhea, has_symptom abdominal pain, often in the right lower quadrant, has_symptom weight loss. + emitraka + 2014-09-15T11:40:07Z + ICD10CM:K52.9 + MEDDRA:10021312 + MESH:D007079 + NCI:C84782 + SNOMEDCT_US_2020_09_01:52457000 + UMLS_CUI:C0020877 + Crohn's ileitis + disease_ontology + DOID:0060189 + + ileitis + + + + + An inflammatory bowel disease characterized by inflammation located_in ileum, has_symptom diarrhea, has_symptom abdominal pain, often in the right lower quadrant, has_symptom weight loss. + + url:http://en.wikipedia.org/wiki/Ileitis + url:http://www.ccfa.org/what-are-crohns-and-colitis/what-is-crohns-disease/types-of-crohns-disease.html + + + + + + + An agnosia that is a loss of the ability to map out physical actions in order to repeat them in functional activities. + lschriml + 2011-08-22T12:04:56Z + GARD:5838 + ICD10CM:R48.2 + SNOMEDCT_US_2019_09_01:68345001 + disease_ontology + DOID:0060135 + apraxia + MESH:D001072 + + + + + An agnosia that is a loss of the ability to map out physical actions in order to repeat them in functional activities. + + url:http://en.wikipedia.org/wiki/Agnosia + + + + + + + + A 3MC syndrome that has_material_basis_in autosomal recessive inheritance of homozygous mutation in the collectin subfamily member 11 gene (COLEC11) on chromosome 2p25. + OMIM:265050 + disease_ontology + DOID:0060576 + 3MC syndrome 2 + + + + + A 3MC syndrome that has_material_basis_in autosomal recessive inheritance of homozygous mutation in the collectin subfamily member 11 gene (COLEC11) on chromosome 2p25. + + + url:https://ghr.nlm.nih.gov/condition/3mc-syndrome + url:https://www.omim.org/entry/265050 + + + + + + + A tuberculosis located in the heart. + MESH:D014381 + cardiovascular tuberculosis + disease_ontology + DOID:0060570 + cardiac tuberculosis + + + + + A tuberculosis located in the heart. + url:https://pubmed.ncbi.nlm.nih.gov/28814447/ + + + + + + + A prostate carcinoma that is characterized by continued growth and spread despite the surgical removal of the testes or medical intervention to block androgen production. + MESH:D064129 + NCI:C130234 + disease_ontology + DOID:0080909 + castration-resistant prostate carcinoma + MESH:D064129 + + + + + A prostate carcinoma that is characterized by continued growth and spread despite the surgical removal of the testes or medical intervention to block androgen production. + url:https://www.auanet.org/guidelines/prostate-cancer-castration-resistant-guideline + + + + + + + A corneal disease that is characterized by a triangular tissue growth located_in cornea of the eye that is the result of collagen degeneration and fibrovascular proliferation. + laronhughes + 2010-06-30T02:44:30Z + UMLS_CUI:C0033999 + surfer's eye + disease_ontology + DOID:0002116 + pterygium + + + + + A corneal disease that is characterized by a triangular tissue growth located_in cornea of the eye that is the result of collagen degeneration and fibrovascular proliferation. + + url:https://en.wikipedia.org/wiki/Pterygium_(conjunctiva) + + + + + + + A disease that involving errors in metabolic processes of building or degradation of molecules. + ICD10CM:E88.9 + ICD9CM:277.9 + MESH:D008659 + NCI:C3235 + SNOMEDCT_US_2020_09_01:75934005 + UMLS_CUI:C0025517 + metabolic disease + disease_ontology + DOID:0014667 + + + + disease of metabolism + + + + + A disease that involving errors in metabolic processes of building or degradation of molecules. + + url:http://www.ncbi.nlm.nih.gov/books/NBK22259/ + + + + + + + A renal cell carcinoma that has_material_basis_in cells that appear very pale or clear when examined under microscope. + GARD:9574 + MESH:D002292 + NCI:C4033 + SNOMEDCT_US_2020_09_01:254915003 + UMLS_CUI:C0279702 + Clear cell carcinoma of kidney + clear cell kidney carcinoma + conventional (Clear cell) renal cell carcinoma + conventional renal cell carcinoma + renal clear cell carcinoma + disease_ontology + Clear-cell metastatic renal cell carcinoma + Clear-cell metastatic renal cell carcinoma + DOID:4467 + + + MESH:C538445 added from NeuroDevNet [WAK]. + clear cell renal cell carcinoma + + + + + A renal cell carcinoma that has_material_basis_in cells that appear very pale or clear when examined under microscope. + + + url:http://www.cancer.gov/dictionary?CdrID=45063 + url:https://cancergenome.nih.gov/cancersselected/kidneyclearcell + + + + + Clear-cell metastatic renal cell carcinoma + MESH:C538445 + + + + + + + + A crustacean allergy that has_allergic_trigger shrimp. + disease_ontology + DOID:0040001 + + shrimp allergy + + + + + A crustacean allergy that has_allergic_trigger shrimp. + + url:https://www.ncbi.nlm.nih.gov/pubmed/20471069 + IEDB:RV + + + + + Has obsolescence reason. + has_obsolescence_reason + + + + + Has obsolescence reason. + url:http://purl.obolibrary.org/obo/IAO_0000231 + + + rdfs:seeAlso + See also. + + + + + See also. + url:http://www.w3.org/2000/01/rdf-schema#seeAlso + + + owl:backwardCompatibleWith + Backward compatible with. + + + + + Backward compatible with. + url:http://www.w3.org/2002/07/owl#backwardCompatibleWith + + + owl:incompatibleWith + Incompatible with. + + + + + Incompatible with. + url:http://www.w3.org/2002/07/owl#incompatibleWith + + + owl:priorVersion + Prior version. + + + + + Prior version. + url:http://www.w3.org/2002/07/owl#priorVersion + + + Version info. + owl:versionInfo + + + + + Version info. + url:http://www.w3.org/2002/07/owl#versionInfo + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/scripts/biomedical/diseaseOntology/unit-tests/test-output.csv b/scripts/biomedical/diseaseOntology/unit-tests/test-output.csv new file mode 100644 index 0000000000..cc3f2eff07 --- /dev/null +++ b/scripts/biomedical/diseaseOntology/unit-tests/test-output.csv @@ -0,0 +1,25 @@ +ICD10_id,element,MESH_id,element_name,subClassOf,IAO_0000115,hasAlternativeId,hasExactSynonym,id,label,ICDO,MESH,NCI,SNOMEDCTUS20200901,UMLSCUI,ICD9CM,SNOMEDCTUS20200301,ICD10CM,SNOMEDCTUS20180301,GARD,OMIM,ORDO,EFO,MEDDRA,SNOMEDCTUS20190901 +,,,,bio/DOID_175,A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels.,"""DOID:267, DOID:4508""","""hemangiosarcoma""",bio/DOID_0001816,"""angiosarcoma""",,,,,C0854893,ICD10/nan,,,,,,,,, +,,,,bio/DOID_0060005,An autoimmune disease of endocrine system that is located_in the pancreas.,,,bio/DOID_0040091,"""autoimmune pancreatitis""",,,,,C2609129,ICD10/nan,,,,,,,,, +,,,,,,,,bio/DOID_0050001,"""obsolete Actinomadura madurae infectious disease""",,,,,,ICD10/nan,,,,,,,,, +,,,,bio/DOID_4450,,,"""renal cell carcinoma, spindle cell""",bio/DOID_4473,"""sarcomatoid renal cell carcinoma""",,,,,C1266043,ICD10/nan,,,,,,,,, +,,,,bio/DOID_104,A bacterial infectious disease has_material_basis_in Bacteria.,,,bio/DOID_0040085,"""bacterial sepsis""",,,,,,ICD10/nan,"""SNOMEDCTUS20200301:10001005""",,,,,,,, +,,,,bio/DOID_9182,A pemphigus that is characterized by blistered skin as a result of self-reactive T and B cells that target BP180.,,,bio/DOID_0040098,"""pemphigus gestationis""",,,,,,ICD10/nan,,,"""SNOMEDCTUS20180301:86081009""",,,,,, +,,,,bio/DOID_0040021,A cephalosporin allergy that has_allergic_trigger ceftriaxone.,,"""rocephin allergy""",bio/DOID_0040005,"""ceftriaxone allergy""",,,,,C0571463,ICD10/nan,,,,,,,,, +,,,,bio/DOID_11104,"A spotted fever that has_material_basis_in Rickettsia conorii subsp israelensis, which is transmitted_by ticks Rhipicephalus sanguineus. The infection has_symptom fever, has_symptom eschar, has_symptom regional adenopathy, and has_symptom maculopapular rash on extremities.",,,bio/DOID_0050043,"""Israeli tick typhus""",,,,,,ICD10/nan,,,,,,,,, +,,,,bio/DOID_0060500,A drug allergy that has_allergic_trigger carbamazepine.,,"""Tegretol allergy, carbamazepen allergy""",bio/DOID_0040006,"""carbamazepine allergy""",,,,,C0570787,ICD10/nan,,,,,,,,, +,,,,bio/DOID_2914,"An immune system disease that is an exaggerated immune response to allergens, such as insect venom, dust mites, pollen, pet dander, drugs or some foods.",,"""allergic hypersensitivity disease, hypersensitivity, hypersensitivity reaction type I disease""",bio/DOID_1205,"""allergic disease""",,,,,C0020517,ICD10/nan,,,,,,,,, +,,,,bio/DOID_423,A myopathy that is characterized by the presence of tubular aggregates in myofibrils and has_material_basis_in heterozygous mutation in the STIM1 gene on chromosome 11p15.,,,bio/DOID_0080089,"""tubular aggregate myopathy 1""",,,,,,ICD10/nan,,,,,"""OMIM:160565""",,,, +,,,,bio/DOID_9562,"A primary ciliary dyskinesia that is characterized by sinusitis, bronchiectasis and situs inversus with dextrocardia resulting from dysfunction of the cilia during embryologic development.",,"""Kartageners syndrome""",bio/DOID_0050144,"""Kartagener syndrome""",,,,,,ICD10/nan,"""SNOMEDCTUS20200301:42402006""",,,,,,,, +,,,,bio/DOID_10154,A small intestine cancer that has_material_basis_in cells of the neuroendocrine system.,,,bio/DOID_0050925,"""small intestine carcinoid neuroendocrine tumor""",,,,,,ICD10/nan,,,,,"""OMIM:114900""",,,, +,,,,bio/DOID_3770,A pulmonary fibrosis that is characterized by scarring of the lung.,,"""FIBROCYSTIC PULMONARY DYSPLASIA, IDIOPATHIC PULMONARY FIBROSIS, FAMILIAL, cryptogenic fibrosing alveolitis""",bio/DOID_0050156,"""idiopathic pulmonary fibrosis""",,,,,C1800706,ICD10/nan,,,,,,,,, +,,,,bio/DOID_9351,"A diabetes mellitus that has_material_basis_in autosomal dominant inheritance of mutations in the MODY genes impacting beta-cell function, typically occurring before 25 years of age and caused by primary insulin secretion defects.",,"""MODY, Mason-type diabetes""",bio/DOID_0050524,"""maturity-onset diabetes of the young""",,,,,,ICD10/nan,,,,,,"""ORDO:552""",,, +,,,,bio/DOID_8778,"An inflammatory bowel disease characterized by inflammation located_in ileum, has_symptom diarrhea, has_symptom abdominal pain, often in the right lower quadrant, has_symptom weight loss.",,"""Crohns ileitis""",bio/DOID_0060189,"""ileitis""",,,,,C0020877,ICD10/nan,,,,,,,,, +,,,,bio/DOID_4090,An agnosia that is a loss of the ability to map out physical actions in order to repeat them in functional activities.,,,bio/DOID_0060135,"""apraxia""",,,,,,ICD10/nan,,,,,,,,,"""SNOMEDCTUS20190901:68345001""" +,,,,bio/DOID_0060225,A 3MC syndrome that has_material_basis_in autosomal recessive inheritance of homozygous mutation in the collectin subfamily member 11 gene COLEC11 on chromosome 2p25.,,,bio/DOID_0060576,"""3MC syndrome 2""",,,,,,ICD10/nan,,,,,"""OMIM:265050""",,,, +,,bio/D014381,D014381,bio/DOID_399,A tuberculosis located in the heart.,,,bio/DOID_0060570,"""cardiac tuberculosis""",,"""MESH:D014381""",,,,ICD10/nan,,,,,,,,, +,,,,bio/DOID_10286,A prostate carcinoma that is characterized by continued growth and spread despite the surgical removal of the testes or medical intervention to block androgen production.,,,bio/DOID_0080909,"""castration-resistant prostate carcinoma""",,,"""NCI:C130234""",,,ICD10/nan,,,,,,,,, +,,,,bio/DOID_10124,A corneal disease that is characterized by a triangular tissue growth located_in cornea of the eye that is the result of collagen degeneration and fibrovascular proliferation.,,"""surfers eye""",bio/DOID_0002116,"""pterygium""",,,,,C0033999,ICD10/nan,,,,,,,,, +,,,,bio/DOID_4,A disease that involving errors in metabolic processes of building or degradation of molecules.,,"""metabolic disease""",bio/DOID_0014667,"""disease of metabolism""",,,,,C0025517,ICD10/nan,,,,,,,,, +,,,,bio/DOID_4450,A renal cell carcinoma that has_material_basis_in cells that appear very pale or clear when examined under microscope.,,"""Clear cell carcinoma of kidney, clear cell kidney carcinoma, conventional Clear cell renal cell carcinoma, conventional renal cell carcinoma, renal clear cell carcinoma""",bio/DOID_4467,"""clear cell renal cell carcinoma""",,,,,C0279702,ICD10/nan,,,,,,,,, +,,,,bio/DOID_0060524,A crustacean allergy that has_allergic_trigger shrimp.,,,bio/DOID_0040001,"""shrimp allergy""",,,,,,ICD10/nan,,,,,,,,, \ No newline at end of file From 702e9be36a82715dc8e42f9768e969c47cf7b162 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 25 Jul 2022 23:26:44 -0700 Subject: [PATCH 10/28] Update README.md --- scripts/biomedical/diseaseOntology/README.md | 24 +++++++++----------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/README.md b/scripts/biomedical/diseaseOntology/README.md index 8df20c6d07..dbf00a194e 100644 --- a/scripts/biomedical/diseaseOntology/README.md +++ b/scripts/biomedical/diseaseOntology/README.md @@ -5,38 +5,36 @@ - [Importing the Disease Ontology (DO) data](#importing-the-disease-ontology-do-data) - [Table of Contents](#table-of-contents) - [About the Dataset](#about-the-dataset) - - [Download URL](#download-url) + - [Download Data](#download-data) - [Overview](#overview) - - [Schema Overview](#schema-overview) - [Notes and Caveats](#notes-and-caveats) + - [License](#license) - [About the import](#about-the-import) - [Artifacts](#artifacts) - [Scripts](#scripts) + - [Files](#files) - [Examples](#examples) + -[Run Tests](#run-tests) + -[Import](#import) ## About the Dataset +[Disease Ontology](https://disease-ontology.org) (DO) is a standardized ontology for human disease that was developed "with the purpose of providing the biomedical community with consistent, reusable and sustainable descriptions of human disease terms, phenotype characteristics and related medical vocabulary disease concepts through collaborative efforts of biomedical researchers, coordinated by the University of Maryland School of Medicine, Institute for Genome Sciences. -### Download URL +The Disease Ontology semantically integrates disease and medical vocabularies through extensive cross mapping of DO terms to MeSH, ICD, NCI’s thesaurus, SNOMED and OMIM." + +### Download Data The human disease ontology data can be downloaded from their official github repository [here](https://www.vmh.life/#human/all). The data is in `.owl` format and had to be parsed into a `.csv` format (see [Notes and Caveats](#notes-and-caveats) for additional information on formatting). ### Overview -The Disease Ontology database provides a standardized ontology for human diseases, for the purposes of consistency and reusability. It contains extensive cross mapping of DO terms to other databases, namely, MeSH, ICD, NCI’s thesaurus, SNOMED and OMIM. More information on the database can be found [here](https://disease-ontology.org). - -This directory stores the script used to convert the dataset obtained from DO into a modified version, for effective ingestion of data into the Data Commons knowledge graph. - -### Schema Overview - -The schema representing reaction, metabolite and microbiome data from VMH is defined in [DO.mcf](https://raw.githubusercontent.com/suhana13/ISB-project/main/combined_list.mcf) and [DO.mcf](https://raw.githubusercontent.com/suhana13/ISB-project/main/combined_list_enum.mcf). - -This dataset contains several instances of the class `Disease` and it has multiple properties namely, "parent", "diseaseDescription", "alternativeDOIDs", "diseaseSynonym", "commonName", "icdoID", "meshID", "nciID", "snowmedctusID", "umlscuiID", "icd10CMID", "icd9CMID", "orDOID", "gardID", "omimID", "efoID", "keggDiseaseID", and "medDraID" +This directory stores the script used to download, clean, and convert the Disease Ontology data into a `.csv` format, which is ready for ingestion into the Data Commons knowledge graph alongside a `.tmcf` file that maps the `.csv` to the defined schema. In this import the data is ingested as [Disease](https://datacommons.org/browser/Disease) entities into the graph. ### Notes and Caveats The original format of the data was `.owl` and it was converted to a `.csv` file prior to ingestion into Data Commons. -- ### License +### License This data is under a Creative Commons Public Domain Dedication [CC0 1.0 Universal license](https://disease-ontology.org/resources/do-resources). From 75f925689bfa900c7699e67ce8e56753a606dba0 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 25 Jul 2022 23:43:57 -0700 Subject: [PATCH 11/28] Update README.md Add About the Import section --- scripts/biomedical/diseaseOntology/README.md | 43 +++++++++++++++++++- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/README.md b/scripts/biomedical/diseaseOntology/README.md index dbf00a194e..5636a239c2 100644 --- a/scripts/biomedical/diseaseOntology/README.md +++ b/scripts/biomedical/diseaseOntology/README.md @@ -44,11 +44,50 @@ This data is under a Creative Commons Public Domain Dedication [CC0 1.0 Universa #### Scripts +##### Shell Script + +`download.sh` + +##### Python Script + `format_disease_ontology.py` -## Examples +##### Test Script + +`format_disease_ontology_test.py` + +#### Files + +##### Test File + +`input_file.txt` + +`expected_output_file.txt` + +##### tMCF File + +`my_tmcf_file.tmcf` + + +### Examples + +#### Run Tests + +To test format_refseq_chromosome_id_to_dcid.py run: + +``` +python format_disease_ontology.py input_file.owl expected_output.csv +``` + +#### Import + +1. Download data to scratch/. + +``` +bash download.sh +``` -To generate the formatted csv file from owl: +2. Clean and convert the downloaded Disease Ontology data into `.csv` format ``` python format_disease_ontology.py humanDO.owl humanDO.csv From 1329557b156e02ad6b6b462ba3ba41cd883cc61e Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Sun, 31 Jul 2022 19:44:00 -0700 Subject: [PATCH 12/28] Update .tmcf --- .../diseaseOntology/disease_ontology.tmcf | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf index c86accf1d1..c03152bfa7 100644 --- a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf +++ b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf @@ -1,23 +1,23 @@ Node: E:DiseaseOntology->E1 typeOf: dcs:Disease dcid: C:DiseaseOntology->dcid +name: C:DiseaseOntology->label parent: C:DiseaseOntology->subClassOf -diseaseDescription: C:DiseaseOntology->IAO_0000115 -alternativeDOIDs : C:DiseaseOntology->hasAlternativeId +description: C:DiseaseOntology->IAO_0000115 +alternativeDiseaseOntologyID : C:DiseaseOntology->hasAlternativeId diseaseSynonym: C:DiseaseOntology->hasExactSynonym -commonName: C:DiseaseOntology->label -icdoID: C:DiseaseOntology->ICDO -meshID: C:DiseaseOntology->MESH -nciID: C:DiseaseOntology->NCI -snowmedctusID: C:DiseaseOntology->SNOMEDCTUS20200901 -snowmedctusID: C:DiseaseOntology->SNOMEDCTUS20200301 -snowmedctusID: C:DiseaseOntology->SNOMEDCTUS20180301 -snowmedctusID: C:DiseaseOntology->SNOMEDCTUS20190901 -umlscuiID: C:DiseaseOntology->UMLSCUI -icd10CMID: C:DiseaseOntology->ICD10CM -icd9CMID: C:DiseaseOntology->ICD9CM -orDOID: C:DiseaseOntology->ORDO -gardID: C:DiseaseOntology->GARD +internationalClassificationOfDiseaseID: C:DiseaseOntology->ICDO +medicalSubjectHeadingDescriptorID: C:DiseaseOntology->MESH +nationalCancerInstituteID: C:DiseaseOntology->NCI +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20200901 +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20200301 +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20180301 +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20190901 +unifiedMedicalLanguageSystemConceptUniqueIdentifier: C:DiseaseOntology->UMLSCUI +icd10CMCode: C:DiseaseOntology->ICD10CM +icd9CMCode: C:DiseaseOntology->ICD9CM +orphaNumber: C:DiseaseOntology->ORDO +geneticAndRareDiseasesID: C:DiseaseOntology->GARD omimID: C:DiseaseOntology->OMIM -efoID: C:DiseaseOntology->EFO +experimentalFactorOntologyID: C:DiseaseOntology->EFO medDraID: C:DiseaseOntology->MEDDRA From 8e6f5cee6165b699bd191930e59ae379d2e40bba Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Fri, 5 Aug 2022 11:19:05 -0500 Subject: [PATCH 13/28] update readme --- scripts/biomedical/diseaseOntology/README.md | 29 ++++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/README.md b/scripts/biomedical/diseaseOntology/README.md index 5636a239c2..968dcab1b2 100644 --- a/scripts/biomedical/diseaseOntology/README.md +++ b/scripts/biomedical/diseaseOntology/README.md @@ -18,21 +18,27 @@ -[Import](#import) ## About the Dataset + [Disease Ontology](https://disease-ontology.org) (DO) is a standardized ontology for human disease that was developed "with the purpose of providing the biomedical community with consistent, reusable and sustainable descriptions of human disease terms, phenotype characteristics and related medical vocabulary disease concepts through collaborative efforts of biomedical researchers, coordinated by the University of Maryland School of Medicine, Institute for Genome Sciences. The Disease Ontology semantically integrates disease and medical vocabularies through extensive cross mapping of DO terms to MeSH, ICD, NCI’s thesaurus, SNOMED and OMIM." ### Download Data -The human disease ontology data can be downloaded from their official github repository [here](https://www.vmh.life/#human/all). The data is in `.owl` format and had to be parsed into a `.csv` format (see [Notes and Caveats](#notes-and-caveats) for additional information on formatting). +The human disease ontology data can be downloaded from their official github repository [here](https://github.com/DiseaseOntology/HumanDiseaseOntology/tree/main/src/ontology). The data is in `.owl` format and had to be parsed into a `.csv` format (see [Notes and Caveats](#notes-and-caveats) for additional information on formatting). One can also download the data by simply running the bash script [`download.sh`](download.sh). ### Overview This directory stores the script used to download, clean, and convert the Disease Ontology data into a `.csv` format, which is ready for ingestion into the Data Commons knowledge graph alongside a `.tmcf` file that maps the `.csv` to the defined schema. In this import the data is ingested as [Disease](https://datacommons.org/browser/Disease) entities into the graph. +The disease ontology ID is mapped to other ontologies, namely ICDO (International Classification of Diseases for Oncology), NCI (National Cancer Institute), SNOWMED ( Systematized Nomenclature of Medicine), UMLSCUI (Unified Medical Language System), ORDO (Orphanet Rare Disease Ontology), GARD (Genetic and Rare Diseases), OMIM (Online Mendelian Inheritance in Man), +EFO (Experimental Factor Ontology), MEDDRA (Medical Dictionary for Regulatory Activities) and MeSH (Medical Subject Headings). + +In addition, the data stores the parent class and alternative IDs for the disease of interest. + ### Notes and Caveats -The original format of the data was `.owl` and it was converted to a `.csv` file prior to ingestion into Data Commons. +The original format of the data was `.owl` and it was converted to a `.csv` file prior to ingestion into Data Commons. One of the key issues encountered during the import was that all other ontologies were grouped under the same tag. So, to divide each ontology into its separate group or column, the prefixes for each ID were used. In addition, the disease description tag was misformatted with various special characteristics that had to be programmatically removed. ### License @@ -46,37 +52,36 @@ This data is under a Creative Commons Public Domain Dedication [CC0 1.0 Universa ##### Shell Script -`download.sh` +[`download.sh`](download.sh) downloads the HumanDO owl file in the scratch directory ##### Python Script -`format_disease_ontology.py` +[`format_disease_ontology.py`](format_disease_ontology.py) parses the .owl file and converts it into a .csv with disease ontology mappings to other ontologies. ##### Test Script -`format_disease_ontology_test.py` +[`disease_ontology_test.py`](disease_ontology_test.py) tests the given script on some test data. #### Files ##### Test File -`input_file.txt` +[`test-do.xml`](test-do.xml) contains test data -`expected_output_file.txt` +[`test-output.csv`](test-output.csv) contains the expected output ##### tMCF File -`my_tmcf_file.tmcf` - +[`disease_ontology.tmcf`](disease_ontology.tmcf) contains the tmcf mapping to the csv file, to generate an accurate tmcf-csv pair. ### Examples #### Run Tests -To test format_refseq_chromosome_id_to_dcid.py run: +To test disease_ontology_test.py run: ``` -python format_disease_ontology.py input_file.owl expected_output.csv +python disease_ontology_test.py unit-tests/test-do.owl unit-tests/test-output.owl ``` #### Import @@ -90,5 +95,5 @@ bash download.sh 2. Clean and convert the downloaded Disease Ontology data into `.csv` format ``` -python format_disease_ontology.py humanDO.owl humanDO.csv +python format_disease_ontology.py HumanDO.owl HumanDO.csv ``` From 377841aa0c3e54faaf89b76632262c524801d020 Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Fri, 5 Aug 2022 11:19:48 -0500 Subject: [PATCH 14/28] feat: add download file --- scripts/biomedical/diseaseOntology/download.sh | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 scripts/biomedical/diseaseOntology/download.sh diff --git a/scripts/biomedical/diseaseOntology/download.sh b/scripts/biomedical/diseaseOntology/download.sh new file mode 100644 index 0000000000..a376a7e8f5 --- /dev/null +++ b/scripts/biomedical/diseaseOntology/download.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +mkdir -p scratch; cd scratch +curl -o HumanDO.owl https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/HumanDO.owl From 15cdeb128b7d4ce7f21f1443ef9c191e63578ec4 Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Fri, 5 Aug 2022 11:20:45 -0500 Subject: [PATCH 15/28] add function edits to the script --- .../format_disease_ontology.py | 135 ++++++------------ 1 file changed, 44 insertions(+), 91 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index 3e966bf0f5..13b438c65e 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -1,4 +1,4 @@ -# Copyright 2021 Google LLC +# Copyright 2022 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,7 +13,7 @@ # limitations under the License. """ Author: Suhana Bedi -Date: 08/03/2021 +Date: 08/05/2022 Name: format_disease_ontology Description: converts a .owl disease ontology file into a csv format, creates dcids for each disease and links the dcids @@ -28,8 +28,6 @@ import pandas as pd import re import numpy as np -import datacommons as dc -import csv import sys @@ -97,9 +95,10 @@ def format_cols(df): df[col] = df[col].astype(str) df[col] = df[col].map(lambda x: re.sub(r'[\([{})\]]', '', x)) df.iloc[:, i] = df.iloc[:, i].str.replace("'", '') - df.iloc[:, i] = df.iloc[:, i].str.replace('"', '') df[col] = df[col].replace('nan', np.nan) df['id'] = df['id'].str.replace(':', '_') + df['hasAlternativeId'] = df['hasAlternativeId'].str.replace(':', '_') + def col_explode(df): @@ -120,6 +119,9 @@ def col_explode(df): df[newcol] = np.nan df[newcol] = np.where(df['A'] == newcol, df['B'], np.nan) df[newcol] = df[newcol].astype(str).replace("nan", np.nan) + df['hasAlternativeId'] = df['hasAlternativeId'].str.split(',') + df = df.explode('hasAlternativeId') + return df @@ -143,6 +145,22 @@ def shard(list_to_shard, shard_size): sharded_list.append(arr) return sharded_list +def mesh_separator(df): + """ + Splits the mesh column into mesh descriptor and concept ID + Args: + df: pandas dataframe with mesh column + + Returns: + df: dataframe with split mesh column + """ + + df['meshDescriptor'] = np.where(df['MESH'].str[0] == 'D', df['MESH'], np.nan) + df['meshDescriptor'] = "bio/" + df['meshDescriptor'] + df['meshConcept'] = np.where(df['MESH'].str[0] == 'C', df['MESH'], np.nan) + df['meshConcept'] = "bio/" + df['meshConcept'] + df = df.drop(['MESH'], axis = 1) + return df def col_string(df): """ @@ -152,91 +170,30 @@ def col_string(df): Returns: None """ - col_add = list(df['A'].unique()) - for newcol in col_add: - df[newcol] = str(newcol) + ":" + df[newcol].astype(str) - col_rep = str(newcol) + ":" + "nan" - df[newcol] = df[newcol].replace(col_rep, np.nan) - col_names = [ - 'hasAlternativeId', 'hasExactSynonym', 'label', 'ICDO', 'MESH', 'NCI', - 'SNOMEDCTUS20200901', 'ICD10CM', 'ICD9CM', - 'SNOMEDCTUS20200301', 'ORDO', 'SNOMEDCTUS20180301', 'GARD', 'OMIM', - 'EFO', 'KEGG', 'MEDDRA', 'SNOMEDCTUS20190901' - ] + col_names = ['hasExactSynonym', 'label', 'IAO_0000115'] for col in col_names: df.update('"' + df[[col]].astype(str) + '"') - df['UMLSCUI'] = df['UMLSCUI'].astype(str) - df['UMLSCUI'] = df['UMLSCUI'].str.split(':').str[-1] - df['ICD9CM'] = df['ICD9CM'].astype(str) - df['ICD9CM'] = df['ICD9CM'].str.split(':').str[-1] - - -def mesh_query(df): - """ - Queries the MESH ids present in the dataframe, - on datacommons, fetches their dcids and adds - it to the same column. - Args: - df = dataframe to change - Returns - df = modified dataframe with MESH dcid added - """ - df_temp = df[df.MESH.notnull()] - list_mesh = list(df_temp['MESH']) - arr_mesh = shard(list_mesh, 1000) - for i in range(len(arr_mesh)): - query_str = """ - SELECT DISTINCT ?id ?element_name - WHERE {{ - ?element typeOf MeSHDescriptor . - ?element dcid ?id . - ?element descriptorID ?element_name . - ?element descriptorID {0} . - }} - """.format(arr_mesh[i]) - result = dc.query(query_str) - result_df = pd.DataFrame(result) - result_df.columns = ['MESH_id', 'element_name'] - df = pd.merge(result_df, df, left_on='element_name', right_on='MESH', how = "right") + df[col] = df[col].replace(["\"nan\""],np.nan) return df - -def icd10_query(df): - """ - Queries the ICD10 ids present in the dataframe, - on datacommons, fetches their dcids and adds - it to the same column. - Args: - df = dataframe to change - Returns - df = modified dataframe with ICD dcid added - """ - df_temp = df[df.ICD10CM.notnull()] - list_icd10 = "ICD10/" + df_temp['ICD10CM'].astype(str) - arr_icd10 = shard(list_icd10, 1000) - for i in range(len(arr_icd10)): - query_str = """ - SELECT DISTINCT ?id - WHERE {{ - ?element typeOf ICD10Code . - ?element dcid ?id . - ?element dcid {0} . - }} - """.format(arr_icd10[i]) - result1 = dc.query(query_str) - result1_df = pd.DataFrame(result1) - result1_df['element'] = result1_df['?id'].str.split(pat="/").str[1] - result1_df.columns = ['ICD10_id', 'element'] - df = pd.merge(result1_df, df, left_on='element', right_on='ICD10CM', how = "right") - return df - - def remove_newline(df): df.loc[2505, 'IAO_0000115'] = df.loc[2505, 'IAO_0000115'].replace("\\n", "") df.loc[2860, 'IAO_0000115'] = df.loc[2860, 'IAO_0000115'].replace("\\n", "") df.loc[2895, 'IAO_0000115'] = df.loc[2895, 'IAO_0000115'].replace("\\n", "") df.loc[2934, 'IAO_0000115'] = df.loc[2934, 'IAO_0000115'].replace("\\n", "") df.loc[3036, 'IAO_0000115'] = df.loc[3036, 'IAO_0000115'].replace("\\n", "") + df.loc[11305, 'IAO_0000115'] = df.loc[11305, + 'IAO_0000115'].replace("\\n", "") + return df + +def create_dcid(df): + ##df[newcol] = np.where(df['A'] == newcol, df['B'], np.nan) + col_names = ['id', 'subClassOf', 'hasAlternativeId'] + for col in col_names: + df[col] = "bio/" + df[col] + df[col] = df[col].replace(["bio/nan"],np.nan) + df['ICD10CM'] = "ICD10/" + df['ICD9CM'].astype(str) + df['ICD10CM'] = df["ICD10/"].replace(["ICD10/nan"], np.nan) return df @@ -263,27 +220,23 @@ def wrapper_fun(file_input): ], axis=1) df_do = col_explode(df_do) - df_do = mesh_query(df_do) - df_do = icd10_query(df_do) - col_string(df_do) + df_do = mesh_separator(df_do) + df_do = col_string(df_do) df_do = df_do.drop(['A', 'B', 'nan', 'hasDbXref', 'KEGG'], axis=1) df_do = df_do.drop_duplicates(subset='id', keep="last") df_do = df_do.reset_index(drop=True) df_do = df_do.replace('"nan"', np.nan) - df_do = df_do.replace("nan", np.nan) - #generate dcids - df_do['id'] = "bio/" + df_do['id'] - df_do['subClassOf'] = "bio/" + df_do['subClassOf'] - df_do['ICD9CM'] = "ICD10/" + df_do['ICD9CM'].apply(str) + df_do = create_dcid(df_do) + df_do = remove_newline(df_do) + df_do['IAO_0000115'] = df_do['IAO_0000115'].str.replace("_", " ") return df_do def main(): file_input = sys.argv[1] file_output = sys.argv[2] df = wrapper_fun(file_input) - df = remove_newline(df) - df['IAO_0000115'] = df['IAO_0000115'].str.replace("_", " ") - df.to_csv(file_output) + df.columns = ['diseaseDescription' if x=='IAO_0000115' else x for x in df.columns] + df.to_csv(file_output, doublequote=False, escapechar='\\') if __name__ == '__main__': From 370a2e522fbf1ad86d9b73c13ebceaca5c271934 Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Mon, 19 Sep 2022 18:30:52 -0500 Subject: [PATCH 16/28] fix: ICD10 formatting --- scripts/biomedical/diseaseOntology/format_disease_ontology.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index 13b438c65e..60fbe0436f 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -192,8 +192,8 @@ def create_dcid(df): for col in col_names: df[col] = "bio/" + df[col] df[col] = df[col].replace(["bio/nan"],np.nan) - df['ICD10CM'] = "ICD10/" + df['ICD9CM'].astype(str) - df['ICD10CM'] = df["ICD10/"].replace(["ICD10/nan"], np.nan) + df['ICD10CM'] = "ICD10/" + df['ICD10CM'].astype(str) + df['ICD10CM'] = df['ICD10CM'].replace("ICD10/nan", np.nan) return df From 75dc2d696abc245561f1986ad7dbd2224fc01cf4 Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Mon, 19 Sep 2022 18:46:08 -0500 Subject: [PATCH 17/28] feat: update tmcf --- .../diseaseOntology/disease_ontology.tmcf | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf index c03152bfa7..684b861ddb 100644 --- a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf +++ b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf @@ -1,18 +1,19 @@ Node: E:DiseaseOntology->E1 typeOf: dcs:Disease -dcid: C:DiseaseOntology->dcid +dcid: C:DiseaseOntology->id name: C:DiseaseOntology->label parent: C:DiseaseOntology->subClassOf -description: C:DiseaseOntology->IAO_0000115 +description: C:DiseaseOntology->diseaseDescription alternativeDiseaseOntologyID : C:DiseaseOntology->hasAlternativeId diseaseSynonym: C:DiseaseOntology->hasExactSynonym internationalClassificationOfDiseaseID: C:DiseaseOntology->ICDO -medicalSubjectHeadingDescriptorID: C:DiseaseOntology->MESH +medicalSubjectHeadingDescriptorID: C:DiseaseOntology->meshDescriptor +medicalSubjectHeadingConceptID: C:DiseaseOntology->meshConcept nationalCancerInstituteID: C:DiseaseOntology->NCI -snowmedCT: C:DiseaseOntology->SNOMEDCTUS20200901 +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20210731 snowmedCT: C:DiseaseOntology->SNOMEDCTUS20200301 -snowmedCT: C:DiseaseOntology->SNOMEDCTUS20180301 -snowmedCT: C:DiseaseOntology->SNOMEDCTUS20190901 +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20200901 +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20220630 unifiedMedicalLanguageSystemConceptUniqueIdentifier: C:DiseaseOntology->UMLSCUI icd10CMCode: C:DiseaseOntology->ICD10CM icd9CMCode: C:DiseaseOntology->ICD9CM @@ -21,3 +22,4 @@ geneticAndRareDiseasesID: C:DiseaseOntology->GARD omimID: C:DiseaseOntology->OMIM experimentalFactorOntologyID: C:DiseaseOntology->EFO medDraID: C:DiseaseOntology->MEDDRA + From e783ba93e8bf95b3243c42c312b9111799d5af49 Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Mon, 19 Sep 2022 18:47:21 -0500 Subject: [PATCH 18/28] fix: line number for formatting --- scripts/biomedical/diseaseOntology/format_disease_ontology.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index 60fbe0436f..b2ce77a570 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -182,7 +182,7 @@ def remove_newline(df): df.loc[2895, 'IAO_0000115'] = df.loc[2895, 'IAO_0000115'].replace("\\n", "") df.loc[2934, 'IAO_0000115'] = df.loc[2934, 'IAO_0000115'].replace("\\n", "") df.loc[3036, 'IAO_0000115'] = df.loc[3036, 'IAO_0000115'].replace("\\n", "") - df.loc[11305, 'IAO_0000115'] = df.loc[11305, + df.loc[11305, 'IAO_0000115'] = df.loc[11304, 'IAO_0000115'].replace("\\n", "") return df From 178878412c93b0042a801d68bf4aef934a6e5daa Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Mon, 19 Sep 2022 21:37:17 -0700 Subject: [PATCH 19/28] Update disease_ontology.tmcf --- scripts/biomedical/diseaseOntology/disease_ontology.tmcf | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf index 684b861ddb..592a21cd2f 100644 --- a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf +++ b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf @@ -2,9 +2,9 @@ Node: E:DiseaseOntology->E1 typeOf: dcs:Disease dcid: C:DiseaseOntology->id name: C:DiseaseOntology->label -parent: C:DiseaseOntology->subClassOf +specializationOf: C:DiseaseOntology->subClassOf description: C:DiseaseOntology->diseaseDescription -alternativeDiseaseOntologyID : C:DiseaseOntology->hasAlternativeId +alternativeDiseaseOntologyID: C:DiseaseOntology->hasAlternativeId diseaseSynonym: C:DiseaseOntology->hasExactSynonym internationalClassificationOfDiseaseID: C:DiseaseOntology->ICDO medicalSubjectHeadingDescriptorID: C:DiseaseOntology->meshDescriptor @@ -19,7 +19,6 @@ icd10CMCode: C:DiseaseOntology->ICD10CM icd9CMCode: C:DiseaseOntology->ICD9CM orphaNumber: C:DiseaseOntology->ORDO geneticAndRareDiseasesID: C:DiseaseOntology->GARD -omimID: C:DiseaseOntology->OMIM +onlineMendelianInheritanceInManID: C:DiseaseOntology->OMIM experimentalFactorOntologyID: C:DiseaseOntology->EFO medDraID: C:DiseaseOntology->MEDDRA - From 3db959acb1ee38154c32dc2f2698782fe3c5e2d3 Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Tue, 20 Sep 2022 15:32:10 -0500 Subject: [PATCH 20/28] fix: column formatting --- scripts/biomedical/diseaseOntology/format_disease_ontology.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index b2ce77a570..8db4f4db33 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -172,6 +172,7 @@ def col_string(df): """ col_names = ['hasExactSynonym', 'label', 'IAO_0000115'] for col in col_names: + df[col] = df[col].str.replace('"', "") df.update('"' + df[[col]].astype(str) + '"') df[col] = df[col].replace(["\"nan\""],np.nan) return df From c3eac4af807bdb6f8452db5c8f98769a545b28e6 Mon Sep 17 00:00:00 2001 From: Samantha Piekos Date: Tue, 20 Sep 2022 15:49:46 -0700 Subject: [PATCH 21/28] Update disease_ontology.tmcf --- scripts/biomedical/diseaseOntology/disease_ontology.tmcf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf index 592a21cd2f..64a36a24ee 100644 --- a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf +++ b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf @@ -6,7 +6,7 @@ specializationOf: C:DiseaseOntology->subClassOf description: C:DiseaseOntology->diseaseDescription alternativeDiseaseOntologyID: C:DiseaseOntology->hasAlternativeId diseaseSynonym: C:DiseaseOntology->hasExactSynonym -internationalClassificationOfDiseaseID: C:DiseaseOntology->ICDO +internationalClassificationOfDiseasesOntologyID: C:DiseaseOntology->ICDO medicalSubjectHeadingDescriptorID: C:DiseaseOntology->meshDescriptor medicalSubjectHeadingConceptID: C:DiseaseOntology->meshConcept nationalCancerInstituteID: C:DiseaseOntology->NCI From 526266f2d7e18ee3ba02f8d2beab3be509502472 Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Thu, 22 Sep 2022 09:23:13 -0500 Subject: [PATCH 22/28] add diseaseID column --- .../biomedical/diseaseOntology/disease_ontology.tmcf | 1 + .../diseaseOntology/format_disease_ontology.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf index 64a36a24ee..4aed4f5bdd 100644 --- a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf +++ b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf @@ -2,6 +2,7 @@ Node: E:DiseaseOntology->E1 typeOf: dcs:Disease dcid: C:DiseaseOntology->id name: C:DiseaseOntology->label +diseaseOntologyID: C:DiseaseOntology->diseaseId specializationOf: C:DiseaseOntology->subClassOf description: C:DiseaseOntology->diseaseDescription alternativeDiseaseOntologyID: C:DiseaseOntology->hasAlternativeId diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index 8db4f4db33..af506a4348 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -183,18 +183,19 @@ def remove_newline(df): df.loc[2895, 'IAO_0000115'] = df.loc[2895, 'IAO_0000115'].replace("\\n", "") df.loc[2934, 'IAO_0000115'] = df.loc[2934, 'IAO_0000115'].replace("\\n", "") df.loc[3036, 'IAO_0000115'] = df.loc[3036, 'IAO_0000115'].replace("\\n", "") - df.loc[11305, 'IAO_0000115'] = df.loc[11304, - 'IAO_0000115'].replace("\\n", "") + df.loc[11304, 'IAO_0000115'] = df.loc[11304, 'IAO_0000115'].replace("\\n", "") return df def create_dcid(df): - ##df[newcol] = np.where(df['A'] == newcol, df['B'], np.nan) + df['diseaseId'] = df['id'] + df['diseaseId'] = df['diseaseId'].str.replace("_", ":") col_names = ['id', 'subClassOf', 'hasAlternativeId'] for col in col_names: df[col] = "bio/" + df[col] df[col] = df[col].replace(["bio/nan"],np.nan) df['ICD10CM'] = "ICD10/" + df['ICD10CM'].astype(str) df['ICD10CM'] = df['ICD10CM'].replace("ICD10/nan", np.nan) + df.update('"' + df[['diseaseId']].astype(str) + '"') return df @@ -245,3 +246,7 @@ def main(): + + + + From 10bf338eb9d25711dd4a2ff1154fa20bc9cf97c4 Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Mon, 26 Sep 2022 17:09:40 -0500 Subject: [PATCH 23/28] fix column formatting --- scripts/biomedical/diseaseOntology/format_disease_ontology.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index af506a4348..6fec0ad8c3 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -189,6 +189,7 @@ def remove_newline(df): def create_dcid(df): df['diseaseId'] = df['id'] df['diseaseId'] = df['diseaseId'].str.replace("_", ":") + df['hasAlternativeId'] = df['hasAlternativeId'].str.strip() col_names = ['id', 'subClassOf', 'hasAlternativeId'] for col in col_names: df[col] = "bio/" + df[col] From 010be38f8e26c2a6f0c97317f1b26649458f1e8a Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Mon, 24 Oct 2022 13:03:59 -0500 Subject: [PATCH 24/28] fix unit tests --- .../diseaseOntology/disease_ontology_test.py | 36 +++++++++++++++++-- .../biomedical/diseaseOntology/download.sh | 2 +- .../unit-tests/test-expected.csv | 25 +++++++++++++ 3 files changed, 60 insertions(+), 3 deletions(-) create mode 100644 scripts/biomedical/diseaseOntology/unit-tests/test-expected.csv diff --git a/scripts/biomedical/diseaseOntology/disease_ontology_test.py b/scripts/biomedical/diseaseOntology/disease_ontology_test.py index 45bc4eb9d0..733e530e5a 100644 --- a/scripts/biomedical/diseaseOntology/disease_ontology_test.py +++ b/scripts/biomedical/diseaseOntology/disease_ontology_test.py @@ -29,8 +29,40 @@ class TestParseMesh(unittest.TestCase): def test_main(self): """Test in the main function""" # Read in the expected output files into pandas dataframes - df1_expected = pd.read_csv('unit-tests/test-output.csv') - df_actual = wrapper_fun('unit-tests/test-do.xml') + df1_expected = pd.read_csv('unit-tests/test-expected.csv') + tree = ElementTree.parse('unit-tests/test-do.xml') + # Get file root + root = tree.getroot() + # Find owl classes elements + all_classes = root.findall('{http://www.w3.org/2002/07/owl#}Class') + # Parse owl classes to human-readble dictionary format + parsed_owl_classes = [] + for owl_class in all_classes: + info = list(owl_class.iter()) + parsed_owl_classes.append(parse_do_info(info)) + # Convert to pandas Dataframe + df_do = pd.DataFrame(parsed_owl_classes) + format_cols(df_do) + df_do = df_do.drop([ + 'Class', 'exactMatch', 'deprecated', 'hasRelatedSynonym', 'comment', + 'OBI_9991118', 'narrowMatch', 'hasBroadSynonym', 'disjointWith', + 'hasNarrowSynonym', 'broadMatch', 'created_by', 'creation_date', + 'inSubset', 'hasOBONamespace' + ], + axis=1) + df_do = col_explode(df_do) + df_do = mesh_separator(df_do) + df_do = col_string(df_do) + df_do = df_do.drop(['A', 'B', 'nan', 'hasDbXref', 'KEGG'], axis=1) + df_do = df_do.drop_duplicates(subset='id', keep="last") + df_do = df_do.reset_index(drop=True) + df_do = df_do.replace('"nan"', np.nan) + df_do['IAO_0000115'] = df_do['IAO_0000115'].str.replace("_", " ") + df_actual = df_do.applymap(lambda x: x.replace('"', '') if (isinstance(x, str)) else x) + ## fixes the float to object type conversion by pandas while reading the dataframe + col_list = ['SNOMEDCTUS20200901', 'SNOMEDCTUS20180301', 'SNOMEDCTUS20200301', 'SNOMEDCTUS20190901', 'OMIM', 'ORDO'] + for i in col_list: + df_actual[i] = df_actual[i].astype(float) # Run all the functions in format_mesh.py # Compare expected and actual output files assert_frame_equal(df1_expected.reset_index(drop=True), df_actual.reset_index(drop=True)) diff --git a/scripts/biomedical/diseaseOntology/download.sh b/scripts/biomedical/diseaseOntology/download.sh index a376a7e8f5..4df11e338d 100644 --- a/scripts/biomedical/diseaseOntology/download.sh +++ b/scripts/biomedical/diseaseOntology/download.sh @@ -1,4 +1,4 @@ #!/bin/bash mkdir -p scratch; cd scratch -curl -o HumanDO.owl https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/HumanDO.owl +curl -o HumanDO.owl https://raw.githubusercontent.com/DiseaseOntology/HumanDiseaseOntology/main/src/ontology/HumanDO.owl \ No newline at end of file diff --git a/scripts/biomedical/diseaseOntology/unit-tests/test-expected.csv b/scripts/biomedical/diseaseOntology/unit-tests/test-expected.csv new file mode 100644 index 0000000000..658bf22e92 --- /dev/null +++ b/scripts/biomedical/diseaseOntology/unit-tests/test-expected.csv @@ -0,0 +1,25 @@ +subClassOf,IAO_0000115,hasAlternativeId,hasExactSynonym,id,label,ICDO,NCI,SNOMEDCTUS20200901,UMLSCUI,ICD9CM,SNOMEDCTUS20200301,ICD10CM,SNOMEDCTUS20180301,GARD,OMIM,ORDO,EFO,MEDDRA,SNOMEDCTUS20190901,meshDescriptor,meshConcept +DOID_175,A vascular cancer that derives from the cells that line the walls of blood vessels or lymphatic vessels.," DOID_4508",hemangiosarcoma,DOID_0001816,angiosarcoma,,,,C0854893,,,,,,,,,,,, +DOID_0060005,An autoimmune disease of endocrine system that is located in the pancreas.,,,DOID_0040091,autoimmune pancreatitis,,,,C2609129,,,,,,,,,,,, +,,,,DOID_0050001,obsolete Actinomadura madurae infectious disease,,,,,,,,,,,,,,,, +DOID_4450,,,"renal cell carcinoma, spindle cell",DOID_4473,sarcomatoid renal cell carcinoma,,,,C1266043,,,,,,,,,,,, +DOID_104,A bacterial infectious disease has material basis in Bacteria.,,,DOID_0040085,bacterial sepsis,,,,,,10001005,,,,,,,,,, +DOID_9182,A pemphigus that is characterized by blistered skin as a result of self-reactive T and B cells that target BP180.,,,DOID_0040098,pemphigus gestationis,,,,,,,,86081009,,,,,,,, +DOID_0040021,A cephalosporin allergy that has allergic trigger ceftriaxone.,,rocephin allergy,DOID_0040005,ceftriaxone allergy,,,,C0571463,,,,,,,,,,,, +DOID_11104,"A spotted fever that has material basis in Rickettsia conorii subsp israelensis, which is transmitted by ticks Rhipicephalus sanguineus. The infection has symptom fever, has symptom eschar, has symptom regional adenopathy, and has symptom maculopapular rash on extremities.",,,DOID_0050043,Israeli tick typhus,,,,,,,,,,,,,,,, +DOID_0060500,A drug allergy that has allergic trigger carbamazepine.,,"Tegretol allergy, carbamazepen allergy",DOID_0040006,carbamazepine allergy,,,,C0570787,,,,,,,,,,,, +DOID_2914,"An immune system disease that is an exaggerated immune response to allergens, such as insect venom, dust mites, pollen, pet dander, drugs or some foods.",,"allergic hypersensitivity disease, hypersensitivity, hypersensitivity reaction type I disease",DOID_1205,allergic disease,,,,C0020517,,,,,,,,,,,, +DOID_423,A myopathy that is characterized by the presence of tubular aggregates in myofibrils and has material basis in heterozygous mutation in the STIM1 gene on chromosome 11p15.,,,DOID_0080089,tubular aggregate myopathy 1,,,,,,,,,,160565,,,,,, +DOID_9562,"A primary ciliary dyskinesia that is characterized by sinusitis, bronchiectasis and situs inversus with dextrocardia resulting from dysfunction of the cilia during embryologic development.",,Kartageners syndrome,DOID_0050144,Kartagener syndrome,,,,,,42402006,,,,,,,,,, +DOID_10154,A small intestine cancer that has material basis in cells of the neuroendocrine system.,,,DOID_0050925,small intestine carcinoid neuroendocrine tumor,,,,,,,,,,114900,,,,,, +DOID_3770,A pulmonary fibrosis that is characterized by scarring of the lung.,,"FIBROCYSTIC PULMONARY DYSPLASIA, IDIOPATHIC PULMONARY FIBROSIS, FAMILIAL, cryptogenic fibrosing alveolitis",DOID_0050156,idiopathic pulmonary fibrosis,,,,C1800706,,,,,,,,,,,, +DOID_9351,"A diabetes mellitus that has material basis in autosomal dominant inheritance of mutations in the MODY genes impacting beta-cell function, typically occurring before 25 years of age and caused by primary insulin secretion defects.",,"MODY, Mason-type diabetes",DOID_0050524,maturity-onset diabetes of the young,,,,,,,,,,,552,,,,, +DOID_8778,"An inflammatory bowel disease characterized by inflammation located in ileum, has symptom diarrhea, has symptom abdominal pain, often in the right lower quadrant, has symptom weight loss.",,Crohns ileitis,DOID_0060189,ileitis,,,,C0020877,,,,,,,,,,,, +DOID_4090,An agnosia that is a loss of the ability to map out physical actions in order to repeat them in functional activities.,,,DOID_0060135,apraxia,,,,,,,,,,,,,,68345001,, +DOID_0060225,A 3MC syndrome that has material basis in autosomal recessive inheritance of homozygous mutation in the collectin subfamily member 11 gene COLEC11 on chromosome 2p25.,,,DOID_0060576,3MC syndrome 2,,,,,,,,,,265050,,,,,, +DOID_399,A tuberculosis located in the heart.,,,DOID_0060570,cardiac tuberculosis,,,,,,,,,,,,,,,bio/D014381, +DOID_10286,A prostate carcinoma that is characterized by continued growth and spread despite the surgical removal of the testes or medical intervention to block androgen production.,,,DOID_0080909,castration-resistant prostate carcinoma,,C130234,,,,,,,,,,,,,, +DOID_10124,A corneal disease that is characterized by a triangular tissue growth located in cornea of the eye that is the result of collagen degeneration and fibrovascular proliferation.,,surfers eye,DOID_0002116,pterygium,,,,C0033999,,,,,,,,,,,, +DOID_4,A disease that involving errors in metabolic processes of building or degradation of molecules.,,metabolic disease,DOID_0014667,disease of metabolism,,,,C0025517,,,,,,,,,,,, +DOID_4450,A renal cell carcinoma that has material basis in cells that appear very pale or clear when examined under microscope.,,"Clear cell carcinoma of kidney, clear cell kidney carcinoma, conventional Clear cell renal cell carcinoma, conventional renal cell carcinoma, renal clear cell carcinoma",DOID_4467,clear cell renal cell carcinoma,,,,C0279702,,,,,,,,,,,, +DOID_0060524,A crustacean allergy that has allergic trigger shrimp.,,,DOID_0040001,shrimp allergy,,,,,,,,,,,,,,,, \ No newline at end of file From 03926bc1225fbf325f8c387a5ed09e7cb5490246 Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Mon, 24 Oct 2022 13:04:36 -0500 Subject: [PATCH 25/28] remove old test file --- .../unit-tests/test-output.csv | 25 ------------------- 1 file changed, 25 deletions(-) delete mode 100644 scripts/biomedical/diseaseOntology/unit-tests/test-output.csv diff --git a/scripts/biomedical/diseaseOntology/unit-tests/test-output.csv b/scripts/biomedical/diseaseOntology/unit-tests/test-output.csv deleted file mode 100644 index cc3f2eff07..0000000000 --- a/scripts/biomedical/diseaseOntology/unit-tests/test-output.csv +++ /dev/null @@ -1,25 +0,0 @@ -ICD10_id,element,MESH_id,element_name,subClassOf,IAO_0000115,hasAlternativeId,hasExactSynonym,id,label,ICDO,MESH,NCI,SNOMEDCTUS20200901,UMLSCUI,ICD9CM,SNOMEDCTUS20200301,ICD10CM,SNOMEDCTUS20180301,GARD,OMIM,ORDO,EFO,MEDDRA,SNOMEDCTUS20190901 -,,,,bio/DOID_175,A vascular cancer that derives_from the cells that line the walls of blood vessels or lymphatic vessels.,"""DOID:267, DOID:4508""","""hemangiosarcoma""",bio/DOID_0001816,"""angiosarcoma""",,,,,C0854893,ICD10/nan,,,,,,,,, -,,,,bio/DOID_0060005,An autoimmune disease of endocrine system that is located_in the pancreas.,,,bio/DOID_0040091,"""autoimmune pancreatitis""",,,,,C2609129,ICD10/nan,,,,,,,,, -,,,,,,,,bio/DOID_0050001,"""obsolete Actinomadura madurae infectious disease""",,,,,,ICD10/nan,,,,,,,,, -,,,,bio/DOID_4450,,,"""renal cell carcinoma, spindle cell""",bio/DOID_4473,"""sarcomatoid renal cell carcinoma""",,,,,C1266043,ICD10/nan,,,,,,,,, -,,,,bio/DOID_104,A bacterial infectious disease has_material_basis_in Bacteria.,,,bio/DOID_0040085,"""bacterial sepsis""",,,,,,ICD10/nan,"""SNOMEDCTUS20200301:10001005""",,,,,,,, -,,,,bio/DOID_9182,A pemphigus that is characterized by blistered skin as a result of self-reactive T and B cells that target BP180.,,,bio/DOID_0040098,"""pemphigus gestationis""",,,,,,ICD10/nan,,,"""SNOMEDCTUS20180301:86081009""",,,,,, -,,,,bio/DOID_0040021,A cephalosporin allergy that has_allergic_trigger ceftriaxone.,,"""rocephin allergy""",bio/DOID_0040005,"""ceftriaxone allergy""",,,,,C0571463,ICD10/nan,,,,,,,,, -,,,,bio/DOID_11104,"A spotted fever that has_material_basis_in Rickettsia conorii subsp israelensis, which is transmitted_by ticks Rhipicephalus sanguineus. The infection has_symptom fever, has_symptom eschar, has_symptom regional adenopathy, and has_symptom maculopapular rash on extremities.",,,bio/DOID_0050043,"""Israeli tick typhus""",,,,,,ICD10/nan,,,,,,,,, -,,,,bio/DOID_0060500,A drug allergy that has_allergic_trigger carbamazepine.,,"""Tegretol allergy, carbamazepen allergy""",bio/DOID_0040006,"""carbamazepine allergy""",,,,,C0570787,ICD10/nan,,,,,,,,, -,,,,bio/DOID_2914,"An immune system disease that is an exaggerated immune response to allergens, such as insect venom, dust mites, pollen, pet dander, drugs or some foods.",,"""allergic hypersensitivity disease, hypersensitivity, hypersensitivity reaction type I disease""",bio/DOID_1205,"""allergic disease""",,,,,C0020517,ICD10/nan,,,,,,,,, -,,,,bio/DOID_423,A myopathy that is characterized by the presence of tubular aggregates in myofibrils and has_material_basis_in heterozygous mutation in the STIM1 gene on chromosome 11p15.,,,bio/DOID_0080089,"""tubular aggregate myopathy 1""",,,,,,ICD10/nan,,,,,"""OMIM:160565""",,,, -,,,,bio/DOID_9562,"A primary ciliary dyskinesia that is characterized by sinusitis, bronchiectasis and situs inversus with dextrocardia resulting from dysfunction of the cilia during embryologic development.",,"""Kartageners syndrome""",bio/DOID_0050144,"""Kartagener syndrome""",,,,,,ICD10/nan,"""SNOMEDCTUS20200301:42402006""",,,,,,,, -,,,,bio/DOID_10154,A small intestine cancer that has_material_basis_in cells of the neuroendocrine system.,,,bio/DOID_0050925,"""small intestine carcinoid neuroendocrine tumor""",,,,,,ICD10/nan,,,,,"""OMIM:114900""",,,, -,,,,bio/DOID_3770,A pulmonary fibrosis that is characterized by scarring of the lung.,,"""FIBROCYSTIC PULMONARY DYSPLASIA, IDIOPATHIC PULMONARY FIBROSIS, FAMILIAL, cryptogenic fibrosing alveolitis""",bio/DOID_0050156,"""idiopathic pulmonary fibrosis""",,,,,C1800706,ICD10/nan,,,,,,,,, -,,,,bio/DOID_9351,"A diabetes mellitus that has_material_basis_in autosomal dominant inheritance of mutations in the MODY genes impacting beta-cell function, typically occurring before 25 years of age and caused by primary insulin secretion defects.",,"""MODY, Mason-type diabetes""",bio/DOID_0050524,"""maturity-onset diabetes of the young""",,,,,,ICD10/nan,,,,,,"""ORDO:552""",,, -,,,,bio/DOID_8778,"An inflammatory bowel disease characterized by inflammation located_in ileum, has_symptom diarrhea, has_symptom abdominal pain, often in the right lower quadrant, has_symptom weight loss.",,"""Crohns ileitis""",bio/DOID_0060189,"""ileitis""",,,,,C0020877,ICD10/nan,,,,,,,,, -,,,,bio/DOID_4090,An agnosia that is a loss of the ability to map out physical actions in order to repeat them in functional activities.,,,bio/DOID_0060135,"""apraxia""",,,,,,ICD10/nan,,,,,,,,,"""SNOMEDCTUS20190901:68345001""" -,,,,bio/DOID_0060225,A 3MC syndrome that has_material_basis_in autosomal recessive inheritance of homozygous mutation in the collectin subfamily member 11 gene COLEC11 on chromosome 2p25.,,,bio/DOID_0060576,"""3MC syndrome 2""",,,,,,ICD10/nan,,,,,"""OMIM:265050""",,,, -,,bio/D014381,D014381,bio/DOID_399,A tuberculosis located in the heart.,,,bio/DOID_0060570,"""cardiac tuberculosis""",,"""MESH:D014381""",,,,ICD10/nan,,,,,,,,, -,,,,bio/DOID_10286,A prostate carcinoma that is characterized by continued growth and spread despite the surgical removal of the testes or medical intervention to block androgen production.,,,bio/DOID_0080909,"""castration-resistant prostate carcinoma""",,,"""NCI:C130234""",,,ICD10/nan,,,,,,,,, -,,,,bio/DOID_10124,A corneal disease that is characterized by a triangular tissue growth located_in cornea of the eye that is the result of collagen degeneration and fibrovascular proliferation.,,"""surfers eye""",bio/DOID_0002116,"""pterygium""",,,,,C0033999,ICD10/nan,,,,,,,,, -,,,,bio/DOID_4,A disease that involving errors in metabolic processes of building or degradation of molecules.,,"""metabolic disease""",bio/DOID_0014667,"""disease of metabolism""",,,,,C0025517,ICD10/nan,,,,,,,,, -,,,,bio/DOID_4450,A renal cell carcinoma that has_material_basis_in cells that appear very pale or clear when examined under microscope.,,"""Clear cell carcinoma of kidney, clear cell kidney carcinoma, conventional Clear cell renal cell carcinoma, conventional renal cell carcinoma, renal clear cell carcinoma""",bio/DOID_4467,"""clear cell renal cell carcinoma""",,,,,C0279702,ICD10/nan,,,,,,,,, -,,,,bio/DOID_0060524,A crustacean allergy that has_allergic_trigger shrimp.,,,bio/DOID_0040001,"""shrimp allergy""",,,,,,ICD10/nan,,,,,,,,, \ No newline at end of file From caa3e3e665fbe3b26752dc2df11b8921b63c4eb4 Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Wed, 1 Feb 2023 18:16:26 -0600 Subject: [PATCH 26/28] feat: add missing synonyms for disease terms --- .../diseaseOntology/disease_ontology.tmcf | 9 +- .../format_disease_ontology.py | 384 +++++++++--------- 2 files changed, 210 insertions(+), 183 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf index 4aed4f5bdd..91b0a167e4 100644 --- a/scripts/biomedical/diseaseOntology/disease_ontology.tmcf +++ b/scripts/biomedical/diseaseOntology/disease_ontology.tmcf @@ -11,11 +11,16 @@ internationalClassificationOfDiseasesOntologyID: C:DiseaseOntology->ICDO medicalSubjectHeadingDescriptorID: C:DiseaseOntology->meshDescriptor medicalSubjectHeadingConceptID: C:DiseaseOntology->meshConcept nationalCancerInstituteID: C:DiseaseOntology->NCI +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20220901 +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20210901 +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20221231 snowmedCT: C:DiseaseOntology->SNOMEDCTUS20210731 -snowmedCT: C:DiseaseOntology->SNOMEDCTUS20200301 +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20220731 snowmedCT: C:DiseaseOntology->SNOMEDCTUS20200901 -snowmedCT: C:DiseaseOntology->SNOMEDCTUS20220630 +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20220301 +snowmedCT: C:DiseaseOntology->SNOMEDCTUS20200301 unifiedMedicalLanguageSystemConceptUniqueIdentifier: C:DiseaseOntology->UMLSCUI +icd11CMCode: C:DiseaseOntology->ICD11 icd10CMCode: C:DiseaseOntology->ICD10CM icd9CMCode: C:DiseaseOntology->ICD9CM orphaNumber: C:DiseaseOntology->ORDO diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index 6fec0ad8c3..4dc387ff74 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -32,218 +32,240 @@ def format_tag(tag: str) -> str: - """Extract human-readable tag from xml tag - Args: - tag: tag of an element in xml file, - containg human-readable string after '}' - Returns: - tag_readable: human-readble string after '}' - - """ - tag_readable = tag.split("}")[1] - return tag_readable + """Extract human-readable tag from xml tag + Args: + tag: tag of an element in xml file, + containg human-readable string after '}' + Returns: + tag_readable: human-readble string after '}' + + """ + tag_readable = tag.split("}")[1] + return tag_readable def format_attrib(attrib: dict) -> str: - """Extract text from xml attributes dictionary - Args: - attrib: attribute of an xml element - Returns: - text: extracted text from attribute values, - either after '#' or after the final '/' - if '#' does not exist - """ - attrib = list(attrib.values())[0] - text = None - if "#" in attrib: - text = attrib.split("#")[-1] - else: - text = attrib.split("/")[-1] - return text + """Extract text from xml attributes dictionary + Args: + attrib: attribute of an xml element + Returns: + text: extracted text from attribute values, + either after '#' or after the final '/' + if '#' does not exist + """ + attrib = list(attrib.values())[0] + text = None + if "#" in attrib: + text = attrib.split("#")[-1] + else: + text = attrib.split("/")[-1] + return text def parse_do_info(info: list) -> dict: - """Parse owl class childrens - to human-readble dictionary - Args: - info: list of owl class children - Returns: - info_dict: human_readable dictionary - containing information of owl class children - """ - info_dict = defaultdict(list) - for element in info: - tag = format_tag(element.tag) - if element.text == None: - text = format_attrib(element.attrib) - info_dict[tag].append(text) - else: - info_dict[tag].append(element.text) - return info_dict + """Parse owl class childrens + to human-readble dictionary + Args: + info: list of owl class children + Returns: + info_dict: human_readable dictionary + containing information of owl class children + """ + info_dict = defaultdict(list) + for element in info: + tag = format_tag(element.tag) + if element.text == None: + text = format_attrib(element.attrib) + info_dict[tag].append(text) + else: + info_dict[tag].append(element.text) + return info_dict def format_cols(df): - """ - Converts all columns to string type and - replaces all special characters - Args: - df = dataframe to change - Returns: - none - """ - for i, col in enumerate(df.columns): - df[col] = df[col].astype(str) - df[col] = df[col].map(lambda x: re.sub(r'[\([{})\]]', '', x)) - df.iloc[:, i] = df.iloc[:, i].str.replace("'", '') - df[col] = df[col].replace('nan', np.nan) - df['id'] = df['id'].str.replace(':', '_') - df['hasAlternativeId'] = df['hasAlternativeId'].str.replace(':', '_') - + """ + Converts all columns to string type and + replaces all special characters + Args: + df = dataframe to change + Returns: + none + """ + for i, col in enumerate(df.columns): + df[col] = df[col].astype(str) + df[col] = df[col].map(lambda x: re.sub(r'[\([{})\]]', '', x)) + df.iloc[:, i] = df.iloc[:, i].str.replace("'", '') + df[col] = df[col].replace('nan', np.nan) + df['id'] = df['id'].str.replace(':', '_') + df['hasAlternativeId'] = df['hasAlternativeId'].str.replace(':', '_') + +def replace_nan_func(x): + """ + Combines all NaN rows with same ID + Args: + df = dataframe to change + Returns: + none + """ + x = x[~pd.isna(x)] + if len(x) > 0: + return x.iloc[0] + else: + return np.NaN def col_explode(df): - """ - Splits the hasDbXref column into multiple columns - based on the prefix identifying the database from which - the ID originates - Args: - df = dataframe to change - Returns - df = modified dataframe - """ - df = df.assign(hasDbXref=df.hasDbXref.str.split(",")).explode('hasDbXref') - df[['A', 'B']] = df['hasDbXref'].str.split(':', 1, expand=True) - df['A'] = df['A'].astype(str).map(lambda x: re.sub('[^A-Za-z0-9]+', '', x)) - col_add = list(df['A'].unique()) - for newcol in col_add: - df[newcol] = np.nan - df[newcol] = np.where(df['A'] == newcol, df['B'], np.nan) - df[newcol] = df[newcol].astype(str).replace("nan", np.nan) - df['hasAlternativeId'] = df['hasAlternativeId'].str.split(',') - df = df.explode('hasAlternativeId') - - return df + """ + Splits the hasDbXref column into multiple columns + based on the prefix identifying the database from which + the ID originates + Args: + df = dataframe to change + Returns + df = modified dataframe + """ + df = df.assign(hasDbXref=df.hasDbXref.str.split(",")).explode('hasDbXref') + df[['A', 'B']] = df['hasDbXref'].str.split(':', 1, expand=True) + df['A'] = df['A'].astype(str).map(lambda x: re.sub('[^A-Za-z0-9]+', '', x)) + col_add = list(df['A'].unique()) + for newcol in col_add: + df[newcol] = np.nan + df[newcol] = np.where(df['A'] == newcol, df['B'], np.nan) + df[newcol] = df[newcol].astype(str).replace("nan", np.nan) + df['hasAlternativeId'] = df['hasAlternativeId'].str.split(',') + df = df.explode('hasAlternativeId') + df1 = df.groupby(by='id').agg(dict.fromkeys(df.columns[0:], replace_nan_func)) + return df1 def shard(list_to_shard, shard_size): - """ - Breaks down a list into smaller - sublists, converts it into an array - and appends the array to the master - list - Args: - list_to_shard = original list - shard_size = size of subist - Returns: - sharded_list = master list with - smaller sublists - """ - sharded_list = [] - for i in range(0, len(list_to_shard), shard_size): - shard = list_to_shard[i:i + shard_size] - arr = np.array(shard) - sharded_list.append(arr) - return sharded_list + """ + Breaks down a list into smaller + sublists, converts it into an array + and appends the array to the master + list + Args: + list_to_shard = original list + shard_size = size of subist + Returns: + sharded_list = master list with + smaller sublists + """ + sharded_list = [] + for i in range(0, len(list_to_shard), shard_size): + shard = list_to_shard[i:i + shard_size] + arr = np.array(shard) + sharded_list.append(arr) + return sharded_list def mesh_separator(df): - """ - Splits the mesh column into mesh descriptor and concept ID - Args: - df: pandas dataframe with mesh column - - Returns: - df: dataframe with split mesh column - """ - - df['meshDescriptor'] = np.where(df['MESH'].str[0] == 'D', df['MESH'], np.nan) - df['meshDescriptor'] = "bio/" + df['meshDescriptor'] - df['meshConcept'] = np.where(df['MESH'].str[0] == 'C', df['MESH'], np.nan) - df['meshConcept'] = "bio/" + df['meshConcept'] - df = df.drop(['MESH'], axis = 1) - return df + """ + Splits the mesh column into mesh descriptor and concept ID + Args: + df: pandas dataframe with mesh column + + Returns: + df: dataframe with split mesh column + """ + + df['meshDescriptor'] = np.where(df['MESH'].str[0] == 'D', df['MESH'], np.nan) + df['meshDescriptor'] = "bio/" + df['meshDescriptor'] + df['meshConcept'] = np.where(df['MESH'].str[0] == 'C', df['MESH'], np.nan) + df['meshConcept'] = "bio/" + df['meshConcept'] + df = df.drop(['MESH'], axis = 1) + return df def col_string(df): - """ - Adds string quotes to columns in a dataframe - Args: - df = dataframe whose columns are modified - Returns: - None - """ - col_names = ['hasExactSynonym', 'label', 'IAO_0000115'] - for col in col_names: - df[col] = df[col].str.replace('"', "") - df.update('"' + df[[col]].astype(str) + '"') - df[col] = df[col].replace(["\"nan\""],np.nan) - return df + """ + Adds string quotes to columns in a dataframe + Args: + df = dataframe whose columns are modified + Returns: + None + """ + col_names = ['hasExactSynonym', 'label', 'IAO_0000115'] + for col in col_names: + df[col] = df[col].str.replace('"', "") + df.update('"' + df[[col]].astype(str) + '"') + df[col] = df[col].replace(["\"nan\""],np.nan) + return df def remove_newline(df): - df.loc[2505, 'IAO_0000115'] = df.loc[2505, 'IAO_0000115'].replace("\\n", "") - df.loc[2860, 'IAO_0000115'] = df.loc[2860, 'IAO_0000115'].replace("\\n", "") - df.loc[2895, 'IAO_0000115'] = df.loc[2895, 'IAO_0000115'].replace("\\n", "") - df.loc[2934, 'IAO_0000115'] = df.loc[2934, 'IAO_0000115'].replace("\\n", "") - df.loc[3036, 'IAO_0000115'] = df.loc[3036, 'IAO_0000115'].replace("\\n", "") - df.loc[11304, 'IAO_0000115'] = df.loc[11304, 'IAO_0000115'].replace("\\n", "") - return df + df.loc[1735, 'IAO_0000115'] = df.loc[1735, 'IAO_0000115'].replace("\\n", "") + df.loc[2513, 'IAO_0000115'] = df.loc[2513, 'IAO_0000115'].replace("\\n", "") + df.loc[2869, 'IAO_0000115'] = df.loc[2869, 'IAO_0000115'].replace("\\n", "") + df.loc[2904, 'IAO_0000115'] = df.loc[2904, 'IAO_0000115'].replace("\\n", "") + df.loc[2943, 'IAO_0000115'] = df.loc[2943, 'IAO_0000115'].replace("\\n", "") + df.loc[3045, 'IAO_0000115'] = df.loc[3045, 'IAO_0000115'].replace("\\n", "") + df.loc[11689, 'IAO_0000115'] = df.loc[11689, 'IAO_0000115'].replace("\\n", "") + return df def create_dcid(df): - df['diseaseId'] = df['id'] - df['diseaseId'] = df['diseaseId'].str.replace("_", ":") - df['hasAlternativeId'] = df['hasAlternativeId'].str.strip() - col_names = ['id', 'subClassOf', 'hasAlternativeId'] - for col in col_names: - df[col] = "bio/" + df[col] - df[col] = df[col].replace(["bio/nan"],np.nan) - df['ICD10CM'] = "ICD10/" + df['ICD10CM'].astype(str) - df['ICD10CM'] = df['ICD10CM'].replace("ICD10/nan", np.nan) - df.update('"' + df[['diseaseId']].astype(str) + '"') - return df + df['diseaseId'] = df['id'] + df['diseaseId'] = df['diseaseId'].str.replace("_", ":") + df['hasAlternativeId'] = df['hasAlternativeId'].str.strip() + col_names = ['id', 'subClassOf', 'hasAlternativeId'] + for col in col_names: + df[col] = "bio/" + df[col] + df[col] = df[col].replace(["bio/nan"],np.nan) + df['ICD10CM'] = "ICD10/" + df['ICD10CM'].astype(str) + df['ICD10CM'] = df['ICD10CM'].replace("ICD10/nan", np.nan) + df.update('"' + df[['diseaseId']].astype(str) + '"') + return df def wrapper_fun(file_input): - # Read disease ontology .owl file - tree = ElementTree.parse(file_input) - # Get file root - root = tree.getroot() - # Find owl classes elements - all_classes = root.findall('{http://www.w3.org/2002/07/owl#}Class') - # Parse owl classes to human-readble dictionary format - parsed_owl_classes = [] - for owl_class in all_classes: - info = list(owl_class.iter()) - parsed_owl_classes.append(parse_do_info(info)) - # Convert to pandas Dataframe - df_do = pd.DataFrame(parsed_owl_classes) - format_cols(df_do) - df_do = df_do.drop([ - 'Class', 'exactMatch', 'deprecated', 'hasRelatedSynonym', 'comment', - 'OBI_9991118', 'narrowMatch', 'hasBroadSynonym', 'disjointWith', - 'hasNarrowSynonym', 'broadMatch', 'created_by', 'creation_date', - 'inSubset', 'hasOBONamespace' - ], - axis=1) - df_do = col_explode(df_do) - df_do = mesh_separator(df_do) - df_do = col_string(df_do) - df_do = df_do.drop(['A', 'B', 'nan', 'hasDbXref', 'KEGG'], axis=1) - df_do = df_do.drop_duplicates(subset='id', keep="last") - df_do = df_do.reset_index(drop=True) - df_do = df_do.replace('"nan"', np.nan) - df_do = create_dcid(df_do) - df_do = remove_newline(df_do) - df_do['IAO_0000115'] = df_do['IAO_0000115'].str.replace("_", " ") - return df_do + # Read disease ontology .owl file + tree = ElementTree.parse(file_input) + # Get file root + root = tree.getroot() + # Find owl classes elements + all_classes = root.findall('{http://www.w3.org/2002/07/owl#}Class') + # Parse owl classes to human-readble dictionary format + parsed_owl_classes = [] + for owl_class in all_classes: + info = list(owl_class.iter()) + parsed_owl_classes.append(parse_do_info(info)) + # Convert to pandas Dataframe + df_do = pd.DataFrame(parsed_owl_classes) + format_cols(df_do) + df_do = df_do.drop([ + 'Class', 'exactMatch', 'deprecated', 'hasRelatedSynonym', 'comment', + 'OBI_9991118', 'narrowMatch', 'hasBroadSynonym', 'disjointWith', + 'hasNarrowSynonym', 'broadMatch', 'created_by', 'creation_date', + 'inSubset', 'hasOBONamespace' + ], + axis=1) + df_do = col_explode(df_do) + df_do = mesh_separator(df_do) + df_do = col_string(df_do) + df_do = df_do.drop(['A', 'B', 'nan', 'hasDbXref', 'KEGG'], axis=1) + df_do = df_do.drop_duplicates(subset='id', keep="last") + df_do = df_do.reset_index(drop=True) + df_do = df_do.replace('"nan"', np.nan) + df_do = create_dcid(df_do) + df_do = remove_newline(df_do) + df_do['IAO_0000115'] = df_do['IAO_0000115'].str.replace("_", " ") + return df_do def main(): - file_input = sys.argv[1] - file_output = sys.argv[2] - df = wrapper_fun(file_input) - df.columns = ['diseaseDescription' if x=='IAO_0000115' else x for x in df.columns] - df.to_csv(file_output, doublequote=False, escapechar='\\') + file_input = sys.argv[1] + file_output = sys.argv[2] + df = wrapper_fun(file_input) + df.columns = ['diseaseDescription' if x=='IAO_0000115' else x for x in df.columns] + df.to_csv(file_output, doublequote=False, escapechar='\\') if __name__ == '__main__': - main() + main() + + + + + + + + From 4d0c493f5fbfac5a7074f580a58ea9baf785386a Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Mon, 31 Jul 2023 20:07:53 -0500 Subject: [PATCH 27/28] feat:update format_disease_ontology.py --- scripts/biomedical/diseaseOntology/format_disease_ontology.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index 4dc387ff74..b4bbce3a0c 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -208,8 +208,8 @@ def create_dcid(df): for col in col_names: df[col] = "bio/" + df[col] df[col] = df[col].replace(["bio/nan"],np.nan) - df['ICD10CM'] = "ICD10/" + df['ICD10CM'].astype(str) - df['ICD10CM'] = df['ICD10CM'].replace("ICD10/nan", np.nan) + df['ICD10CM'] = "dcid:ICD10/" + df['ICD10CM'].astype(str) + df['ICD10CM'] = df['ICD10CM'].replace("dcid:ICD10/nan", np.nan) df.update('"' + df[['diseaseId']].astype(str) + '"') return df From 5f0c1a181e5038d069250b9aeda2d6d2025ec5c8 Mon Sep 17 00:00:00 2001 From: Suhana Bedi Date: Mon, 14 Aug 2023 14:22:13 -0500 Subject: [PATCH 28/28] feat: add illegal char check --- .../format_disease_ontology.py | 56 +++++++------------ 1 file changed, 21 insertions(+), 35 deletions(-) diff --git a/scripts/biomedical/diseaseOntology/format_disease_ontology.py b/scripts/biomedical/diseaseOntology/format_disease_ontology.py index b4bbce3a0c..57bec1fc0f 100644 --- a/scripts/biomedical/diseaseOntology/format_disease_ontology.py +++ b/scripts/biomedical/diseaseOntology/format_disease_ontology.py @@ -97,7 +97,7 @@ def format_cols(df): df.iloc[:, i] = df.iloc[:, i].str.replace("'", '') df[col] = df[col].replace('nan', np.nan) df['id'] = df['id'].str.replace(':', '_') - df['hasAlternativeId'] = df['hasAlternativeId'].str.replace(':', '_') + #df['hasAlternativeId'] = df['hasAlternativeId'].str.replace(':', '_') def replace_nan_func(x): """ @@ -125,17 +125,17 @@ def col_explode(df): df = modified dataframe """ df = df.assign(hasDbXref=df.hasDbXref.str.split(",")).explode('hasDbXref') - df[['A', 'B']] = df['hasDbXref'].str.split(':', 1, expand=True) + df[['A', 'B']] = df['hasDbXref'].str.split(':', n=1, expand=True) df['A'] = df['A'].astype(str).map(lambda x: re.sub('[^A-Za-z0-9]+', '', x)) col_add = list(df['A'].unique()) for newcol in col_add: df[newcol] = np.nan df[newcol] = np.where(df['A'] == newcol, df['B'], np.nan) df[newcol] = df[newcol].astype(str).replace("nan", np.nan) - df['hasAlternativeId'] = df['hasAlternativeId'].str.split(',') - df = df.explode('hasAlternativeId') - df1 = df.groupby(by='id').agg(dict.fromkeys(df.columns[0:], replace_nan_func)) - return df1 + #df['hasAlternativeId'] = df['hasAlternativeId'].str.split(',') + #df = df.explode('hasAlternativeId') + #df1 = df.groupby(by='id').agg(dict.fromkeys(df.columns[0:], replace_nan_func)) + return df def shard(list_to_shard, shard_size): @@ -190,21 +190,12 @@ def col_string(df): df[col] = df[col].replace(["\"nan\""],np.nan) return df -def remove_newline(df): - df.loc[1735, 'IAO_0000115'] = df.loc[1735, 'IAO_0000115'].replace("\\n", "") - df.loc[2513, 'IAO_0000115'] = df.loc[2513, 'IAO_0000115'].replace("\\n", "") - df.loc[2869, 'IAO_0000115'] = df.loc[2869, 'IAO_0000115'].replace("\\n", "") - df.loc[2904, 'IAO_0000115'] = df.loc[2904, 'IAO_0000115'].replace("\\n", "") - df.loc[2943, 'IAO_0000115'] = df.loc[2943, 'IAO_0000115'].replace("\\n", "") - df.loc[3045, 'IAO_0000115'] = df.loc[3045, 'IAO_0000115'].replace("\\n", "") - df.loc[11689, 'IAO_0000115'] = df.loc[11689, 'IAO_0000115'].replace("\\n", "") - return df - def create_dcid(df): df['diseaseId'] = df['id'] df['diseaseId'] = df['diseaseId'].str.replace("_", ":") - df['hasAlternativeId'] = df['hasAlternativeId'].str.strip() - col_names = ['id', 'subClassOf', 'hasAlternativeId'] + df['subClassOf'] = df['subClassOf'].str.split(',') + df['subClassOf'] = df['subClassOf'].str[0] + col_names = ['id', 'subClassOf'] for col in col_names: df[col] = "bio/" + df[col] df[col] = df[col].replace(["bio/nan"],np.nan) @@ -213,6 +204,16 @@ def create_dcid(df): df.update('"' + df[['diseaseId']].astype(str) + '"') return df +def check_for_illegal_charc(s): + list_illegal = ["'", "#", "–", "*" ">", "<", "@", "]", "[", "|", ":", ";", " "] + if any([x in s for x in list_illegal]): + print('Error! dcid contains illegal characters!', s) + +def check_for_dcid(row): + check_for_illegal_charc(str(row['id'])) + check_for_illegal_charc(str(row['subClassOf'])) + return row + def wrapper_fun(file_input): # Read disease ontology .owl file @@ -244,8 +245,8 @@ def wrapper_fun(file_input): df_do = df_do.reset_index(drop=True) df_do = df_do.replace('"nan"', np.nan) df_do = create_dcid(df_do) - df_do = remove_newline(df_do) df_do['IAO_0000115'] = df_do['IAO_0000115'].str.replace("_", " ") + df_do = df_do.apply(lambda x: check_for_dcid(x),axis=1) return df_do def main(): @@ -257,19 +258,4 @@ def main(): if __name__ == '__main__': - main() - - - - - - - - - - - - - - - + main() \ No newline at end of file