-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Add: makefile: Missing make goals for dependencies for slurp goal. - Add: Python: CLI - Update: makefile: Slurp goal: (i) named keys/vals for all params, (ii) standardization in file/path params. - Update: Python: Completed script, inspired by initial pseudo code. (WIP)
- Loading branch information
Showing
2 changed files
with
93 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,54 +1,93 @@ | ||
"""Migration pipeline | ||
"""Slurp migration pipeline | ||
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING | ||
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING | ||
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING | ||
#### THIS IS PSEUDO CODE NOT PYTHON OR ANYTHING | ||
TODOs: | ||
TODO's: | ||
- add CLI: look to makefile for what to include | ||
""" | ||
import oakliblib | ||
import pandas | ||
import os | ||
from argparse import ArgumentParser | ||
from typing import Dict, List | ||
|
||
import oaklib | ||
import pandas as pd | ||
|
||
#Inputs: | ||
source_ontology = '' #e.g. omim | ||
sssom_map = '' # e.g. mondo.sssom.tsv | ||
min_id = '' | ||
termlist_mondo = '' | ||
|
||
# TODO: implement this func: | ||
# todo: IDs should be int or str? probably str | ||
def determine_next_available_mondo_id(min_id: str, mondo_termlist_df: pd.DataFrame) -> str: | ||
"""Starting from `min_id`, count up and check until finding the next ID.""" | ||
next_id = str(0) | ||
return next_id | ||
|
||
def run(source_ontology = '', sssom_map = '', min_id = '', termlist_mondo = ''): | ||
"""source_ontology = '' #e.g. omim | ||
sssom_map = '' # e.g. mondo.sssom.tsv | ||
min_id = '' | ||
termlist_mondo = ''""" | ||
#Outputs: | ||
data = [] | ||
|
||
for t in source_ontology: | ||
if t not in sssom_map['object_id']: | ||
parents = [] | ||
def run(ontology_path: str, sssom_map_path: str, min_id: str, mondo_terms_path: str, outpath: str) -> pd.DataFrame: | ||
"""Run slurp pipeline for given ontology""" | ||
# TODO: read this with OAK | ||
source_ontology = ontology_path | ||
sssom_df = pd.read_csv(sssom_map_path, comment='#', sep='\t') | ||
# TODO: Need to get the mondo terms, but ran out of memory on my alternate PC. get from other PC. | ||
print(f'exists: {mondo_terms_path}: ', os.path.exists(os.path.join(os.getcwd(), mondo_terms_path))) | ||
# mondo_termlist_df = pd.read_csv(mondo_terms_path, comment='#', sep='\t') | ||
mondo_termlist_df = pd.DataFrame() | ||
|
||
source_onto_terms: List[str] = [] | ||
sssom_object_ids = set(sssom_df['object_id']) | ||
|
||
data = [] | ||
for t in source_onto_terms: | ||
if t not in sssom_object_ids: | ||
migrate = True | ||
for p in oaklib.get_direct_parents(t): | ||
if p not in sssom_map['object_id']: | ||
# todo: Find the correct way of doing this: | ||
parents: List[str] = oaklib.get_direct_parents(t) | ||
for parent in parents: | ||
if parent not in sssom_object_ids: | ||
migrate = False | ||
break | ||
elif sssom_map[sssom_map['object_id']==p]['predicate_id'] = 'skos:exactMatch' \ | ||
or sssom_map[sssom_map['object_id']==p]['predicate_id'] = 'skos:narrowMatch': | ||
# In other words, if the parent is mapped, and the mapping is either exact or narrower | ||
parents.append(sssom_map[sssom_map['object_id']==p]['subject_id']) | ||
else: | ||
# It's fine, just continue looking for other parents in this case | ||
obj_data = sssom_df[sssom_df['object_id'] == parent] | ||
pred = str(obj_data['predicate_id']) | ||
if pred in ['skos:exactMatch', 'skos:narrowMatch']: | ||
# In other words, if the parent is mapped, and the mapping is either exact or narrower | ||
parents.append(obj_data['subject_id']) | ||
else: | ||
pass # It's fine, just continue looking for other parents in this case | ||
if migrate and parents: | ||
next_mondo_id = determine_next_available_mondo_id(min_id, termlist_mondo) # starting from min_id, then counting up and checking if it does not already exist. | ||
# TODO: implement this func: | ||
next_mondo_id = determine_next_available_mondo_id(min_id, mondo_termlist_df) # starting from min_id, then counting up and checking if it does not already exist. | ||
# todo: Find the correct way of doing this: | ||
label = oaklib.get_label(t) | ||
# todo: Find the correct way of doing this: | ||
definition = oaklib.get_definition(t) | ||
data.append({'mondo_id':next_mondo_id, 'xref': t, 'label': label, 'definition': definition}) | ||
data.append({'mondo_id': next_mondo_id, 'xref': t, 'label': label, 'definition': definition}) | ||
|
||
result = pd.DataFrame(data) | ||
result.to_csv(outpath, sep="\t") | ||
return result | ||
|
||
|
||
pandas.DataFrame(data).to_csv(fn, sep="\t") | ||
def cli(): | ||
"""Command line interface.""" | ||
package_description = \ | ||
'Slurp pipeline: Integrate new terms from other ontologies into Mondo.' | ||
parser = ArgumentParser(description=package_description) | ||
parser.add_argument( | ||
'-o', '--ontology-path', required=True, | ||
help='xxxxxx') | ||
parser.add_argument( | ||
'-m', '--sssom-map-path', required=True, | ||
help='xxxxxx') | ||
parser.add_argument( | ||
'-i', '--min-id', required=True, | ||
help='xxxxxx') | ||
parser.add_argument( | ||
'-t', '--mondo-terms-path', required=True, | ||
help='xxxxxx') | ||
parser.add_argument( | ||
'-O', '--outpath', required=True, | ||
help='xxxxxx') | ||
d: Dict = vars(parser.parse_args()) | ||
# todo: Convert paths to absolute paths, as I've done before? Or expect always be run from src/ontology and ok? | ||
run(**d) | ||
|
||
|
||
if __name__ == '__main__': | ||
run() | ||
cli() |