From 4845aa0ada9179470c47b986bbae32807a6e3710 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Tue, 1 Oct 2024 14:50:05 -0700 Subject: [PATCH 01/24] Reformat via linter --- metaMS/cli.py | 117 ++++++++++++++--------- metaMS/gcmsWorkflow.py | 211 ++++++++++++++++++++++++++--------------- 2 files changed, 207 insertions(+), 121 deletions(-) diff --git a/metaMS/cli.py b/metaMS/cli.py index 3107c92..4e991b0 100644 --- a/metaMS/cli.py +++ b/metaMS/cli.py @@ -3,73 +3,98 @@ import click import toml - from corems.encapsulation.output.parameter_to_json import dump_gcms_settings_toml -from metaMS.gcmsWorkflow import WorkflowParameters, run_gcms_metabolomics_workflow, run_gcms_metabolomics_workflow_wdl, run_nmdc_metabolomics_workflow +from metaMS.gcmsWorkflow import ( + WorkflowParameters, + run_gcms_metabolomics_workflow, + run_gcms_metabolomics_workflow_wdl, + run_nmdc_metabolomics_workflow, +) @click.group() def cli(): - #saving for toplevel options + # saving for toplevel options pass + @cli.command() -@click.argument('file_paths', required=True, type=str) -@click.argument('calibration_file_path', required=True, type=str) -@click.argument('output_directory', required=True, type=str) -@click.argument('output_filename', required=True, type=str) -@click.argument('output_type', required=True, type=str) -@click.argument('corems_toml_path', required=True, type=str) -@click.argument('nmdc_metadata_path', required=True, type=str) -@click.option('--jobs','-j', default=4, help="'cpu's'") -def run_gcms_wdl_workflow(file_paths, calibration_file_path, output_directory,output_filename, output_type, corems_toml_path, nmdc_metadata_path, jobs): - '''Run the GCMS workflow\n - gcms_workflow_paramaters_toml_file = toml file with workflow parameters\n - output_types = csv, excel, pandas, json set on the parameter file\n - corems_toml_path = toml file with corems parameters\n - --jobs = number of processes to run in parallel\n - ''' - click.echo('Running gcms workflow') - run_gcms_metabolomics_workflow_wdl(file_paths, calibration_file_path, output_directory,output_filename, output_type, corems_toml_path, jobs) +@click.argument("file_paths", required=True, type=str) +@click.argument("calibration_file_path", required=True, type=str) +@click.argument("output_directory", required=True, type=str) +@click.argument("output_filename", required=True, type=str) +@click.argument("output_type", required=True, type=str) +@click.argument("corems_toml_path", required=True, type=str) +@click.argument("nmdc_metadata_path", required=True, type=str) +@click.option("--jobs", "-j", default=4, help="'cpu's'") +def run_gcms_wdl_workflow( + file_paths, + calibration_file_path, + output_directory, + output_filename, + output_type, + corems_toml_path, + nmdc_metadata_path, + jobs, +): + """Run the GCMS workflow\n + gcms_workflow_paramaters_toml_file = toml file with workflow parameters\n + output_types = csv, excel, pandas, json set on the parameter file\n + corems_toml_path = toml file with corems parameters\n + --jobs = number of processes to run in parallel\n + """ + click.echo("Running gcms workflow") + run_gcms_metabolomics_workflow_wdl( + file_paths, + calibration_file_path, + output_directory, + output_filename, + output_type, + corems_toml_path, + jobs, + ) + @cli.command() -@click.argument('gcms_workflow_paramaters_file', required=True, type=str) -@click.option('--jobs','-j', default=4, help="'cpu's'") -@click.option('--nmdc', '-n', is_flag=True, help="Creates NMDC metadata mapping and save each result individually") +@click.argument("gcms_workflow_paramaters_file", required=True, type=str) +@click.option("--jobs", "-j", default=4, help="'cpu's'") +@click.option( + "--nmdc", + "-n", + is_flag=True, + help="Creates NMDC metadata mapping and save each result individually", +) def run_gcms_workflow(gcms_workflow_paramaters_file, jobs, nmdc): - '''Run the GCMS workflow\n - gcms_workflow_paramaters_toml_file = toml file with workflow parameters\n - output_types = csv, excel, pandas, toml set on the parameter file\n - corems_toml_path = toml file with corems parameters\n - --jobs = number of processes to run in parallel\n - ''' - click.echo('Running gcms workflow') + """Run the GCMS workflow\n + gcms_workflow_paramaters_toml_file = toml file with workflow parameters\n + output_types = csv, excel, pandas, toml set on the parameter file\n + corems_toml_path = toml file with corems parameters\n + --jobs = number of processes to run in parallel\n + """ + click.echo("Running gcms workflow") if nmdc: run_nmdc_metabolomics_workflow(gcms_workflow_paramaters_file, jobs) else: run_gcms_metabolomics_workflow(gcms_workflow_paramaters_file, jobs) + @cli.command() -@click.argument('toml_file_name', required=True, type=str) +@click.argument("toml_file_name", required=True, type=str) def dump_toml_template(toml_file_name): - '''Dumps a toml file template - to be used as the workflow parameters input - ''' - ref_lib_path = Path(toml_file_name).with_suffix('.toml') - with open(ref_lib_path, 'w') as workflow_param: - + """Dumps a toml file template + to be used as the workflow parameters input + """ + ref_lib_path = Path(toml_file_name).with_suffix(".toml") + with open(ref_lib_path, "w") as workflow_param: toml.dump(WorkflowParameters().__dict__, workflow_param) + @cli.command() -@click.argument('toml_file_name', required=True, type=str) +@click.argument("toml_file_name", required=True, type=str) def dump_corems_toml_template(toml_file_name): - '''Dumps a CoreMS toml file template - to be used as the workflow parameters input - ''' - path_obj = Path(toml_file_name).with_suffix('.toml') + """Dumps a CoreMS toml file template + to be used as the workflow parameters input + """ + path_obj = Path(toml_file_name).with_suffix(".toml") dump_gcms_settings_toml(file_path=path_obj) - - - - \ No newline at end of file diff --git a/metaMS/gcmsWorkflow.py b/metaMS/gcmsWorkflow.py index fbc6aab..9fc9725 100644 --- a/metaMS/gcmsWorkflow.py +++ b/metaMS/gcmsWorkflow.py @@ -11,27 +11,37 @@ import cProfile + @dataclass class WorkflowParameters: - - file_paths: tuple = ('data/...', 'data/...') - #RI FAMES Calibration File - calibration_file_path: str = 'data/...' - #Sample/Process Metadata - nmdc_metadata_path: str = 'configuration/nmdc_metadata.json' - #configuration file for corems - corems_toml_path: str = 'configuration/corems.toml' - output_directory: str = 'data/...' - output_filename: str = 'data/...' - output_type: str = 'csv' - -def worker(args): + file_paths: tuple = ("data/...", "data/...") + # RI FAMES Calibration File + calibration_file_path: str = "data/..." + # Sample/Process Metadata + nmdc_metadata_path: str = "configuration/nmdc_metadata.json" + # configuration file for corems + corems_toml_path: str = "configuration/corems.toml" + output_directory: str = "data/..." + output_filename: str = "data/..." + output_type: str = "csv" - cProfile.runctx('workflow_worker(args)', globals(), locals(), 'gc-ms.prof') -def run_gcms_metabolomics_workflow_wdl(file_paths, calibration_file_path, output_directory,output_filename, output_type, corems_toml_path, jobs, db_path=None): - +def worker(args): + cProfile.runctx("workflow_worker(args)", globals(), locals(), "gc-ms.prof") + + +def run_gcms_metabolomics_workflow_wdl( + file_paths, + calibration_file_path, + output_directory, + output_filename, + output_type, + corems_toml_path, + jobs, + db_path=None, +): import click + workflow_params = WorkflowParameters() workflow_params.file_paths = file_paths.split(",") workflow_params.calibration_file_path = calibration_file_path @@ -39,98 +49,138 @@ def run_gcms_metabolomics_workflow_wdl(file_paths, calibration_file_path, output workflow_params.output_filename = output_filename workflow_params.output_type = output_type workflow_params.corems_toml_path = corems_toml_path - + dirloc = Path(workflow_params.output_directory) dirloc.mkdir(exist_ok=True) - output_path = Path(workflow_params.output_directory)/workflow_params.output_filename - - rt_ri_pairs = get_calibration_rtri_pairs(workflow_params.calibration_file_path, workflow_params.corems_toml_path) - - worker_args = [(file_path, rt_ri_pairs, workflow_params.corems_toml_path, workflow_params.calibration_file_path ) for file_path in workflow_params.file_paths] - #gcms_list = pool.map(workflow_worker, worker_args) + output_path = ( + Path(workflow_params.output_directory) / workflow_params.output_filename + ) + + rt_ri_pairs = get_calibration_rtri_pairs( + workflow_params.calibration_file_path, workflow_params.corems_toml_path + ) + + worker_args = [ + ( + file_path, + rt_ri_pairs, + workflow_params.corems_toml_path, + workflow_params.calibration_file_path, + ) + for file_path in workflow_params.file_paths + ] + # gcms_list = pool.map(workflow_worker, worker_args) pool = Pool(int(jobs)) - + for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): - eval('gcms.to_'+ workflow_params.output_type + '(output_path)') + eval("gcms.to_" + workflow_params.output_type + "(output_path)") pool.close() pool.join() + def run_nmdc_metabolomics_workflow(workflow_params_file, jobs): - import click - dms_file_path = 'db/GC-MS Metabolomics Experiments to Process Final.xlsx' - - click.echo('Loading Searching Settings from %s' % workflow_params_file) + + dms_file_path = "db/GC-MS Metabolomics Experiments to Process Final.xlsx" + + click.echo("Loading Searching Settings from %s" % workflow_params_file) workflow_params = read_workflow_parameter(workflow_params_file) - + dirloc = Path(workflow_params.output_directory) dirloc.mkdir(exist_ok=True) - - rt_ri_pairs = get_calibration_rtri_pairs(workflow_params.calibration_file_path, workflow_params.corems_toml_path) - worker_args = [(file_path, rt_ri_pairs, workflow_params.corems_toml_path, workflow_params.calibration_file_path) for file_path in workflow_params.file_paths] - #gcms_list = pool.map(workflow_worker, worker_args) + rt_ri_pairs = get_calibration_rtri_pairs( + workflow_params.calibration_file_path, workflow_params.corems_toml_path + ) + + worker_args = [ + ( + file_path, + rt_ri_pairs, + workflow_params.corems_toml_path, + workflow_params.calibration_file_path, + ) + for file_path in workflow_params.file_paths + ] + # gcms_list = pool.map(workflow_worker, worker_args) pool = Pool(jobs) - + for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): - in_file_path = Path(workflow_params.file_paths[i]) - output_path = Path(workflow_params.output_directory)/in_file_path.name + output_path = Path(workflow_params.output_directory) / in_file_path.name + + eval( + "gcms.to_" + + workflow_params.output_type + + "(output_path, write_metadata=False)" + ) - eval('gcms.to_'+ workflow_params.output_type + '(output_path, write_metadata=False)') - - #nmdc = NMDC_Metadata(in_file_path, workflow_params.calibration_file_path, output_path, dms_file_path) - #nmdc.create_nmdc_metadata(gcms) + # nmdc = NMDC_Metadata(in_file_path, workflow_params.calibration_file_path, output_path, dms_file_path) + # nmdc.create_nmdc_metadata(gcms) pool.close() pool.join() - + def run_gcms_metabolomics_workflow(workflow_params_file, jobs): import click - click.echo('Loading Searching Settings from %s' % workflow_params_file) + + click.echo("Loading Searching Settings from %s" % workflow_params_file) workflow_params = read_workflow_parameter(workflow_params_file) dirloc = Path(workflow_params.output_directory) dirloc.mkdir(exist_ok=True) - output_path = Path(workflow_params.output_directory)/workflow_params.output_filename - - rt_ri_pairs = get_calibration_rtri_pairs(workflow_params.calibration_file_path, workflow_params.corems_toml_path) - - worker_args = [(file_path, rt_ri_pairs, workflow_params.corems_toml_path, workflow_params.calibration_file_path) for file_path in workflow_params.file_paths] - #gcms_list = pool.map(workflow_worker, worker_args) + output_path = ( + Path(workflow_params.output_directory) / workflow_params.output_filename + ) + + rt_ri_pairs = get_calibration_rtri_pairs( + workflow_params.calibration_file_path, workflow_params.corems_toml_path + ) + + worker_args = [ + ( + file_path, + rt_ri_pairs, + workflow_params.corems_toml_path, + workflow_params.calibration_file_path, + ) + for file_path in workflow_params.file_paths + ] + # gcms_list = pool.map(workflow_worker, worker_args) pool = Pool(jobs) - + for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): - eval('gcms.to_'+ workflow_params.output_type + '(output_path)') + eval("gcms.to_" + workflow_params.output_type + "(output_path)") pool.close() pool.join() - + + def read_workflow_parameter(gcms_workflow_paramaters_toml_file): - with open(gcms_workflow_paramaters_toml_file, 'r') as infile: - return WorkflowParameters(**toml.load(infile)) + with open(gcms_workflow_paramaters_toml_file, "r") as infile: + return WorkflowParameters(**toml.load(infile)) + def get_calibration_rtri_pairs(ref_file_path, corems_paramaters_toml_file): - gcms_ref_obj = get_gcms(ref_file_path, corems_paramaters_toml_file) - #sql_obj = start_sql_from_file() - #rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj,sql_obj=sql_obj) + # sql_obj = start_sql_from_file() + # rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj,sql_obj=sql_obj) # !!!!!! READ !!!!! use the previous two lines if db/EMSL_lowres_gcms_test_database.sqlite does not exist # and comment the next line rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj) return rt_ri_pairs + def workflow_worker(args): - file_path, ref_dict, corems_params, cal_file_path = args - + gcms = get_gcms(file_path, corems_params) - + gcms.calibrate_ri(ref_dict, cal_file_path) - + # sql_obj = start_sql_from_file() # lowResSearch = LowResMassSpectralMatch(gcms, sql_obj=sql_obj) # !!!!!! READ !!!!! use the previous two lines if db/pnnl_lowres_gcms_compounds.sqlite does not exist @@ -140,23 +190,24 @@ def workflow_worker(args): return gcms + def get_gcms(file_path, corems_params): - reader_gcms = ReadAndiNetCDF(file_path) - + reader_gcms.run() - + gcms = reader_gcms.get_gcms_obj() - parameter_from_json.load_and_set_toml_parameters_gcms(gcms, parameters_path=corems_params) - + parameter_from_json.load_and_set_toml_parameters_gcms( + gcms, parameters_path=corems_params + ) + gcms.process_chromatogram() - return gcms + def start_sql_from_file(): - from pathlib import Path from corems.molecular_id.input.nistMSI import ReadNistMSI @@ -167,19 +218,29 @@ def start_sql_from_file(): def run_gcms_mpi(workflow_params_file, replicas, rt_ri_pairs): - import os, sys - sys.path.append(os.getcwd()) + + sys.path.append(os.getcwd()) from mpi4py import MPI - + workflow_params = read_workflow_parameter(workflow_params_file) - rt_ri_pairs = get_calibration_rtri_pairs(workflow_params.calibration_file_path, workflow_params.corems_toml_path) - worker_args = [(file_path, rt_ri_pairs, workflow_params.corems_toml_path, workflow_params.calibration_file_path) for file_path in workflow_params.file_paths] + rt_ri_pairs = get_calibration_rtri_pairs( + workflow_params.calibration_file_path, workflow_params.corems_toml_path + ) + worker_args = [ + ( + file_path, + rt_ri_pairs, + workflow_params.corems_toml_path, + workflow_params.calibration_file_path, + ) + for file_path in workflow_params.file_paths + ] comm = MPI.COMM_WORLD rank = comm.Get_rank() size = comm.Get_size() - + # will only run tasks up to the number of files paths selected in the EnviroMS File if rank < len(worker_args): - workflow_worker(worker_args[rank]) \ No newline at end of file + workflow_worker(worker_args[rank]) From f243bb9013cb3d399a8c8b5487ca1894f54350de Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Tue, 1 Oct 2024 14:50:53 -0700 Subject: [PATCH 02/24] Reorder imports --- metaMS/gcmsWorkflow.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/metaMS/gcmsWorkflow.py b/metaMS/gcmsWorkflow.py index 9fc9725..ee8c667 100644 --- a/metaMS/gcmsWorkflow.py +++ b/metaMS/gcmsWorkflow.py @@ -1,16 +1,14 @@ +import cProfile from dataclasses import dataclass from multiprocessing import Pool from pathlib import Path import toml - -from corems.mass_spectra.input.andiNetCDF import ReadAndiNetCDF from corems.encapsulation.input import parameter_from_json from corems.mass_spectra.calc.GC_RI_Calibration import get_rt_ri_pairs +from corems.mass_spectra.input.andiNetCDF import ReadAndiNetCDF from corems.molecular_id.search.compoundSearch import LowResMassSpectralMatch -import cProfile - @dataclass class WorkflowParameters: @@ -209,6 +207,7 @@ def get_gcms(file_path, corems_params): def start_sql_from_file(): from pathlib import Path + from corems.molecular_id.input.nistMSI import ReadNistMSI ref_lib_path = Path("data/PNNLMetV20191015.MSL") @@ -218,7 +217,8 @@ def start_sql_from_file(): def run_gcms_mpi(workflow_params_file, replicas, rt_ri_pairs): - import os, sys + import os + import sys sys.path.append(os.getcwd()) from mpi4py import MPI From 42249fbfc608d1772a7531d868adfb50072830e3 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Thu, 3 Oct 2024 13:29:55 -0700 Subject: [PATCH 03/24] Add docstrings, comments --- metaMS/gcmsWorkflow.py | 388 +++++++++++++++++++++++++---------------- 1 file changed, 237 insertions(+), 151 deletions(-) diff --git a/metaMS/gcmsWorkflow.py b/metaMS/gcmsWorkflow.py index ee8c667..74fd55f 100644 --- a/metaMS/gcmsWorkflow.py +++ b/metaMS/gcmsWorkflow.py @@ -1,45 +1,90 @@ -import cProfile from dataclasses import dataclass from multiprocessing import Pool from pathlib import Path import toml + +from corems.mass_spectra.input.andiNetCDF import ReadAndiNetCDF from corems.encapsulation.input import parameter_from_json from corems.mass_spectra.calc.GC_RI_Calibration import get_rt_ri_pairs -from corems.mass_spectra.input.andiNetCDF import ReadAndiNetCDF from corems.molecular_id.search.compoundSearch import LowResMassSpectralMatch +import cProfile @dataclass class WorkflowParameters: - file_paths: tuple = ("data/...", "data/...") - # RI FAMES Calibration File - calibration_file_path: str = "data/..." - # Sample/Process Metadata - nmdc_metadata_path: str = "configuration/nmdc_metadata.json" - # configuration file for corems - corems_toml_path: str = "configuration/corems.toml" - output_directory: str = "data/..." - output_filename: str = "data/..." - output_type: str = "csv" + """ + Data class to establish workflow parameters. + + Parameters + ---------- + file_paths : tuple(str) + Paths to files to process. + calibration_file_path : str + FAMEs retention index calibration filepath. + nmdc_metadata_path : str + Sample and processing metadata. + corems_toml_path : str + CoreMS configuration. + output_directory : str + Path to save outputs. + output_filename : str + Output filename. + output_type : + Output extension. + + """ + + file_paths: tuple = ('data/...', 'data/...') + #RI FAMES Calibration File + calibration_file_path: str = 'data/...' + #Sample/Process Metadata + nmdc_metadata_path: str = 'configuration/nmdc_metadata.json' + #configuration file for corems + corems_toml_path: str = 'configuration/corems.toml' + output_directory: str = 'data/...' + output_filename: str = 'data/...' + output_type: str = 'csv' def worker(args): - cProfile.runctx("workflow_worker(args)", globals(), locals(), "gc-ms.prof") - - -def run_gcms_metabolomics_workflow_wdl( - file_paths, - calibration_file_path, - output_directory, - output_filename, - output_type, - corems_toml_path, - jobs, - db_path=None, -): + """ + Wraps `workflow_worker` using cProfile. + + """ + + cProfile.runctx('workflow_worker(args)', globals(), locals(), 'gc-ms.prof') + + +def run_gcms_metabolomics_workflow_wdl(file_paths, calibration_file_path, output_directory, + output_filename, output_type, corems_toml_path, jobs, db_path=None): + """ + GCMS metabolomics workflow with WDL. + + Parameters + ---------- + file_paths : tuple(str) + Paths to files to process. + calibration_file_path : str + FAMEs retention index calibration filepath. + output_directory : str + Path to save outputs. + output_filename : str + Output filename. + output_type : + Output extension. + corems_toml_path : str + CoreMS configuration. + jobs : int + Number of concurrent jobs. + [unused] db_path : str + Path to database. + + """ + import click + # Store workflow parameters workflow_params = WorkflowParameters() workflow_params.file_paths = file_paths.split(",") workflow_params.calibration_file_path = calibration_file_path @@ -48,199 +93,240 @@ def run_gcms_metabolomics_workflow_wdl( workflow_params.output_type = output_type workflow_params.corems_toml_path = corems_toml_path + # Create output directory dirloc = Path(workflow_params.output_directory) dirloc.mkdir(exist_ok=True) - output_path = ( - Path(workflow_params.output_directory) / workflow_params.output_filename - ) - rt_ri_pairs = get_calibration_rtri_pairs( - workflow_params.calibration_file_path, workflow_params.corems_toml_path - ) + # Determine output filepath + output_path = Path(workflow_params.output_directory)/workflow_params.output_filename - worker_args = [ - ( - file_path, - rt_ri_pairs, - workflow_params.corems_toml_path, - workflow_params.calibration_file_path, - ) - for file_path in workflow_params.file_paths - ] - # gcms_list = pool.map(workflow_worker, worker_args) - pool = Pool(int(jobs)) + # Load FAMEs calibration data + gcms_ref_obj = get_gcms(workflow_params.calibration_file_path, + workflow_params.corems_toml_path) - for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): - eval("gcms.to_" + workflow_params.output_type + "(output_path)") + # [HARDCODED] Load FAMEs calibration reference + sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db") - pool.close() - pool.join() + # Compute RT:RI pairs + rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj, sql_obj=sql_obj) + + # Prepare worker arguments + worker_args = [(file_path, rt_ri_pairs, workflow_params.corems_toml_path, workflow_params.calibration_file_path) for file_path in workflow_params.file_paths] + + # Create multiprocess pool + with Pool(int(jobs)) as pool: + + # Map workflow over inputs + for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): + eval('gcms.to_'+ workflow_params.output_type + '(output_path)') def run_nmdc_metabolomics_workflow(workflow_params_file, jobs): + """ + NMDC metabolomics workflow. + + Parameters + ---------- + workflow_params_file : str + Path to workflow parameters file. + jobs : int + Number of concurrent jobs. + + """ + import click - dms_file_path = "db/GC-MS Metabolomics Experiments to Process Final.xlsx" + # [HARDCODED, UNUSED] Path to DMS file path? + dms_file_path = 'db/GC-MS Metabolomics Experiments to Process Final.xlsx' - click.echo("Loading Searching Settings from %s" % workflow_params_file) + # Load workflow settings + click.echo('Loading Searching Settings from %s' % workflow_params_file) workflow_params = read_workflow_parameter(workflow_params_file) + # Create output directory dirloc = Path(workflow_params.output_directory) dirloc.mkdir(exist_ok=True) - rt_ri_pairs = get_calibration_rtri_pairs( - workflow_params.calibration_file_path, workflow_params.corems_toml_path - ) + # Load FAMEs calibration data + gcms_ref_obj = get_gcms(workflow_params.calibration_file_path, + workflow_params.corems_toml_path) - worker_args = [ - ( - file_path, - rt_ri_pairs, - workflow_params.corems_toml_path, - workflow_params.calibration_file_path, - ) - for file_path in workflow_params.file_paths - ] - # gcms_list = pool.map(workflow_worker, worker_args) - pool = Pool(jobs) + # [HARDCODED] Load FAMEs calibration reference + sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db") - for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): - in_file_path = Path(workflow_params.file_paths[i]) - output_path = Path(workflow_params.output_directory) / in_file_path.name + # Compute RT:RI pairs + rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj, sql_obj=sql_obj) - eval( - "gcms.to_" - + workflow_params.output_type - + "(output_path, write_metadata=False)" - ) + # Prepare worker arguments + worker_args = [(file_path, rt_ri_pairs, workflow_params.corems_toml_path, workflow_params.calibration_file_path) for file_path in workflow_params.file_paths] - # nmdc = NMDC_Metadata(in_file_path, workflow_params.calibration_file_path, output_path, dms_file_path) - # nmdc.create_nmdc_metadata(gcms) + # Create multiprocess pool + with Pool(jobs) as pool: - pool.close() - pool.join() + # Map workflow over inputs + for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): + # Determine output path + input_path = Path(workflow_params.file_paths[i]) + output_path = Path(workflow_params.output_directory)/input_path.name + + eval('gcms.to_'+ workflow_params.output_type + '(output_path, write_metadata=False)') + + #nmdc = NMDC_Metadata(in_file_path, workflow_params.calibration_file_path, output_path, dms_file_path) + #nmdc.create_nmdc_metadata(gcms) + def run_gcms_metabolomics_workflow(workflow_params_file, jobs): + """ + GC/MS metabolomics workflow. + + Parameters + ---------- + workflow_params_file : str + Path to workflow parameters file. + jobs : int + Number of concurrent jobs. + + """ + import click - click.echo("Loading Searching Settings from %s" % workflow_params_file) - + # Load workflow settings + click.echo('Loading Searching Settings from %s' % workflow_params_file) workflow_params = read_workflow_parameter(workflow_params_file) + # Create output directory dirloc = Path(workflow_params.output_directory) dirloc.mkdir(exist_ok=True) - output_path = ( - Path(workflow_params.output_directory) / workflow_params.output_filename - ) - rt_ri_pairs = get_calibration_rtri_pairs( - workflow_params.calibration_file_path, workflow_params.corems_toml_path - ) + # Determine output filepath + output_path = Path(workflow_params.output_directory)/workflow_params.output_filename - worker_args = [ - ( - file_path, - rt_ri_pairs, - workflow_params.corems_toml_path, - workflow_params.calibration_file_path, - ) - for file_path in workflow_params.file_paths - ] - # gcms_list = pool.map(workflow_worker, worker_args) - pool = Pool(jobs) + # Load FAMEs calibration data + gcms_ref_obj = get_gcms(workflow_params.calibration_file_path, + workflow_params.corems_toml_path) - for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): - eval("gcms.to_" + workflow_params.output_type + "(output_path)") + # [HARDCODED] Load FAMEs calibration reference + sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db") - pool.close() - pool.join() + # Compute RT:RI pairs + rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj, sql_obj=sql_obj) + # Prepare worker arguments + worker_args = [(file_path, rt_ri_pairs, workflow_params.corems_toml_path, workflow_params.calibration_file_path) for file_path in workflow_params.file_paths] -def read_workflow_parameter(gcms_workflow_paramaters_toml_file): - with open(gcms_workflow_paramaters_toml_file, "r") as infile: - return WorkflowParameters(**toml.load(infile)) + # Create multiprocess pool + with Pool(jobs) as pool: + # Map workflow over inputs + for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): + eval('gcms.to_'+ workflow_params.output_type + '(output_path)') -def get_calibration_rtri_pairs(ref_file_path, corems_paramaters_toml_file): - gcms_ref_obj = get_gcms(ref_file_path, corems_paramaters_toml_file) - # sql_obj = start_sql_from_file() - # rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj,sql_obj=sql_obj) - # !!!!!! READ !!!!! use the previous two lines if db/EMSL_lowres_gcms_test_database.sqlite does not exist - # and comment the next line - rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj) - return rt_ri_pairs + +def read_workflow_parameter(path): + """ + Read workflow configuration parameters from file. + + Parameters + ---------- + path : str + Path to parameters file. + + Returns + ------- + :obj:`WorkflowParameters` + Data class containing workflow parameters. + """ + + with open(path, 'r') as infile: + return WorkflowParameters(**toml.load(infile)) def workflow_worker(args): + """ + Wrap data processing functionality for parallel execution. Loads GC data, + applies calibration, performs spectral search. + + Parameters + ---------- + args : tuple + Arguments fed to worker. + + Returns + ------- + gcms + GCMS object. + + """ + + # Unpack arguments file_path, ref_dict, corems_params, cal_file_path = args + # Load data gcms = get_gcms(file_path, corems_params) + # Calibrate retention indices gcms.calibrate_ri(ref_dict, cal_file_path) - # sql_obj = start_sql_from_file() - # lowResSearch = LowResMassSpectralMatch(gcms, sql_obj=sql_obj) - # !!!!!! READ !!!!! use the previous two lines if db/pnnl_lowres_gcms_compounds.sqlite does not exist - # and comment the next line - lowResSearch = LowResMassSpectralMatch(gcms) + # [HARDCODED] Load reference database + sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_Library_EILowRes_20240816.db") + + # Perform search + lowResSearch = LowResMassSpectralMatch(gcms, sql_obj=sql_obj) lowResSearch.run() return gcms - def get_gcms(file_path, corems_params): + """ + Convenience function to load and process file according to CoreMS configuration + parameters. + + Parameters + ---------- + + """ + + # Read NetCDF file reader_gcms = ReadAndiNetCDF(file_path) + # Process data reader_gcms.run() + # Export to GCMS object gcms = reader_gcms.get_gcms_obj() - parameter_from_json.load_and_set_toml_parameters_gcms( - gcms, parameters_path=corems_params - ) + # Set parameters from file + parameter_from_json.load_and_set_toml_parameters_gcms(gcms, parameters_path=corems_params) + # Process chromatogram gcms.process_chromatogram() return gcms -def start_sql_from_file(): - from pathlib import Path +# def run_gcms_mpi(workflow_params_file, replicas, rt_ri_pairs): + +# import os, sys +# sys.path.append(os.getcwd()) +# from mpi4py import MPI + +# workflow_params = read_workflow_parameter(workflow_params_file) - from corems.molecular_id.input.nistMSI import ReadNistMSI +# gcms_ref_obj = get_gcms(workflow_params.calibration_file_path, +# workflow_params.corems_toml_path) +# sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db") +# rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj, sql_obj=sql_obj) - ref_lib_path = Path("data/PNNLMetV20191015.MSL") - if ref_lib_path.exists: - sql_obj = ReadNistMSI(ref_lib_path).get_sqlLite_obj() - return sql_obj +# worker_args = [(file_path, rt_ri_pairs, workflow_params.corems_toml_path, workflow_params.calibration_file_path) for file_path in workflow_params.file_paths] +# comm = MPI.COMM_WORLD +# rank = comm.Get_rank() +# size = comm.Get_size() + +# # will only run tasks up to the number of files paths selected in the EnviroMS File +# if rank < len(worker_args): +# workflow_worker(worker_args[rank]) -def run_gcms_mpi(workflow_params_file, replicas, rt_ri_pairs): - import os - import sys - sys.path.append(os.getcwd()) - from mpi4py import MPI - - workflow_params = read_workflow_parameter(workflow_params_file) - rt_ri_pairs = get_calibration_rtri_pairs( - workflow_params.calibration_file_path, workflow_params.corems_toml_path - ) - worker_args = [ - ( - file_path, - rt_ri_pairs, - workflow_params.corems_toml_path, - workflow_params.calibration_file_path, - ) - for file_path in workflow_params.file_paths - ] - - comm = MPI.COMM_WORLD - rank = comm.Get_rank() - size = comm.Get_size() - - # will only run tasks up to the number of files paths selected in the EnviroMS File - if rank < len(worker_args): - workflow_worker(worker_args[rank]) +# if \ No newline at end of file From 9af4c96483c538b5ef44ee42625d1c4332fde43e Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Thu, 3 Oct 2024 13:32:36 -0700 Subject: [PATCH 04/24] Enforce format via linter --- metaMS/gcmsWorkflow.py | 166 ++++++++++++++++++++++++++--------------- 1 file changed, 104 insertions(+), 62 deletions(-) diff --git a/metaMS/gcmsWorkflow.py b/metaMS/gcmsWorkflow.py index 74fd55f..60bf64c 100644 --- a/metaMS/gcmsWorkflow.py +++ b/metaMS/gcmsWorkflow.py @@ -1,15 +1,15 @@ +import cProfile from dataclasses import dataclass from multiprocessing import Pool from pathlib import Path import toml - -from corems.mass_spectra.input.andiNetCDF import ReadAndiNetCDF from corems.encapsulation.input import parameter_from_json from corems.mass_spectra.calc.GC_RI_Calibration import get_rt_ri_pairs +from corems.mass_spectra.input.andiNetCDF import ReadAndiNetCDF +from corems.molecular_id.factory.EI_SQL import EI_LowRes_SQLite from corems.molecular_id.search.compoundSearch import LowResMassSpectralMatch -import cProfile @dataclass class WorkflowParameters: @@ -30,21 +30,21 @@ class WorkflowParameters: Path to save outputs. output_filename : str Output filename. - output_type : + output_type : Output extension. """ - - file_paths: tuple = ('data/...', 'data/...') - #RI FAMES Calibration File - calibration_file_path: str = 'data/...' - #Sample/Process Metadata - nmdc_metadata_path: str = 'configuration/nmdc_metadata.json' - #configuration file for corems - corems_toml_path: str = 'configuration/corems.toml' - output_directory: str = 'data/...' - output_filename: str = 'data/...' - output_type: str = 'csv' + + file_paths: tuple = ("data/...", "data/...") + # RI FAMES Calibration File + calibration_file_path: str = "data/..." + # Sample/Process Metadata + nmdc_metadata_path: str = "configuration/nmdc_metadata.json" + # configuration file for corems + corems_toml_path: str = "configuration/corems.toml" + output_directory: str = "data/..." + output_filename: str = "data/..." + output_type: str = "csv" def worker(args): @@ -52,12 +52,20 @@ def worker(args): Wraps `workflow_worker` using cProfile. """ - - cProfile.runctx('workflow_worker(args)', globals(), locals(), 'gc-ms.prof') + + cProfile.runctx("workflow_worker(args)", globals(), locals(), "gc-ms.prof") -def run_gcms_metabolomics_workflow_wdl(file_paths, calibration_file_path, output_directory, - output_filename, output_type, corems_toml_path, jobs, db_path=None): +def run_gcms_metabolomics_workflow_wdl( + file_paths, + calibration_file_path, + output_directory, + output_filename, + output_type, + corems_toml_path, + jobs, + db_path=None, +): """ GCMS metabolomics workflow with WDL. @@ -71,7 +79,7 @@ def run_gcms_metabolomics_workflow_wdl(file_paths, calibration_file_path, output Path to save outputs. output_filename : str Output filename. - output_type : + output_type : Output extension. corems_toml_path : str CoreMS configuration. @@ -81,7 +89,7 @@ def run_gcms_metabolomics_workflow_wdl(file_paths, calibration_file_path, output Path to database. """ - + import click # Store workflow parameters @@ -98,11 +106,14 @@ def run_gcms_metabolomics_workflow_wdl(file_paths, calibration_file_path, output dirloc.mkdir(exist_ok=True) # Determine output filepath - output_path = Path(workflow_params.output_directory)/workflow_params.output_filename + output_path = ( + Path(workflow_params.output_directory) / workflow_params.output_filename + ) # Load FAMEs calibration data - gcms_ref_obj = get_gcms(workflow_params.calibration_file_path, - workflow_params.corems_toml_path) + gcms_ref_obj = get_gcms( + workflow_params.calibration_file_path, workflow_params.corems_toml_path + ) # [HARDCODED] Load FAMEs calibration reference sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db") @@ -110,15 +121,22 @@ def run_gcms_metabolomics_workflow_wdl(file_paths, calibration_file_path, output # Compute RT:RI pairs rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj, sql_obj=sql_obj) - # Prepare worker arguments - worker_args = [(file_path, rt_ri_pairs, workflow_params.corems_toml_path, workflow_params.calibration_file_path) for file_path in workflow_params.file_paths] + # Prepare worker arguments + worker_args = [ + ( + file_path, + rt_ri_pairs, + workflow_params.corems_toml_path, + workflow_params.calibration_file_path, + ) + for file_path in workflow_params.file_paths + ] # Create multiprocess pool with Pool(int(jobs)) as pool: - # Map workflow over inputs for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): - eval('gcms.to_'+ workflow_params.output_type + '(output_path)') + eval("gcms.to_" + workflow_params.output_type + "(output_path)") def run_nmdc_metabolomics_workflow(workflow_params_file, jobs): @@ -133,14 +151,14 @@ def run_nmdc_metabolomics_workflow(workflow_params_file, jobs): Number of concurrent jobs. """ - + import click # [HARDCODED, UNUSED] Path to DMS file path? - dms_file_path = 'db/GC-MS Metabolomics Experiments to Process Final.xlsx' + dms_file_path = "db/GC-MS Metabolomics Experiments to Process Final.xlsx" # Load workflow settings - click.echo('Loading Searching Settings from %s' % workflow_params_file) + click.echo("Loading Searching Settings from %s" % workflow_params_file) workflow_params = read_workflow_parameter(workflow_params_file) # Create output directory @@ -148,8 +166,9 @@ def run_nmdc_metabolomics_workflow(workflow_params_file, jobs): dirloc.mkdir(exist_ok=True) # Load FAMEs calibration data - gcms_ref_obj = get_gcms(workflow_params.calibration_file_path, - workflow_params.corems_toml_path) + gcms_ref_obj = get_gcms( + workflow_params.calibration_file_path, workflow_params.corems_toml_path + ) # [HARDCODED] Load FAMEs calibration reference sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db") @@ -158,23 +177,33 @@ def run_nmdc_metabolomics_workflow(workflow_params_file, jobs): rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj, sql_obj=sql_obj) # Prepare worker arguments - worker_args = [(file_path, rt_ri_pairs, workflow_params.corems_toml_path, workflow_params.calibration_file_path) for file_path in workflow_params.file_paths] + worker_args = [ + ( + file_path, + rt_ri_pairs, + workflow_params.corems_toml_path, + workflow_params.calibration_file_path, + ) + for file_path in workflow_params.file_paths + ] # Create multiprocess pool with Pool(jobs) as pool: - # Map workflow over inputs for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): - # Determine output path input_path = Path(workflow_params.file_paths[i]) - output_path = Path(workflow_params.output_directory)/input_path.name - - eval('gcms.to_'+ workflow_params.output_type + '(output_path, write_metadata=False)') - - #nmdc = NMDC_Metadata(in_file_path, workflow_params.calibration_file_path, output_path, dms_file_path) - #nmdc.create_nmdc_metadata(gcms) - + output_path = Path(workflow_params.output_directory) / input_path.name + + eval( + "gcms.to_" + + workflow_params.output_type + + "(output_path, write_metadata=False)" + ) + + # nmdc = NMDC_Metadata(in_file_path, workflow_params.calibration_file_path, output_path, dms_file_path) + # nmdc.create_nmdc_metadata(gcms) + def run_gcms_metabolomics_workflow(workflow_params_file, jobs): """ @@ -188,11 +217,11 @@ def run_gcms_metabolomics_workflow(workflow_params_file, jobs): Number of concurrent jobs. """ - + import click # Load workflow settings - click.echo('Loading Searching Settings from %s' % workflow_params_file) + click.echo("Loading Searching Settings from %s" % workflow_params_file) workflow_params = read_workflow_parameter(workflow_params_file) # Create output directory @@ -200,11 +229,14 @@ def run_gcms_metabolomics_workflow(workflow_params_file, jobs): dirloc.mkdir(exist_ok=True) # Determine output filepath - output_path = Path(workflow_params.output_directory)/workflow_params.output_filename + output_path = ( + Path(workflow_params.output_directory) / workflow_params.output_filename + ) # Load FAMEs calibration data - gcms_ref_obj = get_gcms(workflow_params.calibration_file_path, - workflow_params.corems_toml_path) + gcms_ref_obj = get_gcms( + workflow_params.calibration_file_path, workflow_params.corems_toml_path + ) # [HARDCODED] Load FAMEs calibration reference sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db") @@ -213,14 +245,21 @@ def run_gcms_metabolomics_workflow(workflow_params_file, jobs): rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj, sql_obj=sql_obj) # Prepare worker arguments - worker_args = [(file_path, rt_ri_pairs, workflow_params.corems_toml_path, workflow_params.calibration_file_path) for file_path in workflow_params.file_paths] + worker_args = [ + ( + file_path, + rt_ri_pairs, + workflow_params.corems_toml_path, + workflow_params.calibration_file_path, + ) + for file_path in workflow_params.file_paths + ] # Create multiprocess pool with Pool(jobs) as pool: - # Map workflow over inputs for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): - eval('gcms.to_'+ workflow_params.output_type + '(output_path)') + eval("gcms.to_" + workflow_params.output_type + "(output_path)") def read_workflow_parameter(path): @@ -238,8 +277,8 @@ def read_workflow_parameter(path): Data class containing workflow parameters. """ - with open(path, 'r') as infile: - return WorkflowParameters(**toml.load(infile)) + with open(path, "r") as infile: + return WorkflowParameters(**toml.load(infile)) def workflow_worker(args): @@ -256,7 +295,7 @@ def workflow_worker(args): ------- gcms GCMS object. - + """ # Unpack arguments @@ -270,13 +309,14 @@ def workflow_worker(args): # [HARDCODED] Load reference database sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_Library_EILowRes_20240816.db") - + # Perform search lowResSearch = LowResMassSpectralMatch(gcms, sql_obj=sql_obj) lowResSearch.run() return gcms + def get_gcms(file_path, corems_params): """ Convenience function to load and process file according to CoreMS configuration @@ -284,7 +324,7 @@ def get_gcms(file_path, corems_params): Parameters ---------- - + """ # Read NetCDF file @@ -297,7 +337,9 @@ def get_gcms(file_path, corems_params): gcms = reader_gcms.get_gcms_obj() # Set parameters from file - parameter_from_json.load_and_set_toml_parameters_gcms(gcms, parameters_path=corems_params) + parameter_from_json.load_and_set_toml_parameters_gcms( + gcms, parameters_path=corems_params + ) # Process chromatogram gcms.process_chromatogram() @@ -306,11 +348,11 @@ def get_gcms(file_path, corems_params): # def run_gcms_mpi(workflow_params_file, replicas, rt_ri_pairs): - + # import os, sys -# sys.path.append(os.getcwd()) +# sys.path.append(os.getcwd()) # from mpi4py import MPI - + # workflow_params = read_workflow_parameter(workflow_params_file) # gcms_ref_obj = get_gcms(workflow_params.calibration_file_path, @@ -323,10 +365,10 @@ def get_gcms(file_path, corems_params): # comm = MPI.COMM_WORLD # rank = comm.Get_rank() # size = comm.Get_size() - + # # will only run tasks up to the number of files paths selected in the EnviroMS File # if rank < len(worker_args): # workflow_worker(worker_args[rank]) -# if \ No newline at end of file +# if From 2c12c317016864bbeff3413f941226ebc2a965f5 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 4 Oct 2024 10:07:46 -0700 Subject: [PATCH 05/24] Update FAMEs indexing to match MetabRef --- configuration/corems.toml | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/configuration/corems.toml b/configuration/corems.toml index 9a61ded..c832d4f 100644 --- a/configuration/corems.toml +++ b/configuration/corems.toml @@ -1,12 +1,25 @@ [MolecularSearch] -url_database = "sqlite:////metams/db/pnnl_lowres_gcms_compounds.sqlite" +url_database: "sqlite:///db/MetabRef_Library_EILowRes_20240816.db" +url_calibration: "sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db" ri_search_range = 35 rt_search_range = 1.0 correlation_threshold = 0.5 score_threshold = 0.0 ri_spacing = 200.0 ri_std = 3.0 -ri_calibration_compound_names = [ " [C8] Methyl Caprylate [7.812]", " [C10] Methyl Caprate [10.647]", " [C9] Methyl Pelargonate [9.248]", " [C12] Methyl Laurate [13.250]", " [C14] Methyl Myristate [15.597]", " [C16] Methyl Palmitate [17.723]", " [C18] Methyl Stearate [19.663]", " [C20] Methyl Eicosanoate [21.441]", " [C22] Methyl Docosanoate [23.082]", " [C24] Methyl Linocerate [24.603]", " [C26] Methyl Hexacosanoate [26.023]", " [C28] Methyl Octacosanoate [27.349]", " [C30] Methyl Triacontanoate [28.72]",] +ri_calibration_compound_names = ['Methyl Caprylate', + 'Methyl Caprate', + 'Methyl Pelargonate', + 'Methyl Laurate', + 'Methyl Myristate', + 'Methyl Palmitate', + 'Methyl Stearate', + 'Methyl Eicosanoate', + 'Methyl Docosanoate', + 'Methyl Linocerate', + 'Methyl Hexacosanoate', + 'Methyl Octacosanoate', + 'Methyl Triacontanoate'] exploratory_mode = false score_methods = [ "highest_sim_score", "highest_ss",] output_score_method = "All" From 67d40b58a2ef061060a4fc5900b46fd5ce79e509 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 4 Oct 2024 10:08:17 -0700 Subject: [PATCH 06/24] Modify to load CoreMS parameters directly --- metaMS/gcmsWorkflow.py | 118 +++++++++++++++++++++++++++++++---------- 1 file changed, 90 insertions(+), 28 deletions(-) diff --git a/metaMS/gcmsWorkflow.py b/metaMS/gcmsWorkflow.py index 60bf64c..d9cdb2a 100644 --- a/metaMS/gcmsWorkflow.py +++ b/metaMS/gcmsWorkflow.py @@ -101,6 +101,10 @@ def run_gcms_metabolomics_workflow_wdl( workflow_params.output_type = output_type workflow_params.corems_toml_path = corems_toml_path + # Load CoreMS settings + click.echo("Loading CoreMS settings from %s" % workflow_params.corems_toml_path) + corems_params = load_corems_parameters(workflow_params.corems_toml_path) + # Create output directory dirloc = Path(workflow_params.output_directory) dirloc.mkdir(exist_ok=True) @@ -111,15 +115,17 @@ def run_gcms_metabolomics_workflow_wdl( ) # Load FAMEs calibration data - gcms_ref_obj = get_gcms( + gcms_cal_obj = get_gcms( workflow_params.calibration_file_path, workflow_params.corems_toml_path ) - # [HARDCODED] Load FAMEs calibration reference - sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db") + # Load FAMEs calibration reference + fames_ref_sql = EI_LowRes_SQLite( + url=corems_params["MolecularSearch"]["url_calibration"] + ) # Compute RT:RI pairs - rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj, sql_obj=sql_obj) + rt_ri_pairs = get_rt_ri_pairs(gcms_cal_obj, sql_obj=fames_ref_sql) # Prepare worker arguments worker_args = [ @@ -158,23 +164,29 @@ def run_nmdc_metabolomics_workflow(workflow_params_file, jobs): dms_file_path = "db/GC-MS Metabolomics Experiments to Process Final.xlsx" # Load workflow settings - click.echo("Loading Searching Settings from %s" % workflow_params_file) - workflow_params = read_workflow_parameter(workflow_params_file) + click.echo("Loading search settings from %s" % workflow_params_file) + workflow_params = load_workflow_parameters(workflow_params_file) + + # Load CoreMS settings + click.echo("Loading CoreMS settings from %s" % workflow_params.corems_toml_path) + corems_params = load_corems_parameters(workflow_params.corems_toml_path) # Create output directory dirloc = Path(workflow_params.output_directory) dirloc.mkdir(exist_ok=True) # Load FAMEs calibration data - gcms_ref_obj = get_gcms( + gcms_cal_obj = get_gcms( workflow_params.calibration_file_path, workflow_params.corems_toml_path ) - # [HARDCODED] Load FAMEs calibration reference - sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db") + # Load FAMEs calibration reference + fames_ref_sql = EI_LowRes_SQLite( + url=corems_params["MolecularSearch"]["url_calibration"] + ) # Compute RT:RI pairs - rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj, sql_obj=sql_obj) + rt_ri_pairs = get_rt_ri_pairs(gcms_cal_obj, sql_obj=fames_ref_sql) # Prepare worker arguments worker_args = [ @@ -221,8 +233,12 @@ def run_gcms_metabolomics_workflow(workflow_params_file, jobs): import click # Load workflow settings - click.echo("Loading Searching Settings from %s" % workflow_params_file) - workflow_params = read_workflow_parameter(workflow_params_file) + click.echo("Loading search settings from %s" % workflow_params_file) + workflow_params = load_workflow_parameters(workflow_params_file) + + # Load CoreMS settings + click.echo("Loading CoreMS settings from %s" % workflow_params.corems_toml_path) + corems_params = load_corems_parameters(workflow_params.corems_toml_path) # Create output directory dirloc = Path(workflow_params.output_directory) @@ -234,15 +250,17 @@ def run_gcms_metabolomics_workflow(workflow_params_file, jobs): ) # Load FAMEs calibration data - gcms_ref_obj = get_gcms( + gcms_cal_obj = get_gcms( workflow_params.calibration_file_path, workflow_params.corems_toml_path ) - # [HARDCODED] Load FAMEs calibration reference - sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db") + # Load FAMEs calibration reference + fames_ref_sql = EI_LowRes_SQLite( + url=corems_params["MolecularSearch"]["url_calibration"] + ) # Compute RT:RI pairs - rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj, sql_obj=sql_obj) + rt_ri_pairs = get_rt_ri_pairs(gcms_cal_obj, sql_obj=fames_ref_sql) # Prepare worker arguments worker_args = [ @@ -262,9 +280,29 @@ def run_gcms_metabolomics_workflow(workflow_params_file, jobs): eval("gcms.to_" + workflow_params.output_type + "(output_path)") -def read_workflow_parameter(path): +def read_toml(path): + """ + Read TOML file. + + Parameters + ---------- + path : str + Path to TOML file. + + Returns + ------- + dict + Dictionary of parameter:value pairs. + + """ + + with open(path, "r", encoding="utf8") as stream: + return toml.load(stream) + + +def load_workflow_parameters(path): """ - Read workflow configuration parameters from file. + Load workflow configuration parameters from file. Parameters ---------- @@ -275,10 +313,29 @@ def read_workflow_parameter(path): ------- :obj:`WorkflowParameters` Data class containing workflow parameters. + + """ + + return WorkflowParameters(**read_toml(path)) + + +def load_corems_parameters(path): + """ + Load workflow configuration parameters from file. + + Parameters + ---------- + path : str + Path to parameters file. + + Returns + ------- + dict + Dictionary of parameter:value pairs. + """ - with open(path, "r") as infile: - return WorkflowParameters(**toml.load(infile)) + return read_toml(path) def workflow_worker(args): @@ -299,19 +356,24 @@ def workflow_worker(args): """ # Unpack arguments - file_path, ref_dict, corems_params, cal_file_path = args + file_path, ref_dict, corems_params_file, cal_file_path = args + + # Load CoreMS parameters + corems_parameters = load_corems_parameters(corems_params_file) # Load data - gcms = get_gcms(file_path, corems_params) + gcms = get_gcms(file_path, corems_params_file) # Calibrate retention indices gcms.calibrate_ri(ref_dict, cal_file_path) - # [HARDCODED] Load reference database - sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_Library_EILowRes_20240816.db") + # Load reference database + ref_db_sql = EI_LowRes_SQLite( + url=corems_parameters["MolecularSearch"]["url_database"] + ) # Perform search - lowResSearch = LowResMassSpectralMatch(gcms, sql_obj=sql_obj) + lowResSearch = LowResMassSpectralMatch(gcms, sql_obj=ref_db_sql) lowResSearch.run() return gcms @@ -353,12 +415,12 @@ def get_gcms(file_path, corems_params): # sys.path.append(os.getcwd()) # from mpi4py import MPI -# workflow_params = read_workflow_parameter(workflow_params_file) +# workflow_params = load_workflow_parameters(workflow_params_file) -# gcms_ref_obj = get_gcms(workflow_params.calibration_file_path, +# gcms_cal_obj = get_gcms(workflow_params.calibration_file_path, # workflow_params.corems_toml_path) # sql_obj = EI_LowRes_SQLite(url="sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db") -# rt_ri_pairs = get_rt_ri_pairs(gcms_ref_obj, sql_obj=sql_obj) +# rt_ri_pairs = get_rt_ri_pairs(gcms_cal_obj, sql_obj=sql_obj) # worker_args = [(file_path, rt_ri_pairs, workflow_params.corems_toml_path, workflow_params.calibration_file_path) for file_path in workflow_params.file_paths] From db637b3379da478c207e5942d567830a2bd1d0fe Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 4 Oct 2024 10:33:04 -0700 Subject: [PATCH 07/24] Remove unused import --- metaMS/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/metaMS/cli.py b/metaMS/cli.py index 4e991b0..b1b8f4a 100644 --- a/metaMS/cli.py +++ b/metaMS/cli.py @@ -1,4 +1,3 @@ -from multiprocessing import Pool from pathlib import Path import click From 4a0c61f6ecdb437c19de92e7c971827a2e073880 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Mon, 7 Oct 2024 15:45:38 -0700 Subject: [PATCH 08/24] Move FAMEs reference to MetaMS configuration file --- configuration/corems.toml | 1 - configuration/metams.toml | 1 + metaMS/cli.py | 3 +++ metaMS/gcmsWorkflow.py | 28 ++++++++++++++-------------- wdl/metaMS.wdl | 2 ++ wdl/metams_input.json | 3 +-- 6 files changed, 21 insertions(+), 17 deletions(-) diff --git a/configuration/corems.toml b/configuration/corems.toml index c832d4f..a3629cb 100644 --- a/configuration/corems.toml +++ b/configuration/corems.toml @@ -1,6 +1,5 @@ [MolecularSearch] url_database: "sqlite:///db/MetabRef_Library_EILowRes_20240816.db" -url_calibration: "sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db" ri_search_range = 35 rt_search_range = 1.0 correlation_threshold = 0.5 diff --git a/configuration/metams.toml b/configuration/metams.toml index c92ef11..4a311bc 100644 --- a/configuration/metams.toml +++ b/configuration/metams.toml @@ -1,4 +1,5 @@ file_paths = [ "data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", "data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf"] +calibration_reference_path = "sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db" calibration_file_path = "data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf" corems_toml_path = "configuration/corems.toml" nmdc_metadata_path = "configuration/nmdc_metadata.json" diff --git a/metaMS/cli.py b/metaMS/cli.py index b1b8f4a..36d7d0e 100644 --- a/metaMS/cli.py +++ b/metaMS/cli.py @@ -20,6 +20,7 @@ def cli(): @cli.command() @click.argument("file_paths", required=True, type=str) +@click.argument("calibration_reference_path", required=True, type=str) @click.argument("calibration_file_path", required=True, type=str) @click.argument("output_directory", required=True, type=str) @click.argument("output_filename", required=True, type=str) @@ -29,6 +30,7 @@ def cli(): @click.option("--jobs", "-j", default=4, help="'cpu's'") def run_gcms_wdl_workflow( file_paths, + calibration_reference_path, calibration_file_path, output_directory, output_filename, @@ -46,6 +48,7 @@ def run_gcms_wdl_workflow( click.echo("Running gcms workflow") run_gcms_metabolomics_workflow_wdl( file_paths, + calibration_reference_path, calibration_file_path, output_directory, output_filename, diff --git a/metaMS/gcmsWorkflow.py b/metaMS/gcmsWorkflow.py index d9cdb2a..b19c9be 100644 --- a/metaMS/gcmsWorkflow.py +++ b/metaMS/gcmsWorkflow.py @@ -35,12 +35,17 @@ class WorkflowParameters: """ + # Filepaths to process file_paths: tuple = ("data/...", "data/...") - # RI FAMES Calibration File + + # RI FAMEs calibration files + calibration_reference_path: str = "data/..." calibration_file_path: str = "data/..." + # Sample/Process Metadata nmdc_metadata_path: str = "configuration/nmdc_metadata.json" - # configuration file for corems + + # Configuration file for corems corems_toml_path: str = "configuration/corems.toml" output_directory: str = "data/..." output_filename: str = "data/..." @@ -58,6 +63,7 @@ def worker(args): def run_gcms_metabolomics_workflow_wdl( file_paths, + calibration_reference_path, calibration_file_path, output_directory, output_filename, @@ -73,6 +79,8 @@ def run_gcms_metabolomics_workflow_wdl( ---------- file_paths : tuple(str) Paths to files to process. + calibration_reference_path : str + FAMEs retention index calibration reference filepath. calibration_file_path : str FAMEs retention index calibration filepath. output_directory : str @@ -95,6 +103,7 @@ def run_gcms_metabolomics_workflow_wdl( # Store workflow parameters workflow_params = WorkflowParameters() workflow_params.file_paths = file_paths.split(",") + workflow_params.calibration_reference_path = calibration_reference_path workflow_params.calibration_file_path = calibration_file_path workflow_params.output_directory = output_directory workflow_params.output_filename = output_filename @@ -103,7 +112,6 @@ def run_gcms_metabolomics_workflow_wdl( # Load CoreMS settings click.echo("Loading CoreMS settings from %s" % workflow_params.corems_toml_path) - corems_params = load_corems_parameters(workflow_params.corems_toml_path) # Create output directory dirloc = Path(workflow_params.output_directory) @@ -121,7 +129,7 @@ def run_gcms_metabolomics_workflow_wdl( # Load FAMEs calibration reference fames_ref_sql = EI_LowRes_SQLite( - url=corems_params["MolecularSearch"]["url_calibration"] + url=workflow_params.calibration_reference_path ) # Compute RT:RI pairs @@ -167,10 +175,6 @@ def run_nmdc_metabolomics_workflow(workflow_params_file, jobs): click.echo("Loading search settings from %s" % workflow_params_file) workflow_params = load_workflow_parameters(workflow_params_file) - # Load CoreMS settings - click.echo("Loading CoreMS settings from %s" % workflow_params.corems_toml_path) - corems_params = load_corems_parameters(workflow_params.corems_toml_path) - # Create output directory dirloc = Path(workflow_params.output_directory) dirloc.mkdir(exist_ok=True) @@ -182,7 +186,7 @@ def run_nmdc_metabolomics_workflow(workflow_params_file, jobs): # Load FAMEs calibration reference fames_ref_sql = EI_LowRes_SQLite( - url=corems_params["MolecularSearch"]["url_calibration"] + url=workflow_params.calibration_reference_path ) # Compute RT:RI pairs @@ -236,10 +240,6 @@ def run_gcms_metabolomics_workflow(workflow_params_file, jobs): click.echo("Loading search settings from %s" % workflow_params_file) workflow_params = load_workflow_parameters(workflow_params_file) - # Load CoreMS settings - click.echo("Loading CoreMS settings from %s" % workflow_params.corems_toml_path) - corems_params = load_corems_parameters(workflow_params.corems_toml_path) - # Create output directory dirloc = Path(workflow_params.output_directory) dirloc.mkdir(exist_ok=True) @@ -256,7 +256,7 @@ def run_gcms_metabolomics_workflow(workflow_params_file, jobs): # Load FAMEs calibration reference fames_ref_sql = EI_LowRes_SQLite( - url=corems_params["MolecularSearch"]["url_calibration"] + url=workflow_params.calibration_reference_path ) # Compute RT:RI pairs diff --git a/wdl/metaMS.wdl b/wdl/metaMS.wdl index 2022b61..32d3fdb 100644 --- a/wdl/metaMS.wdl +++ b/wdl/metaMS.wdl @@ -13,6 +13,7 @@ workflow gcmsMetabolomics { task runMetaMS { input { Array[File] file_paths + File calibration_reference_path File calibration_file_path String output_directory String output_filename @@ -25,6 +26,7 @@ task runMetaMS { command { metaMS run-gcms-wdl-workflow \ ${sep=',' file_paths} \ + ${calibration_reference_path} \ ${calibration_file_path} \ ${output_directory} \ ${output_filename} \ diff --git a/wdl/metams_input.json b/wdl/metams_input.json index 4479f2a..4534268 100644 --- a/wdl/metams_input.json +++ b/wdl/metams_input.json @@ -3,7 +3,7 @@ "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf" ], - + "gcmsMetabolomics.runMetaMS.calibration_reference_path": "sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db", "gcmsMetabolomics.runMetaMS.calibration_file_path": "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", "gcmsMetabolomics.runMetaMS.output_directory": "test_output", "gcmsMetabolomics.runMetaMS.output_filename": "test_dataset", @@ -11,5 +11,4 @@ "gcmsMetabolomics.runMetaMS.corems_toml_path": "./configuration/corems.toml", "gcmsMetabolomics.runMetaMS.nmdc_metadata_path": "./configuration/nmdc_metadata.json", "gcmsMetabolomics.runMetaMS.jobs_count": 4 - } \ No newline at end of file From 69cd0f7f47146bfa036b2d78fe18a9f5fa70f81d Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 11 Oct 2024 14:45:00 -0700 Subject: [PATCH 09/24] Add FAMEs reference database to workflow parameters template --- metaMS/gcmsWorkflow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metaMS/gcmsWorkflow.py b/metaMS/gcmsWorkflow.py index b19c9be..18c9754 100644 --- a/metaMS/gcmsWorkflow.py +++ b/metaMS/gcmsWorkflow.py @@ -20,6 +20,8 @@ class WorkflowParameters: ---------- file_paths : tuple(str) Paths to files to process. + calibration_reference_path : str + FAMEs retention index reference SQLite database. calibration_file_path : str FAMEs retention index calibration filepath. nmdc_metadata_path : str From 600cea0d2f099fb0df4a1078547bdb07cf2138d8 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 11 Oct 2024 14:45:28 -0700 Subject: [PATCH 10/24] Update CoreMS configuration to version 2.* compliance --- configuration/corems.toml | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/configuration/corems.toml b/configuration/corems.toml index a3629cb..0c6d1ed 100644 --- a/configuration/corems.toml +++ b/configuration/corems.toml @@ -1,24 +1,26 @@ [MolecularSearch] -url_database: "sqlite:///db/MetabRef_Library_EILowRes_20240816.db" -ri_search_range = 35 +url_database = "sqlite:///db/MetabRef_Library_EILowRes_20240816.db" +ri_search_range = 35.0 rt_search_range = 1.0 correlation_threshold = 0.5 score_threshold = 0.0 ri_spacing = 200.0 ri_std = 3.0 -ri_calibration_compound_names = ['Methyl Caprylate', - 'Methyl Caprate', - 'Methyl Pelargonate', - 'Methyl Laurate', - 'Methyl Myristate', - 'Methyl Palmitate', - 'Methyl Stearate', - 'Methyl Eicosanoate', - 'Methyl Docosanoate', - 'Methyl Linocerate', - 'Methyl Hexacosanoate', - 'Methyl Octacosanoate', - 'Methyl Triacontanoate'] +ri_calibration_compound_names = [ + "Methyl Caprylate", + "Methyl Caprate", + "Methyl Pelargonate", + "Methyl Laurate", + "Methyl Myristate", + "Methyl Palmitate", + "Methyl Stearate", + "Methyl Eicosanoate", + "Methyl Docosanoate", + "Methyl Linocerate", + "Methyl Hexacosanoate", + "Methyl Octacosanoate", + "Methyl Triacontanoate", +] exploratory_mode = false score_methods = [ "highest_sim_score", "highest_ss",] output_score_method = "All" @@ -29,12 +31,13 @@ implemented_smooth_method = [ "savgol", "hanning", "blackman", "bartlett", "flat smooth_window = 5 smooth_method = "savgol" savgol_pol_order = 2 +peak_derivative_threshold = 0.0005 peak_height_max_percent = 10.0 peak_max_prominence_percent = 1.0 min_peak_datapoints = 5.0 max_peak_width = 0.1 noise_threshold_method = "manual_relative_abundance" -implemented_noise_threshold_methods = [ "auto_relative_abundance", "manual_relative_abundance", "second_derivative",] +noise_threshold_methods_implemented = [ "auto_relative_abundance", "manual_relative_abundance", "second_derivative",] std_noise_threshold = 3 peak_height_min_percent = 0.1 peak_min_prominence_percent = 0.1 From 4ff046000b4e827df569a0fbb10a2d96ebec9ca7 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Mon, 14 Oct 2024 09:42:58 -0700 Subject: [PATCH 11/24] Update base image --- Dockerfile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4480941..0c1fd3e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,7 @@ -FROM corilo/corems:base-mono-pythonnet +FROM jcarr87/corems-base-py3.10 WORKDIR /metams COPY metaMS/ /metams/metaMS/ COPY README.md disclaimer.txt Makefile requirements.txt setup.py /metams/ COPY db/ /metams/db/ RUN pip3 install --editable . - - - From 0554ce4a080a9bab1e634a98b3087bc0ffce1edf Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Wed, 13 Nov 2024 15:03:16 -0800 Subject: [PATCH 12/24] Update CoreMS version --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 3ee2127..464f194 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -corems>=1.5.0 +corems>=2.1.0 Click>=7.1.1 requests nmdc-schema>=7.0.0 From db5328f7b9a756d9b3acbd761da89f5e71c26e39 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 22 Nov 2024 12:08:14 -0800 Subject: [PATCH 13/24] Update workflow to build and test against WDL --- .github/workflows/wdl_checker.yml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wdl_checker.yml b/.github/workflows/wdl_checker.yml index f01f235..54499dd 100644 --- a/.github/workflows/wdl_checker.yml +++ b/.github/workflows/wdl_checker.yml @@ -17,12 +17,26 @@ jobs: with: python-version: '3.8' # specify the Python version you need + - name: Install Docker + run: | + curl -fsSL https://get.docker.com -o get-docker.sh + sh get-docker.sh + sudo usermod -aG docker $USER + + - name: Build Docker Image + run: | + docker build -t local-metams:latest . + - name: Install MiniWDL run: | python -m pip install --upgrade pip pip install miniwdl - - name: Run MiniWDL + - name: Test Lipid MiniWDL + run: | + miniwdl run wdl/metaMS_lipidomics.wdl -i wdl/metams_input_lipidomics.json --verbose --no-cache --copy-input-files + + - name: Test GCMS MiniWDL run: | # Add the commands to run your MiniWDL workflow - miniwdl run wdl/metaMS.wdl -i wdl/metams_input.json --verbose --no-cache --copy-input-files \ No newline at end of file + miniwdl run wdl/metaMS_gcms.wdl -i wdl/metams_input_gcms.json --verbose --no-cache --copy-input-files From 54ab91ff759631470c4688151848dbf2954d7964 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 22 Nov 2024 12:08:44 -0800 Subject: [PATCH 14/24] Update to build from base python image --- Dockerfile | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0c1fd3e..481ad02 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,25 @@ -FROM jcarr87/corems-base-py3.10 -WORKDIR /metams +# Python base image +FROM python:3.11.1-bullseye + +# Mono: 6.12 +RUN apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys 3FA7E0328081BFF6A14DA29AA6A19B38D3D831EF \ + && echo "deb http://download.mono-project.com/repo/debian buster/snapshots/6.12 main" > /etc/apt/sources.list.d/mono-official.list \ + && apt-get update \ + && apt-get install -y clang \ + && apt-get install -y mono-devel=6.12\* \ + && rm -rf /var/lib/apt/lists/* /tmp/* + +# Pythonnet: 3.0.1 (from PyPI) +# Note: pycparser must be installed before pythonnet can be built +RUN pip install pycparser \ + && pip install pythonnet==3.0.1 + +# Copy MetaMS contents +WORKDIR /metams COPY metaMS/ /metams/metaMS/ COPY README.md disclaimer.txt Makefile requirements.txt setup.py /metams/ COPY db/ /metams/db/ + +# Install the MetaMS package in editable mode RUN pip3 install --editable . From 803987b21433e8dcab71c303e2ce48c79cb57bd4 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 22 Nov 2024 12:11:24 -0800 Subject: [PATCH 15/24] Change docker image to local build --- wdl/metaMS.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/wdl/metaMS.wdl b/wdl/metaMS.wdl index 32d3fdb..3e2af8b 100644 --- a/wdl/metaMS.wdl +++ b/wdl/metaMS.wdl @@ -43,6 +43,7 @@ task runMetaMS { } runtime { - docker: "microbiomedata/metams:2.2.2" + docker: "local-metams:latest" + #TODO KRH: Change to dockerhub version after we've pushed the updated image } } \ No newline at end of file From d58bc20f931f8fada7c95cbfab44ffbca01849c4 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 22 Nov 2024 12:38:33 -0800 Subject: [PATCH 16/24] Update configuration path names --- configuration/gcms_metams.toml | 2 +- wdl/metams_input_gcms.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configuration/gcms_metams.toml b/configuration/gcms_metams.toml index 4a311bc..307b35e 100644 --- a/configuration/gcms_metams.toml +++ b/configuration/gcms_metams.toml @@ -1,7 +1,7 @@ file_paths = [ "data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", "data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf"] calibration_reference_path = "sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db" calibration_file_path = "data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf" -corems_toml_path = "configuration/corems.toml" +corems_toml_path = "configuration/gcms_corems.toml" nmdc_metadata_path = "configuration/nmdc_metadata.json" output_directory = "output" output_filename = "output" diff --git a/wdl/metams_input_gcms.json b/wdl/metams_input_gcms.json index 109069f..ac155ab 100644 --- a/wdl/metams_input_gcms.json +++ b/wdl/metams_input_gcms.json @@ -8,7 +8,7 @@ "gcmsMetabolomics.runMetaMS.output_directory": "test_output", "gcmsMetabolomics.runMetaMS.output_filename": "test_dataset", "gcmsMetabolomics.runMetaMS.output_type": "csv", -"gcmsMetabolomics.runMetaMS.corems_toml_path": "./configuration/corems.toml", +"gcmsMetabolomics.runMetaMS.corems_toml_path": "./configuration/gcms_corems.toml", "gcmsMetabolomics.runMetaMS.nmdc_metadata_path": "./configuration/nmdc_metadata.json", "gcmsMetabolomics.runMetaMS.jobs_count": 4 } \ No newline at end of file From 570490675ac6cb5f2aaa30a518f8b6715b7eecda Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 13 Dec 2024 10:43:21 -0800 Subject: [PATCH 17/24] Remove hard version specification --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index edeb4e1..8a2bc52 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -corems==3.0.2 +corems>=3.0.2 Click>=7.1.1 requests nmdc-schema>=7.0.0 \ No newline at end of file From ad96cc31c634a66025845ab976c3a3408fcf84ce Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 13 Dec 2024 13:02:27 -0800 Subject: [PATCH 18/24] Fix enumeration in NMDC subworkflow --- metaMS/gcmsWorkflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metaMS/gcmsWorkflow.py b/metaMS/gcmsWorkflow.py index 18c9754..6bbcdcd 100644 --- a/metaMS/gcmsWorkflow.py +++ b/metaMS/gcmsWorkflow.py @@ -208,7 +208,7 @@ def run_nmdc_metabolomics_workflow(workflow_params_file, jobs): # Create multiprocess pool with Pool(jobs) as pool: # Map workflow over inputs - for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args), 1): + for i, gcms in enumerate(pool.imap_unordered(workflow_worker, worker_args)): # Determine output path input_path = Path(workflow_params.file_paths[i]) output_path = Path(workflow_params.output_directory) / input_path.name From f2a3ca897d3da7671f1f167acbd9b1de231b4987 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 13 Dec 2024 13:03:03 -0800 Subject: [PATCH 19/24] Defer import of thermo read functionality --- metaMS/lcms_lipidomics_workflow.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py index c72052f..7511d31 100644 --- a/metaMS/lcms_lipidomics_workflow.py +++ b/metaMS/lcms_lipidomics_workflow.py @@ -5,7 +5,6 @@ from multiprocessing import Pool from corems.mass_spectra.input.mzml import MZMLSpectraParser -from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader @dataclass class LipidomicsWorkflowParameters: @@ -55,7 +54,7 @@ def instantiate_lcms_obj(file_in): # Instantiate parser based on binary file type if ".raw" in str(file_in): #TODO KRH: Add real functionality here - pass + from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader #parser = ImportMassSpectraThermoMSFileReader(file_in) if ".mzML" in str(file_in): From 6e884c34fdba9061d95382b28cf5c67aaba7575a Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 13 Dec 2024 13:12:12 -0800 Subject: [PATCH 20/24] Refactor to interface with MetabRef database --- configuration/gcms_corems.toml | 2 +- configuration/gcms_metams.toml | 1 - metaMS/cli.py | 3 --- metaMS/gcmsWorkflow.py | 19 ++++--------------- wdl/metaMS_gcms.wdl | 2 -- wdl/metams_input_gcms.json | 1 - 6 files changed, 5 insertions(+), 23 deletions(-) diff --git a/configuration/gcms_corems.toml b/configuration/gcms_corems.toml index 0c6d1ed..1601bb9 100644 --- a/configuration/gcms_corems.toml +++ b/configuration/gcms_corems.toml @@ -1,5 +1,5 @@ [MolecularSearch] -url_database = "sqlite:///db/MetabRef_Library_EILowRes_20240816.db" +url_database = "sqlite:///db/MetabRef_Library_EILowRes_20240816.db" # This is no longer needed ri_search_range = 35.0 rt_search_range = 1.0 correlation_threshold = 0.5 diff --git a/configuration/gcms_metams.toml b/configuration/gcms_metams.toml index 307b35e..aa71dfa 100644 --- a/configuration/gcms_metams.toml +++ b/configuration/gcms_metams.toml @@ -1,5 +1,4 @@ file_paths = [ "data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", "data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf"] -calibration_reference_path = "sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db" calibration_file_path = "data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf" corems_toml_path = "configuration/gcms_corems.toml" nmdc_metadata_path = "configuration/nmdc_metadata.json" diff --git a/metaMS/cli.py b/metaMS/cli.py index ec78cd8..a6112df 100644 --- a/metaMS/cli.py +++ b/metaMS/cli.py @@ -25,7 +25,6 @@ def cli(): @cli.command() @click.argument("file_paths", required=True, type=str) -@click.argument("calibration_reference_path", required=True, type=str) @click.argument("calibration_file_path", required=True, type=str) @click.argument("output_directory", required=True, type=str) @click.argument("output_filename", required=True, type=str) @@ -35,7 +34,6 @@ def cli(): @click.option("--jobs", "-j", default=4, help="'cpu's'") def run_gcms_wdl_workflow( file_paths, - calibration_reference_path, calibration_file_path, output_directory, output_filename, @@ -53,7 +51,6 @@ def run_gcms_wdl_workflow( click.echo("Running gcms workflow") run_gcms_metabolomics_workflow_wdl( file_paths, - calibration_reference_path, calibration_file_path, output_directory, output_filename, diff --git a/metaMS/gcmsWorkflow.py b/metaMS/gcmsWorkflow.py index 6bbcdcd..62c0b27 100644 --- a/metaMS/gcmsWorkflow.py +++ b/metaMS/gcmsWorkflow.py @@ -9,6 +9,7 @@ from corems.mass_spectra.input.andiNetCDF import ReadAndiNetCDF from corems.molecular_id.factory.EI_SQL import EI_LowRes_SQLite from corems.molecular_id.search.compoundSearch import LowResMassSpectralMatch +from corems.molecular_id.search.database_interfaces import MetabRefGCInterface @dataclass @@ -81,8 +82,6 @@ def run_gcms_metabolomics_workflow_wdl( ---------- file_paths : tuple(str) Paths to files to process. - calibration_reference_path : str - FAMEs retention index calibration reference filepath. calibration_file_path : str FAMEs retention index calibration filepath. output_directory : str @@ -105,7 +104,6 @@ def run_gcms_metabolomics_workflow_wdl( # Store workflow parameters workflow_params = WorkflowParameters() workflow_params.file_paths = file_paths.split(",") - workflow_params.calibration_reference_path = calibration_reference_path workflow_params.calibration_file_path = calibration_file_path workflow_params.output_directory = output_directory workflow_params.output_filename = output_filename @@ -130,9 +128,7 @@ def run_gcms_metabolomics_workflow_wdl( ) # Load FAMEs calibration reference - fames_ref_sql = EI_LowRes_SQLite( - url=workflow_params.calibration_reference_path - ) + fames_ref_sql = MetabRefGCInterface().get_fames(format="sql") # Compute RT:RI pairs rt_ri_pairs = get_rt_ri_pairs(gcms_cal_obj, sql_obj=fames_ref_sql) @@ -257,9 +253,7 @@ def run_gcms_metabolomics_workflow(workflow_params_file, jobs): ) # Load FAMEs calibration reference - fames_ref_sql = EI_LowRes_SQLite( - url=workflow_params.calibration_reference_path - ) + fames_ref_sql = MetabRefGCInterface().get_fames(format="sql") # Compute RT:RI pairs rt_ri_pairs = get_rt_ri_pairs(gcms_cal_obj, sql_obj=fames_ref_sql) @@ -360,9 +354,6 @@ def workflow_worker(args): # Unpack arguments file_path, ref_dict, corems_params_file, cal_file_path = args - # Load CoreMS parameters - corems_parameters = load_corems_parameters(corems_params_file) - # Load data gcms = get_gcms(file_path, corems_params_file) @@ -370,9 +361,7 @@ def workflow_worker(args): gcms.calibrate_ri(ref_dict, cal_file_path) # Load reference database - ref_db_sql = EI_LowRes_SQLite( - url=corems_parameters["MolecularSearch"]["url_database"] - ) + ref_db_sql = MetabRefGCInterface().get_library(format="sql") # Perform search lowResSearch = LowResMassSpectralMatch(gcms, sql_obj=ref_db_sql) diff --git a/wdl/metaMS_gcms.wdl b/wdl/metaMS_gcms.wdl index 75339cc..ba37011 100644 --- a/wdl/metaMS_gcms.wdl +++ b/wdl/metaMS_gcms.wdl @@ -13,7 +13,6 @@ workflow gcmsMetabolomics { task runMetaMSGCMS { input { Array[File] file_paths - File calibration_reference_path File calibration_file_path String output_directory String output_filename @@ -26,7 +25,6 @@ task runMetaMSGCMS { command { metaMS run-gcms-wdl-workflow \ ${sep=',' file_paths} \ - ${calibration_reference_path} \ ${calibration_file_path} \ ${output_directory} \ ${output_filename} \ diff --git a/wdl/metams_input_gcms.json b/wdl/metams_input_gcms.json index ac155ab..2a26ace 100644 --- a/wdl/metams_input_gcms.json +++ b/wdl/metams_input_gcms.json @@ -3,7 +3,6 @@ "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf" ], -"gcmsMetabolomics.runMetaMS.calibration_reference_path": "sqlite:///db/MetabRef_FAMEs_EILowRes_20240816.db", "gcmsMetabolomics.runMetaMS.calibration_file_path": "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", "gcmsMetabolomics.runMetaMS.output_directory": "test_output", "gcmsMetabolomics.runMetaMS.output_filename": "test_dataset", From a693d1b2dda14bf251a0a82fdc27475b8da614a6 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 13 Dec 2024 13:26:13 -0800 Subject: [PATCH 21/24] Add functionality to authenticate via MetabRef API token --- configuration/gcms_metams.toml | 1 + metaMS/cli.py | 3 +++ metaMS/gcmsWorkflow.py | 14 +++++++++++--- wdl/metaMS_gcms.wdl | 2 ++ 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/configuration/gcms_metams.toml b/configuration/gcms_metams.toml index aa71dfa..eed996c 100644 --- a/configuration/gcms_metams.toml +++ b/configuration/gcms_metams.toml @@ -5,3 +5,4 @@ nmdc_metadata_path = "configuration/nmdc_metadata.json" output_directory = "output" output_filename = "output" output_type = "csv" +metabref_token_path = "configuration/metabref.token" diff --git a/metaMS/cli.py b/metaMS/cli.py index a6112df..e7d50cc 100644 --- a/metaMS/cli.py +++ b/metaMS/cli.py @@ -31,6 +31,7 @@ def cli(): @click.argument("output_type", required=True, type=str) @click.argument("corems_toml_path", required=True, type=str) @click.argument("nmdc_metadata_path", required=True, type=str) +@click.argument("metabref_token_path", required=True, type=str) @click.option("--jobs", "-j", default=4, help="'cpu's'") def run_gcms_wdl_workflow( file_paths, @@ -40,6 +41,7 @@ def run_gcms_wdl_workflow( output_type, corems_toml_path, nmdc_metadata_path, + metabref_token_path, jobs, ): """Run the GCMS workflow\n @@ -56,6 +58,7 @@ def run_gcms_wdl_workflow( output_filename, output_type, corems_toml_path, + metabref_token_path, jobs, ) diff --git a/metaMS/gcmsWorkflow.py b/metaMS/gcmsWorkflow.py index 62c0b27..3cf8ca0 100644 --- a/metaMS/gcmsWorkflow.py +++ b/metaMS/gcmsWorkflow.py @@ -54,6 +54,9 @@ class WorkflowParameters: output_filename: str = "data/..." output_type: str = "csv" + # Token + metabref_token_path: str = "configuration/..." + def worker(args): """ @@ -72,6 +75,7 @@ def run_gcms_metabolomics_workflow_wdl( output_filename, output_type, corems_toml_path, + metabref_token_path, jobs, db_path=None, ): @@ -92,6 +96,8 @@ def run_gcms_metabolomics_workflow_wdl( Output extension. corems_toml_path : str CoreMS configuration. + metabref_token_path : str + Token to authenticate MetabRef database access. jobs : int Number of concurrent jobs. [unused] db_path : str @@ -109,6 +115,7 @@ def run_gcms_metabolomics_workflow_wdl( workflow_params.output_filename = output_filename workflow_params.output_type = output_type workflow_params.corems_toml_path = corems_toml_path + workflow_params.metabref_token_path = metabref_token_path # Load CoreMS settings click.echo("Loading CoreMS settings from %s" % workflow_params.corems_toml_path) @@ -128,6 +135,7 @@ def run_gcms_metabolomics_workflow_wdl( ) # Load FAMEs calibration reference + MetabRefGCInterface().set_token(workflow_params.metabref_token_path) fames_ref_sql = MetabRefGCInterface().get_fames(format="sql") # Compute RT:RI pairs @@ -183,9 +191,8 @@ def run_nmdc_metabolomics_workflow(workflow_params_file, jobs): ) # Load FAMEs calibration reference - fames_ref_sql = EI_LowRes_SQLite( - url=workflow_params.calibration_reference_path - ) + MetabRefGCInterface().set_token(workflow_params.metabref_token_path) + fames_ref_sql = MetabRefGCInterface().get_fames(format='sql') # Compute RT:RI pairs rt_ri_pairs = get_rt_ri_pairs(gcms_cal_obj, sql_obj=fames_ref_sql) @@ -253,6 +260,7 @@ def run_gcms_metabolomics_workflow(workflow_params_file, jobs): ) # Load FAMEs calibration reference + MetabRefGCInterface().set_token(workflow_params.metabref_token_path) fames_ref_sql = MetabRefGCInterface().get_fames(format="sql") # Compute RT:RI pairs diff --git a/wdl/metaMS_gcms.wdl b/wdl/metaMS_gcms.wdl index ba37011..b7fbd12 100644 --- a/wdl/metaMS_gcms.wdl +++ b/wdl/metaMS_gcms.wdl @@ -19,6 +19,7 @@ task runMetaMSGCMS { String output_type File corems_toml_path File nmdc_metadata_path + File metabref_token_path Int jobs_count } @@ -31,6 +32,7 @@ task runMetaMSGCMS { ${output_type} \ ${corems_toml_path} \ ${nmdc_metadata_path} \ + ${metabref_token_path} \ --jobs ${jobs_count} } From 975e8663310d6977de3b349884811a78057b3183 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 13 Dec 2024 13:27:13 -0800 Subject: [PATCH 22/24] Add token path entry --- wdl/metams_input_gcms.json | 1 + 1 file changed, 1 insertion(+) diff --git a/wdl/metams_input_gcms.json b/wdl/metams_input_gcms.json index 2a26ace..5c3ced9 100644 --- a/wdl/metams_input_gcms.json +++ b/wdl/metams_input_gcms.json @@ -9,5 +9,6 @@ "gcmsMetabolomics.runMetaMS.output_type": "csv", "gcmsMetabolomics.runMetaMS.corems_toml_path": "./configuration/gcms_corems.toml", "gcmsMetabolomics.runMetaMS.nmdc_metadata_path": "./configuration/nmdc_metadata.json", +"gcmsMetabolomics.runMetaMS.metabref_token_path": "./configuration/metabref.token", "gcmsMetabolomics.runMetaMS.jobs_count": 4 } \ No newline at end of file From 64ce234e7f0efa8506a6069f672b25acd0617d45 Mon Sep 17 00:00:00 2001 From: Sean Colby Date: Fri, 13 Dec 2024 13:34:05 -0800 Subject: [PATCH 23/24] Remove explicit database URL due to MetabRef connectivity --- configuration/gcms_corems.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/configuration/gcms_corems.toml b/configuration/gcms_corems.toml index 1601bb9..8487e66 100644 --- a/configuration/gcms_corems.toml +++ b/configuration/gcms_corems.toml @@ -1,5 +1,4 @@ [MolecularSearch] -url_database = "sqlite:///db/MetabRef_Library_EILowRes_20240816.db" # This is no longer needed ri_search_range = 35.0 rt_search_range = 1.0 correlation_threshold = 0.5 From 10aaa968061eb832d5a2e16f6ba3c38925a9cc79 Mon Sep 17 00:00:00 2001 From: Katherine Heal Date: Mon, 16 Dec 2024 14:25:37 -0800 Subject: [PATCH 24/24] Modify GCMS wdl workflow parameters and remove extra parameter from gcms function --- metaMS/gcmsWorkflow.py | 1 - wdl/metams_input_gcms.json | 18 +++++++++--------- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/metaMS/gcmsWorkflow.py b/metaMS/gcmsWorkflow.py index 3cf8ca0..49c8d7e 100644 --- a/metaMS/gcmsWorkflow.py +++ b/metaMS/gcmsWorkflow.py @@ -69,7 +69,6 @@ def worker(args): def run_gcms_metabolomics_workflow_wdl( file_paths, - calibration_reference_path, calibration_file_path, output_directory, output_filename, diff --git a/wdl/metams_input_gcms.json b/wdl/metams_input_gcms.json index 5c3ced9..cd6a3f9 100644 --- a/wdl/metams_input_gcms.json +++ b/wdl/metams_input_gcms.json @@ -1,14 +1,14 @@ { - "gcmsMetabolomics.runMetaMS.file_paths": [ + "gcmsMetabolomics.runMetaMSGCMS.file_paths": [ "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf" ], -"gcmsMetabolomics.runMetaMS.calibration_file_path": "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", -"gcmsMetabolomics.runMetaMS.output_directory": "test_output", -"gcmsMetabolomics.runMetaMS.output_filename": "test_dataset", -"gcmsMetabolomics.runMetaMS.output_type": "csv", -"gcmsMetabolomics.runMetaMS.corems_toml_path": "./configuration/gcms_corems.toml", -"gcmsMetabolomics.runMetaMS.nmdc_metadata_path": "./configuration/nmdc_metadata.json", -"gcmsMetabolomics.runMetaMS.metabref_token_path": "./configuration/metabref.token", -"gcmsMetabolomics.runMetaMS.jobs_count": 4 +"gcmsMetabolomics.runMetaMSGCMS.calibration_file_path": "./data/raw_data/GCMS_FAMES_01_GCMS-01_20191023.cdf", +"gcmsMetabolomics.runMetaMSGCMS.output_directory": "test_output", +"gcmsMetabolomics.runMetaMSGCMS.output_filename": "test_dataset", +"gcmsMetabolomics.runMetaMSGCMS.output_type": "csv", +"gcmsMetabolomics.runMetaMSGCMS.corems_toml_path": "./configuration/gcms_corems.toml", +"gcmsMetabolomics.runMetaMSGCMS.nmdc_metadata_path": "./configuration/nmdc_metadata.json", +"gcmsMetabolomics.runMetaMSGCMS.metabref_token_path": "./configuration/metabref.token", +"gcmsMetabolomics.runMetaMSGCMS.jobs_count": 4 } \ No newline at end of file