diff --git a/.gitignore b/.gitignore index 13134e4..78f7f33 100644 --- a/.gitignore +++ b/.gitignore @@ -35,4 +35,10 @@ wheels/ share/python-wheels/ *.egg-info/ .installed.cfg -*.egg \ No newline at end of file +*.egg + +# Raw thermo data +*.raw + +# Metabref token +*.token \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 481ad02..c3277f1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,5 +21,12 @@ COPY metaMS/ /metams/metaMS/ COPY README.md disclaimer.txt Makefile requirements.txt setup.py /metams/ COPY db/ /metams/db/ +#TODO KRH: Remove this section once the CoreMS package is available on PyPI and installable via the requirements.txt file +# Copy the CoreMS tar.gz file into the Docker image +COPY CoreMS-3.2.0.tar.gz /metams/ + +# Install the CoreMS package from the tar.gz file +RUN pip install /metams/CoreMS-3.2.0.tar.gz + # Install the MetaMS package in editable mode RUN pip3 install --editable . diff --git a/Dockerfile_test.txt b/Dockerfile_test.txt new file mode 100644 index 0000000..00369b8 --- /dev/null +++ b/Dockerfile_test.txt @@ -0,0 +1,26 @@ +FROM jcarr87/corems-base-py3.10 + +# set environment variables +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV PYTHONNET_RUNTIME=coreclr + +# set working directory +WORKDIR /metams + +# Set up the directory structure +COPY metaMS/ /metams/metaMS/ +COPY README.md disclaimer.txt Makefile requirements.txt setup.py /metams/ +COPY db/ /metams/db/ + +# install dependencies +RUN pip install --upgrade pip +RUN apt-get update && apt-get install -y libgomp1 + +# Install the requirements +#RUN pip install pythonnet +RUN pip install -r requirements.txt + +# Install the MetaMS package in editable mode +RUN pip install --editable . + diff --git a/Makefile b/Makefile index 7cb6bc0..9dea2ed 100644 --- a/Makefile +++ b/Makefile @@ -73,13 +73,23 @@ docker-build: docker build -t microbiomedata/metams:latest . +docker-build-local: + docker build -t local-metams:latest . + docker-run: @echo $(data_dir) @echo $(config_dir) @docker run -v $(data_dir):/metams/data -v $(config_dir):/metams/configuration microbiomedata/metams:latest metaMS run-gcms-workflow /metams/configuration/metams.toml -wdl-run : - miniwdl run wdl/metaMS.wdl -i wdl/metams_input.json --verbose --no-cache --copy-input-files +wdl-run-gcms : + + miniwdl run wdl/metaMS_gcms.wdl -i wdl/metams_input_gcms.json --verbose --no-cache --copy-input-files + +wdl-run-lipid : +#TODO KRH: remove the docker-build-local when the docker image is available in dockerhub and +# update the docker image in the wdl file. Good to rebuild for each run while in development + make docker-build-local + miniwdl run wdl/metaMS_lcmslipidomics.wdl -i wdl/metams_input_lipidomics.json --verbose --no-cache --copy-input-files convert_lipid_rst_to_md: # convert the lipid documentation from rst to md diff --git a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml index 222d6d0..9620f12 100644 --- a/configuration/lipid_configs/emsl_lipidomics_corems_params.toml +++ b/configuration/lipid_configs/emsl_lipidomics_corems_params.toml @@ -24,21 +24,63 @@ ms2_dda_mz_tolerance = 0.05 ms2_min_fe_score = 0.3 search_as_lipids = true include_fragment_types = true -ph_inten_min_rel = 0.05 -ph_persis_min_rel = 0.005 -ph_smooth_it = 0 export_profile_spectra = false -export_eics = true +export_eics = false export_unprocessed_ms1 = false +verbose_processing = false +ph_inten_min_rel = 0.01 +ph_persis_min_rel = 0.01 +ph_smooth_it = 0 -[MassSpectrum] -noise_threshold_method = "relative_abundance" +[mass_spectrum.ms1.molecular_search] +use_isotopologue_filter = false +isotopologue_filter_threshold = 33.0 +isotopologue_filter_atoms = [ "Cl", "Br",] +use_runtime_kendrick_filter = false +use_min_peaks_filter = true +min_peaks_per_class = 15 +url_database = "" +db_jobs = 1 +db_chunk_size = 300 +ion_charge = -1 +min_hc_filter = 0.3 +max_hc_filter = 3.0 +min_oc_filter = 0.0 +max_oc_filter = 1.2 +min_op_filter = 2.0 +use_pah_line_rule = false +min_dbe = 0.0 +max_dbe = 40.0 +mz_error_score_weight = 0.6 +isotopologue_score_weight = 0.4 +adduct_atoms_neg = [ "Cl", "Br",] +adduct_atoms_pos = [ "Na", "K",] +score_methods = [ "S_P_lowest_error", "N_S_P_lowest_error", "lowest_error", "prob_score", "air_filter_error", "water_filter_error", "earth_filter_error",] +score_method = "prob_score" +output_min_score = 0.3 +output_score_method = "All Candidates" +isRadical = false +isProtonated = true +isAdduct = false +ion_types_excluded = [] +ionization_type = "ESI" +min_ppm_error = -5.0 +max_ppm_error = 5.0 +min_abun_error = -50.0 +max_abun_error = 50.0 +mz_error_range = 1.5 +error_method = "None" +mz_error_average = 0.0 +verbose_processing = false + +[mass_spectrum.ms1.mass_spectrum] +noise_threshold_method = "log" noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",] noise_threshold_min_std = 6 noise_threshold_min_s2n = 4.0 -noise_threshold_min_relative_abundance = 1 +noise_threshold_min_relative_abundance = 0.01 noise_threshold_absolute_abundance = 1000000.0 -noise_threshold_log_nsigma = 6 +noise_threshold_log_nsigma = 8 noise_threshold_log_nsigma_corr_factor = 0.463 noise_threshold_log_nsigma_bins = 500 noise_min_mz = 0 @@ -51,9 +93,14 @@ calib_pol_order = 2 max_calib_ppm_error = 1.0 min_calib_ppm_error = -1.0 calib_sn_threshold = 2.0 +calibration_ref_match_method = "legacy" +calibration_ref_match_method_implemented = [ "legacy", "merged",] +calibration_ref_match_tolerance = 0.003 +calibration_ref_match_std_raw_error_limit = 1.5 do_calibration = true +verbose_processing = false -[MassSpecPeak] +[mass_spectrum.ms1.ms_peak] kendrick_rounding_method = "floor" implemented_kendrick_rounding_methods = [ "floor", "ceil", "round",] peak_derivative_threshold = 0.0 @@ -62,26 +109,27 @@ min_peak_datapoints = 5.0 peak_max_prominence_percent = 0.1 peak_height_max_percent = 10.0 legacy_resolving_power = false +legacy_centroid_polyfit = false -[MS1MolecularSearch] +[mass_spectrum.ms2.molecular_search] use_isotopologue_filter = false isotopologue_filter_threshold = 33.0 isotopologue_filter_atoms = [ "Cl", "Br",] use_runtime_kendrick_filter = false use_min_peaks_filter = true min_peaks_per_class = 15 -url_database = "" +url_database = "postgresql+psycopg2://coremsappdb:coremsapppnnl@molformdb:5432/coremsapp" db_jobs = 3 db_chunk_size = 300 ion_charge = -1 min_hc_filter = 0.3 max_hc_filter = 3.0 min_oc_filter = 0.0 -max_oc_filter = 5 +max_oc_filter = 1.2 min_op_filter = 2.0 use_pah_line_rule = false -min_dbe = 0 -max_dbe = 50 +min_dbe = 0.0 +max_dbe = 40.0 mz_error_score_weight = 0.6 isotopologue_score_weight = 0.4 adduct_atoms_neg = [ "Cl", "Br",] @@ -93,17 +141,63 @@ output_score_method = "All Candidates" isRadical = false isProtonated = true isAdduct = false -ion_types_excluded = [] +ion_types_excluded = [ "[M+HCOO]-",] ionization_type = "ESI" -min_ppm_error = -5 -max_ppm_error = 5 +min_ppm_error = -10.0 +max_ppm_error = 10.0 min_abun_error = -100.0 max_abun_error = 100.0 mz_error_range = 1.5 error_method = "None" mz_error_average = 0.0 -[MS2MolecularSearch] +[mass_spectrum.ms2.transient] +implemented_apodization_function = [ "Hamming", "Hanning", "Blackman", "Full-Sine", "Half-Sine", "Kaiser", "Half-Kaiser",] +apodization_method = "Hanning" +number_of_truncations = 0 +number_of_zero_fills = 1 +next_power_of_two = false +kaiser_beta = 8.6 + +[mass_spectrum.ms2.mass_spectrum] +noise_threshold_method = "relative_abundance" +noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",] +noise_threshold_min_std = 6 +noise_threshold_min_s2n = 4.0 +noise_threshold_min_relative_abundance = 0.01 +noise_threshold_absolute_abundance = 1000000.0 +noise_threshold_log_nsigma = 6 +noise_threshold_log_nsigma_corr_factor = 0.463 +noise_threshold_log_nsigma_bins = 500 +noise_min_mz = 0.0 +noise_max_mz = inf +min_picking_mz = 0.0 +max_picking_mz = inf +picking_point_extrapolate = 3 +calib_minimize_method = "Powell" +calib_pol_order = 2 +max_calib_ppm_error = 1.0 +min_calib_ppm_error = -1.0 +calib_sn_threshold = 2.0 +calibration_ref_match_method = "legacy" +calibration_ref_match_method_implemented = [ "legacy", "merged",] +calibration_ref_match_tolerance = 0.003 +calibration_ref_match_std_raw_error_limit = 1.5 +do_calibration = true +verbose_processing = false + +[mass_spectrum.ms2.ms_peak] +kendrick_rounding_method = "floor" +implemented_kendrick_rounding_methods = [ "floor", "ceil", "round",] +peak_derivative_threshold = 0.0 +peak_min_prominence_percent = 0.1 +min_peak_datapoints = 5.0 +peak_max_prominence_percent = 0.1 +peak_height_max_percent = 10.0 +legacy_resolving_power = false +legacy_centroid_polyfit = false + +[mass_spectrum.ms2_cid.molecular_search] use_isotopologue_filter = false isotopologue_filter_threshold = 33.0 isotopologue_filter_atoms = [ "Cl", "Br",] @@ -136,28 +230,170 @@ isAdduct = false ion_types_excluded = [ "[M+HCOO]-",] ionization_type = "ESI" min_ppm_error = -10.0 -max_ppm_error = 10.0 +max_ppm_error = 200 min_abun_error = -100.0 max_abun_error = 100.0 mz_error_range = 1.5 error_method = "None" mz_error_average = 0.0 -[MassSpecPeak.kendrick_base] +[mass_spectrum.ms2_cid.transient] +implemented_apodization_function = [ "Hamming", "Hanning", "Blackman", "Full-Sine", "Half-Sine", "Kaiser", "Half-Kaiser",] +apodization_method = "Hanning" +number_of_truncations = 0 +number_of_zero_fills = 1 +next_power_of_two = false +kaiser_beta = 8.6 + +[mass_spectrum.ms2_cid.mass_spectrum] +noise_threshold_method = "relative_abundance" +noise_threshold_methods_implemented = [ "minima", "signal_noise", "relative_abundance", "absolute_abundance", "log",] +noise_threshold_min_std = 6 +noise_threshold_min_s2n = 4.0 +noise_threshold_min_relative_abundance = 0.01 +noise_threshold_absolute_abundance = 1000000.0 +noise_threshold_log_nsigma = 6 +noise_threshold_log_nsigma_corr_factor = 0.463 +noise_threshold_log_nsigma_bins = 500 +noise_min_mz = 0.0 +noise_max_mz = inf +min_picking_mz = 0.0 +max_picking_mz = inf +picking_point_extrapolate = 3 +calib_minimize_method = "Powell" +calib_pol_order = 2 +max_calib_ppm_error = 1.0 +min_calib_ppm_error = -1.0 +calib_sn_threshold = 2.0 +calibration_ref_match_method = "legacy" +calibration_ref_match_method_implemented = [ "legacy", "merged",] +calibration_ref_match_tolerance = 0.003 +calibration_ref_match_std_raw_error_limit = 1.5 +do_calibration = true +verbose_processing = false + +[mass_spectrum.ms2_cid.ms_peak] +kendrick_rounding_method = "floor" +implemented_kendrick_rounding_methods = [ "floor", "ceil", "round",] +peak_derivative_threshold = 0.0 +peak_min_prominence_percent = 0.1 +min_peak_datapoints = 5.0 +peak_max_prominence_percent = 0.1 +peak_height_max_percent = 10.0 +legacy_resolving_power = false +legacy_centroid_polyfit = false + +[mass_spectrum.ms1.molecular_search.usedAtoms] +C = [ 11, 70,] +H = [ 19, 150,] +N = [ 0, 2,] +P = [ 0, 1,] +O = [ 1, 23,] +S = [ 0, 1,] +Na = [ 0, 1,] +Cl = [ 0, 1,] + +[mass_spectrum.ms1.molecular_search.used_atom_valences] +C = 4 +13C = 4 +N = 3 +O = 2 +S = 2 +H = 1 +F = 1 +Cl = 1 +Br = 1 +I = 1 +At = 1 +Li = 1 +Na = 1 +K = 1 +Rb = 1 +Cs = 1 +Fr = 1 +B = 4 +In = 3 +Al = 3 +P = 3 +Ga = 3 +Mg = 2 +Be = 2 +Ca = 2 +Sr = 2 +Ba = 2 +Ra = 2 +V = 5 +Fe = 3 +Si = 4 +Sc = 3 +Ti = 4 +Cr = 1 +Mn = 1 +Co = 1 +Ni = 1 +Cu = 2 +Zn = 2 +Ge = 4 +As = 5 +Se = 6 +Y = 3 +Zr = 4 +Nb = 5 +Mo = 6 +Tc = 7 +Ru = 8 +Rh = 6 +Pd = 4 +Ag = 0 +Cd = 2 +Sn = 4 +Sb = 5 +Te = 6 +La = 3 +Hf = 4 +Ta = 5 +W = 6 +Re = 4 +Os = 4 +Ir = 4 +Pt = 4 +Au = 3 +Hg = 1 +Tl = 3 +Pb = 4 +Bi = 3 +Po = 2 +Ac = 3 + +[mass_spectrum.ms1.ms_peak.kendrick_base] C = 1 H = 2 -[MS1MolecularSearch.usedAtoms] -C = [ 10, 100,] +[mass_spectrum.ms1.data_input.header_translate] +"m/z" = "m/z" +mOz = "m/z" +Mass = "m/z" +"Resolving Power" = "Resolving Power" +"Res." = "Resolving Power" +resolution = "Resolving Power" +Intensity = "Peak Height" +"Peak Height" = "Peak Height" +I = "Peak Height" +Abundance = "Peak Height" +abs_abu = "Peak Height" +"Signal/Noise" = "S/N" +"S/N" = "S/N" +sn = "S/N" + +[mass_spectrum.ms2.molecular_search.usedAtoms] +C = [ 10, 30,] H = [ 18, 200,] +O = [ 1, 23,] N = [ 0, 3,] P = [ 0, 1,] -O = [ 1, 23,] S = [ 0, 1,] -Na = [ 0, 1,] -Cl = [ 0, 1,] -[MS1MolecularSearch.used_atom_valences] +[mass_spectrum.ms2.molecular_search.used_atom_valences] C = 4 13C = 4 N = 3 @@ -229,11 +465,35 @@ Bi = 3 Po = 2 Ac = 3 -[MS2MolecularSearch.usedAtoms] -C = [ 1, 100,] -H = [ 1, 200,] +[mass_spectrum.ms2.ms_peak.kendrick_base] +C = 1 +H = 2 + +[mass_spectrum.ms2.data_input.header_translate] +"m/z" = "m/z" +mOz = "m/z" +Mass = "m/z" +"Resolving Power" = "Resolving Power" +"Res." = "Resolving Power" +resolution = "Resolving Power" +Intensity = "Peak Height" +"Peak Height" = "Peak Height" +I = "Peak Height" +Abundance = "Peak Height" +abs_abu = "Peak Height" +"Signal/Noise" = "S/N" +"S/N" = "S/N" +sn = "S/N" + +[mass_spectrum.ms2_cid.molecular_search.usedAtoms] +C = [ 10, 30,] +H = [ 18, 200,] +O = [ 1, 23,] +N = [ 0, 3,] +P = [ 0, 1,] +S = [ 0, 1,] -[MS2MolecularSearch.used_atom_valences] +[mass_spectrum.ms2_cid.molecular_search.used_atom_valences] C = 4 13C = 4 N = 3 @@ -304,3 +564,23 @@ Pb = 4 Bi = 3 Po = 2 Ac = 3 + +[mass_spectrum.ms2_cid.ms_peak.kendrick_base] +C = 1 +H = 2 + +[mass_spectrum.ms2_cid.data_input.header_translate] +"m/z" = "m/z" +mOz = "m/z" +Mass = "m/z" +"Resolving Power" = "Resolving Power" +"Res." = "Resolving Power" +resolution = "Resolving Power" +Intensity = "Peak Height" +"Peak Height" = "Peak Height" +I = "Peak Height" +Abundance = "Peak Height" +abs_abu = "Peak Height" +"Signal/Noise" = "S/N" +"S/N" = "S/N" +sn = "S/N" diff --git a/configuration/lipidomics_metams.toml b/configuration/lipidomics_metams.toml index 0b4e426..c7957f6 100644 --- a/configuration/lipidomics_metams.toml +++ b/configuration/lipidomics_metams.toml @@ -1,6 +1,9 @@ -directory = "data/raw_data/lipid_data" -output_directory = "output" -corems_toml_path = "configuration/lipid_configs/emsl_lipidomics_corems_params.toml" -metabref_token_path = "metabref.token" -scan_translator_path = "configuration/lipid_configs/emsl_lipidomics_scan_translator.toml" -cores = 1 \ No newline at end of file +file_paths = [ + "/Users/heal742/Library/CloudStorage/OneDrive-PNNL/Documents/_DMS_data/_NMDC/_blanchard_lipidomics/Blanch_Nat_Lip_H_34_AB_M_01_POS_23Jan18_Brandi-WCSH5801.raw", + "/Users/heal742/Library/CloudStorage/OneDrive-PNNL/Documents/_DMS_data/_NMDC/_blanchard_lipidomics/Blanch_Nat_Lip_H_34_AB_M_01_NEG_25Jan18_Brandi-WCSH5801.raw" +] +output_directory = "/Users/heal742/LOCAL/05_NMDC/02_MetaMS/data_processing/_blanchard_11_8ws97026/processed_20241218" +corems_toml_path = "/Users/heal742/LOCAL/05_NMDC/02_MetaMS/data_processing/configurations/emsl_lipidomics_corems_params.toml" +db_location = "/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite" +scan_translator_path = "/Users/heal742/LOCAL/05_NMDC/02_MetaMS/data_processing/configurations/emsl_lipidomics_scan_translator.toml" +cores = 4 \ No newline at end of file diff --git a/metaMS/cli.py b/metaMS/cli.py index e7d50cc..7bf3dfa 100644 --- a/metaMS/cli.py +++ b/metaMS/cli.py @@ -123,23 +123,6 @@ def dump_lipidomics_toml_template(toml_file_name): toml.dump(LipidomicsWorkflowParameters().__dict__, workflow_param) -@cli.command(name="dump-lipidomics-corems-toml-template") -@click.argument("toml_file_name", required=True, type=str) -def dump_lipidomics_corems_toml_template(toml_file_name): - """ - Writes a toml file with the CoreMS parameters to be used in the lipidomics workflow - - Parameters - ---------- - toml_file_name : str - The name of the toml file to write the parameters to - """ - path_obj = Path(toml_file_name).with_suffix(".toml") - print("dumping lipidomics corems toml template") - pass - # TODO KRH: add call for dumping lipidomics corems toml template from corems once we can import it - - @cli.command(name="run-lipidomics-workflow") @click.option( "-p", @@ -150,10 +133,10 @@ def dump_lipidomics_corems_toml_template(toml_file_name): ) @click.option( "-i", - "--directory", + "--file_paths", required=False, type=str, - help="The directory where the data is stored, all files in the directory will be processed", + help="The path to the directory with the input files", ) @click.option( "-o", @@ -163,7 +146,14 @@ def dump_lipidomics_corems_toml_template(toml_file_name): help="The directory where the output files will be stored", ) @click.option( - "-t", "--token_path", required=False, type=str, help="The path to the metabref token" + "-c", + "--corems_params", + required=False, + type=str, + help="The path corems parameters toml file", +) +@click.option( + "-d", "--db_location", required=False, type=str, help="The path to the local database" ) @click.option( "-s", "--scan_translator_path", required=False, type=str, help="The path to the scan translator file" @@ -171,17 +161,36 @@ def dump_lipidomics_corems_toml_template(toml_file_name): @click.option( "-j", "--cores", required=False, type=int, help="'cpu's to use for processing" ) -def run_lipidomics_workflow(paramaters_file, directory, output_directory, token_path, scan_translator_path, cores): +def run_lipidomics_workflow( + paramaters_file, + file_paths, + output_directory, + corems_params, + db_location, + scan_translator_path, + cores + ): """Run the lipidomics workflow Parameters ---------- paramaters_file : str The path to the toml file with the lipidomics workflow parameters - This file can be generated using the dump-lipidomics-toml-template command + file_paths : str + The paths to the input files, separated by commas as one string + output_directory : str + The directory where the output files will be stored + corems_params : str + The path corems parameters toml file + db_location : str + The path to the sqlite database for lipid spectra searching + scan_translator_path : str + The path to the scan translator file + cores : int + The number of cores to use for processing """ if paramaters_file is not None: - if cores is not None or directory is not None: + if cores is not None or file_paths is not None: click.echo("Using parameters file, ignoring other parameters") run_lcms_lipidomics_workflow( lipidomics_workflow_paramaters_file=paramaters_file @@ -189,18 +198,26 @@ def run_lipidomics_workflow(paramaters_file, directory, output_directory, token_ else: if cores is None: cores = 1 - if directory is None: - click.echo("No directory provided, no data to process") + if file_paths is None: + click.echo("No file paths provided, no data to process") return + if corems_params is None: + click.echo("No corems parameters provided") + if scan_translator_path is None: + click.echo("No scan translator provided") if output_directory is None: click.echo( "Must provide an output directory if not using a parameters file" ) return + if db_location is None: + click.echo("No database path provided") + return run_lcms_lipidomics_workflow( - directory=directory, output_directory=output_directory, cores=cores + file_paths=file_paths, + output_directory=output_directory, + corems_toml_path=corems_params, + db_location=db_location, + scan_translator_path=scan_translator_path, + cores=cores, ) - click.echo("Running lipidomics workflow") - # test call: - # MetaMS run-lipidomics-workflow -p configuration/lipidomics_metams.toml - # miniwdl run wdl/metaMS_lipidomics.wdl -i wdl/metams_input_lipidomics.json diff --git a/metaMS/lcms_lipidomics_workflow.py b/metaMS/lcms_lipidomics_workflow.py index 7511d31..bf93184 100644 --- a/metaMS/lcms_lipidomics_workflow.py +++ b/metaMS/lcms_lipidomics_workflow.py @@ -1,10 +1,28 @@ from dataclasses import dataclass import toml from pathlib import Path -import datetime from multiprocessing import Pool +import click +import warnings +import pandas as pd +import gc from corems.mass_spectra.input.mzml import MZMLSpectraParser +from corems.mass_spectra.input.corems_hdf5 import ReadCoreMSHDFMassSpectra +from corems.mass_spectra.output.export import LipidomicsExport +from corems.molecular_id.search.molecularFormulaSearch import SearchMolecularFormulasLC +from corems.encapsulation.input.parameter_from_json import ( + load_and_set_toml_parameters_lcms, +) + +from metaMS.lipid_metadata_prepper import get_lipid_library, _to_flashentropy + +# Suppress specific warning from the corems.mass_spectrum.input.massList module +warnings.filterwarnings( + "ignore", + message="No isotopologue matched the formula_dict: {formula_dict}", + module="corems.mass_spectrum.input.massList" +) @dataclass class LipidomicsWorkflowParameters: @@ -19,23 +37,49 @@ class LipidomicsWorkflowParameters: The directory where the output files will be stored corems_toml_path : str The path to the corems configuration file - metabref_token_path : str - The path to the metabref token file - See https://metabref.emsl.pnnl.gov/api for more information for how to get a token + db_location : str + The path to the local sqlite database used for searching lipid ms2 spectra scan_translator_path : str The path to the scan translator file, optional cores : int The number of cores to use for processing, optional """ - - directory: str = "data/..." - output_directory: str = "output/..." - corems_toml_path: str = "configuration/lipidomics_corems.toml" - metabref_token_path: str = None + file_paths: tuple = ('data/...', 'data/...') + output_directory: str = "output" + corems_toml_path: str = None + db_location: str = None scan_translator_path: str = None cores: int = 1 +def check_lipidomics_workflow_params(lipid_workflow_params): + # Check that corems_toml_path exists + if not Path(lipid_workflow_params.corems_toml_path).exists(): + raise FileNotFoundError("Corems toml file not found, exiting workflow") + + # Check that scan_translator_path exists + if not Path(lipid_workflow_params.scan_translator_path).exists(): + raise FileNotFoundError("Scan translator file not found, exiting workflow") + + # Check that output_directory exists + if not Path(lipid_workflow_params.output_directory).exists(): + raise FileNotFoundError("Output directory not found, exiting workflow") + + # Check that file_paths exist + for file_path in lipid_workflow_params.file_paths: + if not Path(file_path).exists(): + raise FileNotFoundError(f"File path {file_path} not found, exiting workflow") + + # Check that all file_paths end in .raw or .mzML + for file_path in lipid_workflow_params.file_paths: + if ".raw" not in file_path and ".mzML" not in file_path: + raise ValueError(f"File path {file_path} is not a .raw or .mzML file, exiting workflow") + + # Check that db_location exists + if lipid_workflow_params.db_location is not None: + if not Path(lipid_workflow_params.db_location).exists(): + raise FileNotFoundError("Database location not found, exiting workflow") + def instantiate_lcms_obj(file_in): """Instantiate a corems LCMS object from a binary file. Pull in ms1 spectra into dataframe (without storing as MassSpectrum objects to save memory) @@ -53,31 +97,401 @@ def instantiate_lcms_obj(file_in): """ # Instantiate parser based on binary file type if ".raw" in str(file_in): - #TODO KRH: Add real functionality here from corems.mass_spectra.input.rawFileReader import ImportMassSpectraThermoMSFileReader - #parser = ImportMassSpectraThermoMSFileReader(file_in) + parser = ImportMassSpectraThermoMSFileReader(file_in) if ".mzML" in str(file_in): - #parser = MZMLSpectraParser(file_in) - pass + parser = MZMLSpectraParser(file_in) # Instantiate lc-ms data object using parser and pull in ms1 spectra into dataframe (without storing as MassSpectrum objects to save memory) - #myLCMSobj = parser.get_lcms_obj(spectra="ms1") - myLCMSobj = None + myLCMSobj = parser.get_lcms_obj(spectra="ms1") return myLCMSobj +def set_params_on_lcms_obj(myLCMSobj, params_toml): + """Set parameters on the LCMS object + + Parameters + ---------- + myLCMSobj : corems LCMS object + LCMS object to set parameters on + params_toml : str or Path + Path to toml file with parameters + + Returns + ------- + None, sets parameters on the LCMS object + """ + # Load parameters from toml file + load_and_set_toml_parameters_lcms(myLCMSobj, params_toml) + + # If myLCMSobj is a positive mode, remove Cl from atoms used in molecular search + # This cuts down on the number of molecular formulas searched hugely + if myLCMSobj.polarity == "positive": + myLCMSobj.parameters.mass_spectrum["ms1"].molecular_search.usedAtoms.pop("Cl") + elif myLCMSobj.polarity == "negative": + myLCMSobj.parameters.mass_spectrum["ms1"].molecular_search.usedAtoms.pop("Na") + +def load_scan_translator(scan_translator=None): + """Translate scans using a scan translator + + Parameters + ---------- + scan_translator : str or Path + Path to scan translator yaml file + + Returns + ------- + scan_dict : dict + Dict with keys as parameter keys and values as lists of scans + """ + # Convert the scan translator to a dictionary + if scan_translator is None: + scan_translator_dict = {"ms2": {"scan_filter": "", "resolution": "high"}} + else: + # Convert the scan translator to a dictionary + if isinstance(scan_translator, str): + scan_translator = Path(scan_translator) + # read in the scan translator from toml + with open(scan_translator, "r") as f: + scan_translator_dict = toml.load(f) + for param_key in scan_translator_dict.keys(): + if scan_translator_dict[param_key]["scan_filter"] == "": + scan_translator_dict[param_key]["scan_filter"] = None + return scan_translator_dict + +def check_scan_translator(myLCMSobj, scan_translator): + """Check if scan translator is provided and that it maps correctly to scans and parameters""" + scan_translator_dict = load_scan_translator(scan_translator) + # Check that the scan translator maps correctly to scans and parameters + scan_df = myLCMSobj.scan_df + scans_pulled_out = [] + for param_key in scan_translator_dict.keys(): + assert param_key in myLCMSobj.parameters.mass_spectrum.keys() + assert "scan_filter" in scan_translator_dict[param_key].keys() + assert "resolution" in scan_translator_dict[param_key].keys() + # Pull out scans that match the scan filter + scan_df_sub = scan_df[ + scan_df.scan_text.str.contains( + scan_translator_dict[param_key]["scan_filter"] + ) + ] + scans_pulled_out.extend(scan_df_sub.scan.tolist()) + if len(scan_df_sub) == 0: + raise ValueError( + "No scans pulled out by scan translator for parameter key: ", + param_key, + " and scan filter: ", + scan_translator_dict[param_key]["scan_filter"], + ) + + # Check that the scans pulled out by the scan translator are not overlapping and assert error if they are + if len(set(scans_pulled_out)) != len(scans_pulled_out): + raise ValueError("Overlapping scans pulled out by scan translator") + +def add_mass_features(myLCMSobj, scan_translator): + """Process ms1 spectra and perform molecular search + + This includes peak picking, adding and processing associated ms1 spectra, + integration of mass features, annotation of c13 mass features, deconvolution of ms1 mass features, + and adding of peak shape metrics of mass features to the mass feature dataframe. + + Parameters + ---------- + myLCMSobj : corems LCMS object + LCMS object to process + scan_translator : str or Path + Path to scan translator yaml file + + Returns + ------- + None, processes the LCMS object + """ + # Process ms1 spectra + myLCMSobj.find_mass_features() + myLCMSobj.add_associated_ms1( + auto_process=True, use_parser=False, spectrum_mode="profile" + ) + myLCMSobj.integrate_mass_features(drop_if_fail=True) + myLCMSobj.find_c13_mass_features() + myLCMSobj.deconvolute_ms1_mass_features() + myLCMSobj.add_peak_metrics() + + # Add associated ms2 spectra to mass features + scan_dictionary = load_scan_translator(scan_translator=scan_translator) + for param_key in scan_dictionary.keys(): + scan_filter = scan_dictionary[param_key]["scan_filter"] + if scan_filter == "": + scan_filter = None + myLCMSobj.add_associated_ms2_dda( + spectrum_mode="centroid", ms_params_key=param_key, scan_filter=scan_filter + ) + +def molecular_formula_search(myLCMSobj): + """Perform molecular search on ms1 spectra + + Parameters + ---------- + myLCMSobj : corems LCMS object + LCMS object to process + + Returns + ------- + None, processes the LCMS object + """ + # Perform a molecular search on all of the mass features + mol_form_search = SearchMolecularFormulasLC(myLCMSobj) + mol_form_search.run_mass_feature_search() + +def export_results(myLCMSobj, out_path, molecular_metadata=None, final=False): + """Export results to hdf5 and csv as a lipid report + + Parameters + ---------- + myLCMSobj : corems LCMS object + LCMS object to process + out_path : str or Path + Path to output file + molecular_metadata : dict + Dict with molecular metadata + final : bool + Whether to export final results + + Returns + ------- + None, exports results to hdf5 and csv as a lipid report + """ + exporter = LipidomicsExport(out_path, myLCMSobj) + exporter.to_hdf(overwrite=True) + if final: + exporter.report_to_csv(molecular_metadata=molecular_metadata) + else: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + exporter.report_to_csv() + def run_lipid_sp_ms1(file_in, out_path, params_toml, scan_translator): - time_start = datetime.datetime.now() - myLCMSobj = instantiate_lcms_obj(file_in) - # TODO KRH: Add signal processing and ms1 molecular search here + myLCMSobj = instantiate_lcms_obj(file_in) + set_params_on_lcms_obj(myLCMSobj, params_toml) + check_scan_translator(myLCMSobj, scan_translator) + add_mass_features(myLCMSobj, scan_translator) + myLCMSobj.remove_unprocessed_data() + #Finally, perform molecular formula search on all ms1 spectra associated with mass features + molecular_formula_search(myLCMSobj) + export_results(myLCMSobj, out_path=out_path, final=False) + precursor_mz_list = list( + set( + [ + v.mz + for k, v in myLCMSobj.mass_features.items() + if len(v.ms2_scan_numbers) > 0 and v.isotopologue_type is None + ] + ) + ) + mz_dict = {myLCMSobj.polarity: precursor_mz_list} + return mz_dict + +def prep_metadata(mz_dicts, out_dir, db_location): + """Prepare metadata for ms2 spectral search + + Parameters + ---------- + mz_dicts : list of dicts + List of dicts with keys "positive" and "negative" and values of lists of precursor mzs + out_dir : Path + Path to output directory + db_location : str + Path to lipid database + + Returns + ------- + metadata : dict + Dict with keys "mzs", "fe", and "molecular_metadata" with values of dicts of precursor mzs (negative and positive), flash entropy search databases (negative and positive), and molecular metadata, respectively + + Notes + ------- + Also writes out files for the flash entropy search databases and molecular metadata + """ + metadata = { + "mzs": {"positive": None, "negative": None}, + "fe": {"positive": None, "negative": None}, + "molecular_metadata": {}, + } + for d in mz_dicts: + metadata["mzs"].update(d) + + print("Preparing negative lipid library") + + if metadata["mzs"]["negative"] is not None: + metabref_negative, lipidmetadata_negative = get_lipid_library( + db_location=db_location, + mz_list=metadata["mzs"]["negative"], + polarity="negative", + mz_tol_ppm=5, + format="flashentropy", + normalize=True, + fe_kwargs={ + "normalize_intensity": True, + "min_ms2_difference_in_da": 0.02, # for cleaning spectra + "max_ms2_tolerance_in_da": 0.01, # for setting search space + "max_indexed_mz": 3000, + "precursor_ions_removal_da": None, + "noise_threshold": 0, + }, + ) + metadata["fe"]["negative"] = metabref_negative + metadata["molecular_metadata"].update(lipidmetadata_negative) + fe_negative_df = pd.DataFrame.from_dict( + {k: v for k, v in enumerate(metadata["fe"]["negative"])}, orient="index" + ) + fe_negative_df.to_csv(out_dir / "ms2_db_negative.csv") + + print("Preparing positive lipid library") + if metadata["mzs"]["positive"] is not None: + metabref_positive, lipidmetadata_positive = get_lipid_library( + db_location=db_location, + mz_list=metadata["mzs"]["positive"], + polarity="positive", + mz_tol_ppm=5, + format="flashentropy", + normalize=True, + fe_kwargs={ + "normalize_intensity": True, + "min_ms2_difference_in_da": 0.02, # for cleaning spectra + "max_ms2_tolerance_in_da": 0.01, # for setting search space + "max_indexed_mz": 3000, + "precursor_ions_removal_da": None, + "noise_threshold": 0, + }, + ) + metadata["fe"]["positive"] = metabref_positive + metadata["molecular_metadata"].update(lipidmetadata_positive) + fe_positive_df = pd.DataFrame.from_dict( + {k: v for k, v in enumerate(metadata["fe"]["positive"])}, orient="index" + ) + fe_positive_df.to_csv(out_dir / "ms2_db_positive.csv") + + mol_metadata_df = pd.concat( + [ + pd.DataFrame.from_dict(v.__dict__, orient="index").transpose() + for k, v in metadata["molecular_metadata"].items() + ], + ignore_index=True, + ) + mol_metadata_df.to_csv(out_dir / "molecular_metadata.csv") + + return metadata + +def process_ms2(myLCMSobj, metadata, scan_translator): + """Process ms2 spectra and perform molecular search + + Parameters + ---------- + myLCMSobj : corems LCMS object + LCMS object to process + metadata : dict + Dict with keys "mzs", "fe", and "molecular_metadata" with values of dicts of precursor mzs (negative and positive), flash entropy search databases (negative and positive), and molecular metadata, respectively + + Returns + ------- + None, processes the LCMS object + """ + # Perform molecular search on ms2 spectra + # Grab fe from metatdata associated with polarity (this is inherently high resolution as its from a in-silico high res library) + fe_search = metadata["fe"][myLCMSobj.polarity] + + scan_dictionary = load_scan_translator(scan_translator) + ms2_scan_df = myLCMSobj.scan_df[myLCMSobj.scan_df.ms_level == 2] + + # Process high resolution MS2 scans + # Collect all high resolution MS2 scans using the scan translator + ms2_scans_oi_hr = [] + for param_key in scan_dictionary.keys(): + if scan_dictionary[param_key]["resolution"] == "high": + scan_filter = scan_dictionary[param_key]["scan_filter"] + if scan_filter is not None: + ms2_scan_df_hr = ms2_scan_df[ + ms2_scan_df.scan_text.str.contains(scan_filter) + ] + else: + ms2_scan_df_hr = ms2_scan_df + ms2_scans_oi_hr_i = [ + x for x in ms2_scan_df_hr.scan.tolist() if x in myLCMSobj._ms.keys() + ] + ms2_scans_oi_hr.extend(ms2_scans_oi_hr_i) + # Perform search on high res scans + if len(ms2_scans_oi_hr) > 0: + myLCMSobj.fe_search( + scan_list=ms2_scans_oi_hr, fe_lib=fe_search, peak_sep_da=0.01 + ) + + # Process low resolution MS2 scans + # Collect all low resolution MS2 scans using the scan translator + ms2_scans_oi_lr = [] + for param_key in scan_dictionary.keys(): + if scan_dictionary[param_key]["resolution"] == "low": + scan_filter = scan_dictionary[param_key]["scan_filter"] + if scan_filter is not None: + ms2_scan_df_lr = ms2_scan_df[ + ms2_scan_df.scan_text.str.contains(scan_filter) + ] + else: + ms2_scan_df_lr = ms2_scan_df + ms2_scans_oi_lri = [ + x for x in ms2_scan_df_lr.scan.tolist() if x in myLCMSobj._ms.keys() + ] + ms2_scans_oi_lr.extend(ms2_scans_oi_lri) + # Perform search on low res scans + if len(ms2_scans_oi_lr) > 0: + # Recast the flashentropy search database to low resolution + fe_search_lr = _to_flashentropy( + metabref_lib=fe_search, + normalize=True, + fe_kwargs={ + "normalize_intensity": True, + "min_ms2_difference_in_da": 0.4, + "max_ms2_tolerance_in_da": 0.2, + "max_indexed_mz": 3000, + "precursor_ions_removal_da": None, + "noise_threshold": 0, + }, + ) + myLCMSobj.fe_search( + scan_list=ms2_scans_oi_lr, fe_lib=fe_search_lr, peak_sep_da=0.3 + ) + +def run_lipid_ms2(out_path, metadata, scan_translator=None): + """Run ms2 spectral search and export final results + + Parameters + ---------- + out_path : str or Path + Path to output file + metadata : dict + Dict with keys "mzs", "fe", and "molecular_metadata" with values of dicts of precursor mzs (negative and positive), flash entropy search databases (negative and positive), and molecular metadata, respectively + + Returns + ------- + None, runs ms2 spectral search and exports final results + """ + # Read in the intermediate results + out_path_hdf5 = str(out_path) + ".corems/" + out_path.stem + ".hdf5" + # Catch known UserWarning from corems and ignore it + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + parser = ReadCoreMSHDFMassSpectra(out_path_hdf5) + myLCMSobj = parser.get_lcms_obj(load_raw=False) + + # Process ms2 spectra, perform spectral search, and export final results + process_ms2(myLCMSobj, metadata, scan_translator=scan_translator) + export_results(myLCMSobj, str(out_path), metadata["molecular_metadata"], final=True) def run_lcms_lipidomics_workflow( lipidomics_workflow_paramaters_file=None, - directory=None, + file_paths=None, output_directory=None, corems_toml_path=None, - metabref_token_path=None, + db_location=None, scan_translator_path=None, cores=None, ): @@ -87,26 +501,35 @@ def run_lcms_lipidomics_workflow( lipid_workflow_params = LipidomicsWorkflowParameters(**toml.load(infile)) else: lipid_workflow_params = LipidomicsWorkflowParameters( - directory=directory, + file_paths= file_paths.split(","), output_directory=output_directory, - metabref_token_path=metabref_token_path, + db_location=db_location, scan_translator_path=scan_translator_path, corems_toml_path=corems_toml_path, cores=cores, ) - # Make output dir and get list of files to process + # Make output dir out_dir = Path(lipid_workflow_params.output_directory) out_dir.mkdir(parents=True, exist_ok=True) - file_dir = Path(lipid_workflow_params.directory) - files_list = list(file_dir.glob("*.raw")) + # Check that all parameters are valid and exist + check_lipidomics_workflow_params(lipid_workflow_params) + + # Organize input and output paths + file_paths = [Path(file_path) for file_path in lipid_workflow_params.file_paths] + files_list = list(file_paths) out_paths_list = [out_dir / f.stem for f in files_list] - - # Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export intermediate results + + # Set the workflow parameters cores = lipid_workflow_params.cores params_toml = lipid_workflow_params.corems_toml_path scan_translator = lipid_workflow_params.scan_translator_path + + click.echo("Starting lipidomics workflow for " + str(len(files_list)) + " file(s), using " + str(cores) + " core(s)") + gc.collect() + + # Run signal processing, get associated ms1, add associated ms2, do ms1 molecular search, and export intermediate results if cores == 1 or len(files_list) == 1: mz_dicts = [] for file_in, file_out in list(zip(files_list, out_paths_list)): @@ -118,19 +541,40 @@ def run_lcms_lipidomics_workflow( ) mz_dicts.append(mz_dict) elif cores > 1: - pool = Pool(cores) - args = [ - ( - str(file_in), - str(file_out), - params_toml, - scan_translator, + with Pool(cores) as pool: + args = [ + ( + str(file_in), + str(file_out), + params_toml, + scan_translator, + ) + for file_in, file_out in zip(files_list, out_paths_list) + ] + mz_dicts = pool.starmap(run_lipid_sp_ms1, args) + gc.collect() + + # Prepare metadata for searching + click.echo("Preparing metadata for ms2 spectral search") + metadata = prep_metadata(mz_dicts, out_dir, lipid_workflow_params.db_location) + del mz_dicts + gc.collect() + + # Run ms2 spectral search and export final results + click.echo("Starting ms2 spectral search and exporting final results") + if cores == 1 or len(files_list) == 1: + for file_out in out_paths_list: + run_lipid_ms2( + file_out, metadata, scan_translator=scan_translator ) - for file_in, file_out in list(zip(files_list, out_paths_list)) - ] - mz_dicts = pool.starmap(run_lipid_sp_ms1, args) - pool.close() - pool.join() - print("Finished processing all files") + elif cores > 1: + with Pool(cores) as pool: + args = [(file_out, metadata, scan_translator) for file_out in out_paths_list] + pool.starmap(run_lipid_ms2, args) + + gc.collect() - # TODO KRH: Add full lipidomics workflow here +if __name__ == "__main__": + run_lcms_lipidomics_workflow( + lipidomics_workflow_paramaters_file="configuration/lipidomics_metams.toml" + ) \ No newline at end of file diff --git a/metaMS/lipid_metadata_prepper.py b/metaMS/lipid_metadata_prepper.py new file mode 100644 index 0000000..b33d07a --- /dev/null +++ b/metaMS/lipid_metadata_prepper.py @@ -0,0 +1,257 @@ +import pandas as pd +import numpy as np +import sqlite3 +import re +from ms_entropy import FlashEntropySearch +from corems.molecular_id.factory.lipid_molecular_metadata import LipidMetadata + + +def find_closest(A, target): + """Find the index of closest value in A to each value in target. + + Parameters + ---------- + A : :obj:`~numpy.array` + The array to search (blueprint). A must be sorted. + target : :obj:`~numpy.array` + The array of values to search for. target must be sorted. + + Returns + ------- + :obj:`~numpy.array` + The indices of the closest values in A to each value in target. + """ + idx = A.searchsorted(target) + idx = np.clip(idx, 1, len(A) - 1) + left = A[idx - 1] + right = A[idx] + idx -= target - left < right - target + return idx + +def spectrum_to_array(spectrum, normalize=True): + """ + Convert MetabRef-formatted spectrum to array. + + Parameters + ---------- + spectrum : str + MetabRef spectrum, i.e. list of (m/z,abundance) pairs. + normalize : bool + Normalize the spectrum by its magnitude. + + Returns + ------- + :obj:`~numpy.array` + Array of shape (N, 2), with m/z in the first column and abundance in + the second. + + """ + + # Convert parenthesis-delimited string to array + arr = np.array( + re.findall(r"\(([^,]+),([^)]+)\)", spectrum), dtype=float + ).reshape(-1, 2) + + # Normalize the array + if normalize: + arr[:, -1] = arr[:, -1] / arr[:, -1].sum() + + return arr + +def _to_flashentropy(metabref_lib, normalize=True, fe_kwargs={}): + """ + Convert metabref-formatted library to FlashEntropy library. + + Parameters + ---------- + metabref_lib : dict + MetabRef MS2 library in JSON format or FlashEntropy search instance (for reformatting at different MS2 separation). + normalize : bool + Normalize each spectrum by its magnitude. + fe_kwargs : dict, optional + Keyword arguments for instantiation of FlashEntropy search and building index for FlashEntropy search; + any keys not recognized will be ignored. By default, all parameters set to defaults. + + Returns + ------- + :obj:`~ms_entropy.FlashEntropySearch` + MS2 library as FlashEntropy search instance. + + Raises + ------ + ValueError + If "min_ms2_difference_in_da" or "max_ms2_tolerance_in_da" are present in `fe_kwargs` and they are not equal. + + """ + # If "min_ms2_difference_in_da" in fe_kwargs, check that "max_ms2_tolerance_in_da" is also present and that min_ms2_difference_in_da = 2xmax_ms2_tolerance_in_da + if ( + "min_ms2_difference_in_da" in fe_kwargs + or "max_ms2_tolerance_in_da" in fe_kwargs + ): + if ( + "min_ms2_difference_in_da" not in fe_kwargs + or "max_ms2_tolerance_in_da" not in fe_kwargs + ): + raise ValueError( + "Both 'min_ms2_difference_in_da' and 'max_ms2_tolerance_in_da' must be specified." + ) + if ( + fe_kwargs["min_ms2_difference_in_da"] + != 2 * fe_kwargs["max_ms2_tolerance_in_da"] + ): + raise ValueError( + "The values of 'min_ms2_difference_in_da' must be exactly 2x 'max_ms2_tolerance_in_da'." + ) + + # Initialize empty library + fe_lib = [] + + # Enumerate spectra + for i, source in enumerate(metabref_lib): + # Reorganize source dict, if necessary + if "spectrum_data" in source.keys(): + spectrum = source["spectrum_data"] + else: + spectrum = source + + # Rename precursor_mz key for FlashEntropy + if "precursor_mz" not in spectrum.keys(): + spectrum["precursor_mz"] = spectrum.pop("precursor_ion") + + # Convert CoreMS spectrum to array and clean, store as `peaks` + spectrum["peaks"] = spectrum_to_array( + spectrum["mz"], normalize=normalize + ) + + # Cast "fragment_types" to a list (if present and not already a list) + if "fragment_types" in spectrum.keys(): + if not isinstance(spectrum["fragment_types"], list): + spectrum["fragment_types"] = spectrum["fragment_types"].split(",") + + # Add spectrum to library + fe_lib.append(spectrum) + + # Initialize FlashEntropy + fe_init_kws = [ + "max_ms2_tolerance_in_da", + "mz_index_step", + "low_memory", + "path_data", + ] + fe_init_kws = {k: v for k, v in fe_kwargs.items() if k in fe_init_kws} + fes = FlashEntropySearch(**fe_init_kws) + + # Build FlashEntropy index + fe_index_kws = [ + "max_indexed_mz", + "precursor_ions_removal_da", + "noise_threshold", + "min_ms2_difference_in_da", + "max_peak_num", + ] + fe_index_kws = {k: v for k, v in fe_kwargs.items() if k in fe_index_kws} + fes.build_index(fe_lib, **fe_index_kws, clean_spectra=True) + + return fes + +def _dict_to_dataclass(metabref_lib, data_class): + """ + Convert dictionary to dataclass. + + Notes + ----- + This function will pull the attributes a dataclass and its parent class + and convert the dictionary to a dataclass instance with the appropriate + attributes. + + Parameters + ---------- + data_class : :obj:`~dataclasses.dataclass` + Dataclass to convert to. + metabref_lib : dict + Metabref dictionary object to convert to dataclass. + + Returns + ------- + :obj:`~dataclasses.dataclass` + Dataclass instance. + + """ + + # Get list of expected attributes of data_class + data_class_keys = list(data_class.__annotations__.keys()) + + # Does the data_class inherit from another class, if so, get the attributes of the parent class as well + if len(data_class.__mro__) > 2: + parent_class_keys = list(data_class.__bases__[0].__annotations__.keys()) + data_class_keys = list(set(data_class_keys + parent_class_keys)) + + # Remove keys that are not in the data_class from the input dictionary + input_dict = {k: v for k, v in metabref_lib.items() if k in data_class_keys} + + # Add keys that are in the data class but not in the input dictionary as None + for key in data_class_keys: + if key not in input_dict.keys(): + input_dict[key] = None + return data_class(**input_dict) + +def get_lipid_library( + db_location, + mz_list, + polarity, + mz_tol_ppm, + format='flashentropy', + normalize=True, + fe_kwargs={}, +): + + # prepare the mz_list for searching against the database + mz_list = pd.DataFrame(mz_list, columns=['mz_obs']) + mz_list = mz_list.sort_values(by='mz_obs') + mz_list = mz_list.reset_index(drop=True) + mz_obs_arr = mz_list['mz_obs'].values + + # connect to the database + conn = sqlite3.connect(db_location) + + # read in lipidMassSpectrumObject, get only id, polarity, and precursor_mz + mz_all = pd.read_sql_query("SELECT id, polarity, precursor_mz FROM lipidMassSpectrumObject", conn) + mz_all = mz_all.sort_values(by='precursor_mz') + + # filter by polarity and if there are any matches within mz_tol_ppm + mz_subset = mz_all[mz_all['polarity'] == polarity].copy() + mz_subset = mz_subset.sort_values(by='precursor_mz') + mz_subset = mz_subset.reset_index(drop=True) + mz_subset['closest_mz_obs'] = mz_obs_arr[ + find_closest(mz_obs_arr, mz_subset.precursor_mz.values) + ] + mz_subset['ppm_error'] = (mz_subset['precursor_mz'] - mz_subset['closest_mz_obs']) / mz_subset['precursor_mz'] * 1e6 + mz_subset = mz_subset[np.abs(mz_subset['ppm_error']) <= mz_tol_ppm] + + # get the full lipidMassSpectrumObject table for the filtered ms2 ids + mz_subset_ids = mz_subset['id'].tolist() + mz_subset_ids = tuple(mz_subset_ids) + mz_subset_full = pd.read_sql_query(f"SELECT * FROM lipidMassSpectrumObject WHERE id IN {mz_subset_ids}", conn) + + # get the lipid tree for the filtered molecular ids + mol_ids = mz_subset_full['molecular_data_id'].tolist() + mol_ids = tuple(mol_ids) + lipid_tree = pd.read_sql_query(f"SELECT * FROM lipidTree WHERE id IN {mol_ids}", conn) + + # convert molecular data to dictionary of LipidMetadata objects, with mol_id as key + lipid_tree['id_index'] = lipid_tree['id'] + lipid_tree = lipid_tree.set_index('id_index') + lipid_tree = lipid_tree.to_dict(orient='index') + lipid_metadata = { + k: _dict_to_dataclass(v, LipidMetadata) + for k, v in lipid_tree.items() + } + + # convert ms2 data to flashentropy library + mz_subset_full_dict = mz_subset_full.to_dict(orient='records') + fe_lib = _to_flashentropy(mz_subset_full_dict, normalize=normalize, fe_kwargs=fe_kwargs) + + # close the connection + conn.close() + + return fe_lib, lipid_metadata \ No newline at end of file diff --git a/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py b/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py index 3b4f862..1b8f357 100644 --- a/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py +++ b/metaMS/nmdc_lipidomics_metadata_generation/api_info_retriever.py @@ -147,18 +147,20 @@ def check_if_ids_exist(self, ids: list) -> bool: If there's an error in making the API request. """ ids_test = list(set(ids)) - ids_test = [id.replace('"', "'") for id in ids_test] - ids_test_str = ", ".join(f'"{id}"' for id in ids_test) - filter_param = f'{{"id": {{"$in": [{ids_test_str}]}}}}' - og_url = f"{self.base_url}/nmdcschema/{self.collection_name}?&filter={filter_param}&projection=id" - - try: - resp = requests.get(og_url) - resp.raise_for_status() # Raises an HTTPError for bad responses - data = resp.json() - if len(data["resources"]) != len(ids_test): - return False - except requests.RequestException as e: - raise requests.RequestException(f"Error making API request: {e}") + for id in ids_test: + filter_param = f'{{"id": "{id}"}}' + field = "id" + + og_url = f"{self.base_url}/nmdcschema/{self.collection_name}?&filter={filter_param}&projection={field}" + + try: + resp = requests.get(og_url) + resp.raise_for_status() # Raises an HTTPError for bad responses + data = resp.json() + if len(data["resources"]) == 0: + print(f"ID {id} not found") + return False + except requests.RequestException as e: + raise requests.RequestException(f"Error making API request: {e}") return True diff --git a/requirements.txt b/requirements.txt index 8a2bc52..66e74e9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ -corems>=3.0.2 +corems==3.2.0 Click>=7.1.1 -requests -nmdc-schema>=7.0.0 \ No newline at end of file +requests \ No newline at end of file diff --git a/wdl/metaMS_lcmslipidomics.wdl b/wdl/metaMS_lcmslipidomics.wdl new file mode 100644 index 0000000..a5d4bba --- /dev/null +++ b/wdl/metaMS_lcmslipidomics.wdl @@ -0,0 +1,41 @@ +version 1.0 + +workflow lcmsLipidomics { + call runMetaMSLCMSLipidomics + + output { + String out = runMetaMSLCMSLipidomics.out + Array[File] output_files = runMetaMSLCMSLipidomics.output_files + } +} + +task runMetaMSLCMSLipidomics { + input { + Array[File] file_paths + String output_directory + File corems_toml_path + File db_location + File scan_translator_path + Int cores + } + + command { + metaMS run-lipidomics-workflow \ + -i ${sep=',' file_paths} \ + -o ${output_directory} \ + -c ${corems_toml_path} \ + -d ${db_location} \ + -s ${scan_translator_path} \ + -j ${cores} + } + + output { + String out = read_string(stdout()) + Array[File] output_files = glob('${output_directory}/*') + } + + runtime { + docker: "local-metams:latest" + #TODO KRH: Update to pushed version when available + } +} \ No newline at end of file diff --git a/wdl/metaMS_lipidomics.wdl b/wdl/metaMS_lipidomics.wdl deleted file mode 100644 index dea4d11..0000000 --- a/wdl/metaMS_lipidomics.wdl +++ /dev/null @@ -1,20 +0,0 @@ -version 1.0 - -workflow lcmsLipidomics { - call runLipidomicsMetaMS -} - -task runLipidomicsMetaMS { - input { - File config_file - } - - command { - metaMS run-lipidomics-workflow -p ${config_file} - } - - runtime { - docker: "local-metams:latest" - #TODO KRH: Change to dockerhub version after we've pushed the updated image - } -} \ No newline at end of file diff --git a/wdl/metams_input_lipidomics.json b/wdl/metams_input_lipidomics.json index 056f702..c9d9796 100644 --- a/wdl/metams_input_lipidomics.json +++ b/wdl/metams_input_lipidomics.json @@ -1,3 +1,11 @@ { - "lcmsLipidomics.runLipidomicsMetaMS.config_file": "./configuration/lipidomics_metams.toml" + "lcmsLipidomics.runMetaMSLCMSLipidomics.file_paths": [ + "/Users/heal742/Library/CloudStorage/OneDrive-PNNL/Documents/_DMS_data/_NMDC/_blanchard_lipidomics/Blanch_Nat_Lip_H_34_AB_M_01_POS_23Jan18_Brandi-WCSH5801.raw", + "/Users/heal742/Library/CloudStorage/OneDrive-PNNL/Documents/_DMS_data/_NMDC/_blanchard_lipidomics/Blanch_Nat_Lip_H_34_AB_M_01_NEG_25Jan18_Brandi-WCSH5801.raw" + ], + "lcmsLipidomics.runMetaMSLCMSLipidomics.output_directory": "output", + "lcmsLipidomics.runMetaMSLCMSLipidomics.corems_toml_path": "/Users/heal742/LOCAL/05_NMDC/02_MetaMS/data_processing/configurations/emsl_lipidomics_corems_params.toml", + "lcmsLipidomics.runMetaMSLCMSLipidomics.db_location": "/Users/heal742/LOCAL/05_NMDC/00_Lipid_Databse/lipid_db/lipid_ref.sqlite", + "lcmsLipidomics.runMetaMSLCMSLipidomics.scan_translator_path": "./configuration/lipid_configs/emsl_lipidomics_scan_translator.toml", + "lcmsLipidomics.runMetaMSLCMSLipidomics.cores": 1 } \ No newline at end of file