From d7668102e31a1597b5d3a9290ad7dd118b5ed768 Mon Sep 17 00:00:00 2001 From: unknown Date: Sat, 3 Feb 2024 16:31:40 +0000 Subject: [PATCH] Some docstrings for peak-picking utilities. --- vimms/PeakPicking.py | 104 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/vimms/PeakPicking.py b/vimms/PeakPicking.py index 4dcbdbd..7712aa9 100644 --- a/vimms/PeakPicking.py +++ b/vimms/PeakPicking.py @@ -41,6 +41,20 @@ def report_boxes(cls, output_path): @dataclass class MZMineParams(AbstractParams): + """ + Wrapper class to run MZMine 2 peak-picking from the ViMMS codebase. + MZMine 2 allows commands for its processing pipeline to be stored in an .xml + and then run via command line using its "batch mode" executable. Given an + appropriate "template" .xml this class will substitute input and output file + names into it and then run it in batch mode via subprocess. + + NOTE: MZMine is not installed with ViMMS. It must be installed separately + and the path to the "batch mode" executable specified for this class. + + Args: + mzmine_template: Path to .xml template giving batch commands. + mzmine_exe: Path to batch mode executable. + """ method_name = "MZMine" RT_FACTOR = 60 #minutes @@ -78,6 +92,19 @@ def _make_batch_file(self, input_files, output_dir, output_name, output_path): return new_xml def pick_aligned_peaks(self, input_files, output_dir, output_name, force=False): + """ + Run MZMine batch mode file for a list of input files. + + Args: + input_files: Iterable of paths to input files. + output_dir: Directory to write output to. + output_name: Name for output file. Some text and the file extension + are added automatically. + force: When False, don't run peak-picking if a file already exists + at the output destination. + + Returns: Full path the output file was written to. 
+ """ input_files = list(set(input_files)) #filter duplicates output_path = self.format_output_path(output_dir, output_name) @@ -91,6 +118,22 @@ def pick_aligned_peaks(self, input_files, output_dir, output_name, force=False): @staticmethod def check_files_match(fullscan_names, aligned_path, mode="subset"): + """ + Check that the source files listed in the header of a peak-picking + output match an input list. + + Args: + fullscan_names: List of .mzml files (or paths to them) to look + for in the header of the aligned file. + aligned_path: Full filepath to the aligned file. + mode: "subset" just checks if all fullscan_names can be found in + the header. "exact" checks whether or not the two sets of + names exactly match. + + Returns: Tuple of boolean reporting whether test succeeded, the + names of the fullscans given as input, and the names of files + found in the header. + """ fs_names = {os.path.basename(fs) for fs in fullscan_names} mzmine_names = set() @@ -116,6 +159,23 @@ def check_files_match(fullscan_names, aligned_path, mode="subset"): @staticmethod def read_aligned_csv(box_file_path): + """ + Parse in an aligned boxfile in MZMine 2 format. Each column + in an aligned boxfile either has properties related to the whole + row (e.g. average m/z of the peak aligned on that row) or a property + specific to an unaligned peak from a parent .mzML. Row + properties are parsed into a list of dictionaries (one dictionary + per row) in the form [{property_name: value}, ...]. .mzML properties + are loaded into a similar list but with a nested dictionary + i.e. [{mzml_name: {property_name: value}}, ...]. + + Args: + box_file_path: Full path to the aligned boxfile. + + Returns: Tuple of .mzML names and iterable of pairs of row dicts + and .mzML dicts. 
+ """ + row_headers = [ "row ID", "row m/z", @@ -156,6 +216,10 @@ def pick_aligned_peaks(input_files, mzmine_template, mzmine_exe, force=False): + """ + Convenience function (for backwards compatibility) which picks + peaks using MZMineParams. + """ params = MZMineParams(mzmine_template, mzmine_exe) return params.pick_aligned_peaks(input_files, output_dir, output_name, force=force) @@ -163,6 +227,26 @@ def pick_aligned_peaks(input_files, @dataclass class XCMSScriptParams(AbstractParams): + """ + Wrapper class to run XCMS scripts written in R from ViMMS. The R script + is run via subprocess and is given all arguments specified in the object + instance as command-line arguments - the R script must handle any that + are not None. XCMS does not natively write out aligned peaks so methods + for reading output files assume they were written in the same format as + MZMineParams. + + NOTE: R and XCMS are not installed with ViMMS. They must be installed + separately and the paths to both the Rscript utility and the XCMS + script to run must be specified for this class. + + Args: + xcms_r_script: Path to the XCMS script written in R which should + be run. + rscript_exe: Path to the "Rscript" utility packaged with R. By + default assumes it can be found via the "Rscript" environment + variable. + others: See xcms documentation for details. + """ #TODO: It would be good to just call the R functions from Python #instead of calling an external R script... @@ -191,6 +275,20 @@ class XCMSScriptParams(AbstractParams): kNN: int = None def pick_aligned_peaks(self, input_files, output_dir, output_name, force=False): + """ + Run XCMS script for a list of input files. + + Args: + input_files: Iterable of paths to input files. + output_dir: Directory to write output to. + output_name: Name for output file. Some text and the file extension + are added automatically. + force: When False, don't run peak-picking if a file already exists + at the output destination. 
+ + Returns: Full path the output file was written to. + """ + input_files = list(set(input_files)) #filter duplicates output_path = self.format_output_path(output_dir, output_name) @@ -214,8 +312,14 @@ def pick_aligned_peaks(self, input_files, output_dir, output_name, force=False): @staticmethod def check_files_match(fullscan_names, aligned_path, mode="subset"): + """ + Wrapper to MZMineParams' "check_files_match". + """ return MZMineParams.check_files_match(fullscan_names, aligned_path, mode=mode) @staticmethod def read_aligned_csv(box_file): + """ + Wrapper to MZMineParams' "read_aligned_csv". + """ return MZMineParams.read_aligned_csv(box_file) \ No newline at end of file