From 502b31c354c5034af6d8f8757d55cea2341c566e Mon Sep 17 00:00:00 2001 From: RalfG Date: Mon, 18 Sep 2023 13:41:12 +0200 Subject: [PATCH] Move parsing of PSMs to separate module --- ms2rescore/core.py | 152 ++++++++++------------------------ ms2rescore/parse_psms.py | 119 ++++++++++++++++++++++++++ ms2rescore/report/generate.py | 14 ++-- 3 files changed, 174 insertions(+), 111 deletions(-) create mode 100644 ms2rescore/parse_psms.py diff --git a/ms2rescore/core.py b/ms2rescore/core.py index 38940d84..0f1bd194 100644 --- a/ms2rescore/core.py +++ b/ms2rescore/core.py @@ -1,30 +1,29 @@ import json import logging -import re from multiprocessing import cpu_count -from typing import Dict +from typing import Dict, Optional import psm_utils.io from psm_utils import PSMList -from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError from ms2rescore.feature_generators import FEATURE_GENERATORS +from ms2rescore.parse_psms import parse_psms from ms2rescore.report import generate from ms2rescore.rescoring_engines import mokapot, percolator logger = logging.getLogger(__name__) -id_file_parser = None - -def rescore(configuration: Dict, psm_list: PSMList = None) -> None: +def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None: """ Run full MSĀ²Rescore workflow with passed configuration. Parameters ---------- configuration - Dictionary containing general ms2rescore configuration. + Dictionary containing ms2rescore configuration. + psm_list + PSMList object containing PSMs. If None, PSMs will be read from configuration ``psm_file``. """ config = configuration["ms2rescore"] # TODO: Remove top-level key? @@ -36,91 +35,32 @@ def rescore(configuration: Dict, psm_list: PSMList = None) -> None: logger.debug("Using %i of %i available CPUs.", int(config["processes"]), int(cpu_count())) - # Read PSMs - logger.info("Reading PSMs...") - if not psm_list: - try: - psm_list = psm_utils.io.read_file( - config["psm_file"], - filetype=config["psm_file_type"], - show_progressbar=True, - **config["psm_reader_kwargs"], - ) - except psm_utils.io.PSMUtilsIOException: - raise MS2RescoreConfigurationError( - "Error occurred while reading PSMs. Please check the `psm_file` and " - "`psm_file_type` settings. See " - "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/" - " for more information." - ) - - logger.debug("Finding decoys...") - if config["id_decoy_pattern"]: - psm_list.find_decoys(config["id_decoy_pattern"]) - n_psms = len(psm_list) - percent_decoys = sum(psm_list["is_decoy"]) / n_psms * 100 - logger.info(f"Found {n_psms} PSMs, of which {percent_decoys:.2f}% are decoys.") - if not any(psm_list["is_decoy"]): - raise MS2RescoreConfigurationError( - "No decoy PSMs found. Please check if decoys are present in the PSM file and that " - "the `id_decoy_pattern` option is correct. See " - "https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms" - " for more information." - ) - - # Calculate q-values if not present - if None in psm_list["qvalue"]: - logger.debug("Recalculating q-values...") - psm_list.calculate_qvalues(reverse=not config["lower_score_is_better"]) - - # Check #PSMs identified before rescoring - id_psms_before = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712 - ).sum() - logger.info("Found %i identified PSMs at 1%% FDR before rescoring.", id_psms_before) - - # Store scoring values for comparison later - for psm in psm_list: - psm.provenance_data.update( - { - "before_rescoring_score": psm.score, - "before_rescoring_qvalue": psm.qvalue, - "before_rescoring_pep": psm.pep, - "before_rescoring_rank": psm.rank, - } - ) - - logger.debug("Parsing modifications...") - psm_list.rename_modifications(config["modification_mapping"]) - psm_list.add_fixed_modifications(config["fixed_modifications"]) - psm_list.apply_fixed_modifications() + # Parse PSMs + psm_list = parse_psms(config, psm_list, output_file_root) - logger.debug("Applying `psm_id_pattern`...") - if config["psm_id_pattern"]: - pattern = re.compile(config["psm_id_pattern"]) - new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] - psm_list["spectrum_id"] = new_ids + # Log #PSMs identified before rescoring + id_psms_before = _log_id_psms_before(psm_list) - # TODO: Temporary fix until implemented in psm_utils - # Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str) - psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]] - - # Add rescoring features + # Define feature names; get existing feature names from PSM file feature_names = dict() psm_list_feature_names = { feature_name for psm_list_features in psm_list["rescoring_features"] for feature_name in psm_list_features.keys() } - + feature_names["psm_file"] = psm_list_feature_names logger.debug( f"PSMs already contain the following rescoring features: {psm_list_feature_names}" ) - feature_names["psm_file"] = psm_list_feature_names + # Add rescoring features for fgen_name, fgen_config in config["feature_generators"].items(): - # TODO: Handle this somewhere else, more generally? Warning required? + # TODO: Handle this somewhere else, more generally? if fgen_name == "maxquant" and not (psm_list["source"] == "msms").all(): + logger.warning( + "MaxQuant feature generator requires PSMs from a MaxQuant msms.txt file. Skipping " + "this feature generator." + ) continue conf = config.copy() conf.update(fgen_config) @@ -146,8 +86,8 @@ def rescore(configuration: Dict, psm_list: PSMList = None) -> None: f"Removed {psms_with_features.count(False)} PSMs that were missing one or more " f"rescoring feature(s), {missing_features}." ) - psm_list = psm_list[psms_with_features] + # Write feature names to file _write_feature_names(feature_names, output_file_root) @@ -186,29 +126,17 @@ def rescore(configuration: Dict, psm_list: PSMList = None) -> None: else: logger.info("No known rescoring engine specified. Skipping rescoring.") - # Compare results - id_psms_after = ( - (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712 - ).sum() - diff = id_psms_after - id_psms_before - if id_psms_before > 0: - diff_perc = diff / id_psms_before - logger.info(f"Identified {diff} ({diff_perc:.2%}) more PSMs at 1% FDR after rescoring.") - else: - logger.info(f"Identified {diff} more PSMs at 1% FDR after rescoring.") + _log_id_psms_after(psm_list, id_psms_before) # Write output logger.info(f"Writing output to {output_file_root}.psms.tsv...") - psm_utils.io.write_file( - psm_list, - output_file_root + ".psms.tsv", - filetype="tsv", - show_progressbar=True, - ) + psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv") # Write report if config["write_report"]: - generate.generate_report(output_file_root, psm_list=psm_list, feature_names=feature_names) + generate.generate_report( + output_file_root, psm_list=psm_list, feature_names=feature_names, use_txt_log=True + ) def _write_feature_names(feature_names, output_file_root): @@ -220,13 +148,25 @@ def _write_feature_names(feature_names, output_file_root): f.write(f"{fgen}\t{feature}\n") -def _match_psm_ids(old_id, regex_pattern): - """Match PSM IDs to regex pattern or raise Exception if no match present.""" - match = re.search(regex_pattern, str(old_id)) - try: - return match[1] - except (TypeError, IndexError): - raise MS2RescoreError( - "`psm_id_pattern` could not be matched to all PSM spectrum IDs." - " Ensure that the regex contains a capturing group?" - ) +def _log_id_psms_before(psm_list): + """Log #PSMs identified before rescoring.""" + id_psms_before = ( + (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712 + ).sum() + logger.info("Found %i identified PSMs at 1%% FDR before rescoring.", id_psms_before) + return id_psms_before + + +def _log_id_psms_after(psm_list, id_psms_before): + """Log #PSMs identified after rescoring.""" + id_psms_after = ( + (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712 + ).sum() + diff = id_psms_after - id_psms_before + diff_perc = diff / id_psms_before if id_psms_before > 0 else None + + diff_numbers = f"{diff} ({diff_perc:.2%})" if diff_perc is not None else str(diff) + diff_word = "more" if diff > 0 else "less" + logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.") + + return id_psms_after diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py new file mode 100644 index 00000000..3eb3d19f --- /dev/null +++ b/ms2rescore/parse_psms.py @@ -0,0 +1,119 @@ +import logging +import re +from typing import Dict, Union + +import psm_utils.io +from psm_utils import PSMList + +from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError + +logger = logging.getLogger(__name__) + + +def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: str) -> PSMList: + """ + Parse PSMs and prepare for rescoring. + + Parameters + ---------- + config + Dictionary containing general ms2rescore configuration (everything under ``ms2rescore`` + top-level key). + psm_list + PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``. + output_file_root + Path to output file root (without file extension). + + """ + # Read PSMs, find decoys, calculate q-values + psm_list = _read_psms(config, psm_list) + _find_decoys(config, psm_list) + _calculate_qvalues(config, psm_list) + + # Store scoring values for comparison later + for psm in psm_list: + psm.provenance_data.update( + { + "before_rescoring_score": psm.score, + "before_rescoring_qvalue": psm.qvalue, + "before_rescoring_pep": psm.pep, + "before_rescoring_rank": psm.rank, + } + ) + + logger.debug("Parsing modifications...") + psm_list.rename_modifications(config["modification_mapping"]) + psm_list.add_fixed_modifications(config["fixed_modifications"]) + psm_list.apply_fixed_modifications() + + logger.debug("Applying `psm_id_pattern`...") + if config["psm_id_pattern"]: + pattern = re.compile(config["psm_id_pattern"]) + new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]] + psm_list["spectrum_id"] = new_ids + + # TODO: Temporary fix until implemented in psm_utils + # Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str) + psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]] + + return psm_list + + +def _read_psms(config, psm_list): + logger.info("Reading PSMs...") + if isinstance(psm_list, PSMList): + return psm_list + else: + try: + return psm_utils.io.read_file( + config["psm_file"], + filetype=config["psm_file_type"], + show_progressbar=True, + **config["psm_reader_kwargs"], + ) + except psm_utils.io.PSMUtilsIOException: + raise MS2RescoreConfigurationError( + "Error occurred while reading PSMs. Please check the `psm_file` and " + "`psm_file_type` settings. See " + "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/" + " for more information." + ) + + +def _find_decoys(config, psm_list): + """Find decoys in PSMs, log amount, and raise error if none found.""" + logger.debug("Finding decoys...") + if config["id_decoy_pattern"]: + psm_list.find_decoys(config["id_decoy_pattern"]) + + n_psms = len(psm_list) + percent_decoys = sum(psm_list["is_decoy"]) / n_psms * 100 + logger.info(f"Found {n_psms} PSMs, of which {percent_decoys:.2f}% are decoys.") + + if not any(psm_list["is_decoy"]): + raise MS2RescoreConfigurationError( + "No decoy PSMs found. Please check if decoys are present in the PSM file and that " + "the `id_decoy_pattern` option is correct. See " + "https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms" + " for more information." + ) + + +def _calculate_qvalues(config, psm_list): + """Calculate q-values for PSMs if not present.""" + # Calculate q-values if not present + if None in psm_list["qvalue"]: + logger.debug("Recalculating q-values...") + psm_list.calculate_qvalues(reverse=not config["lower_score_is_better"]) + + +def _match_psm_ids(old_id, regex_pattern): + """Match PSM IDs to regex pattern or raise Exception if no match present.""" + match = re.search(regex_pattern, str(old_id)) + try: + return match[1] + except (TypeError, IndexError): + raise MS2RescoreError( + "`psm_id_pattern` could not be matched to all PSM spectrum IDs." + " Ensure that the regex contains a capturing group?" + ) diff --git a/ms2rescore/report/generate.py b/ms2rescore/report/generate.py index 3472830a..8cf0d2ba 100644 --- a/ms2rescore/report/generate.py +++ b/ms2rescore/report/generate.py @@ -48,6 +48,7 @@ def generate_report( output_path_prefix: str, psm_list: Optional[psm_utils.PSMList] = None, feature_names: Optional[Dict[str, list]] = None, + use_txt_log: bool = False, ): """ Generate the report. @@ -63,9 +64,12 @@ def generate_report( feature_names Feature names to be used for the report. If not provided, the feature names will be read from the feature names file that matches the ``output_path_prefix``. + use_txt_log + If True, the log file will be read from ``output_path_prefix + ".log.txt"`` instead of + ``output_path_prefix + ".log.html"``. """ - files = _collect_files(output_path_prefix) + files = _collect_files(output_path_prefix, use_txt_log=use_txt_log) # Read PSMs if not psm_list: @@ -131,7 +135,7 @@ def generate_report( _render_and_write(output_path_prefix, **context) -def _collect_files(output_path_prefix): +def _collect_files(output_path_prefix, use_txt_log=False): """Collect all files generated by MSĀ²Rescore.""" logger.info("Collecting files...") files = { @@ -139,9 +143,9 @@ def _collect_files(output_path_prefix): "configuration": Path(output_path_prefix + ".full-config.json").resolve(), "feature names": Path(output_path_prefix + ".feature_names.tsv").resolve(), "feature weights": Path(output_path_prefix + ".mokapot.weights.tsv").resolve(), - "log": Path(output_path_prefix + ".log.html").resolve() - if Path(output_path_prefix + ".log.html").is_file() - else Path(output_path_prefix + ".log.txt").resolve(), + "log": Path(output_path_prefix + ".log.txt").resolve() + if use_txt_log + else Path(output_path_prefix + ".log.html").resolve(), } for file, path in files.items(): if Path(path).is_file():