From 502b31c354c5034af6d8f8757d55cea2341c566e Mon Sep 17 00:00:00 2001
From: RalfG <ralfg@hotmail.be>
Date: Mon, 18 Sep 2023 13:41:12 +0200
Subject: [PATCH] Move parsing of PSMs to separate module

---
 ms2rescore/core.py            | 152 ++++++++++------------------------
 ms2rescore/parse_psms.py      | 119 ++++++++++++++++++++++++++
 ms2rescore/report/generate.py |  14 ++--
 3 files changed, 174 insertions(+), 111 deletions(-)
 create mode 100644 ms2rescore/parse_psms.py

diff --git a/ms2rescore/core.py b/ms2rescore/core.py
index 38940d84..0f1bd194 100644
--- a/ms2rescore/core.py
+++ b/ms2rescore/core.py
@@ -1,30 +1,29 @@
 import json
 import logging
-import re
 from multiprocessing import cpu_count
-from typing import Dict
+from typing import Dict, Optional
 
 import psm_utils.io
 from psm_utils import PSMList
 
-from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError
 from ms2rescore.feature_generators import FEATURE_GENERATORS
+from ms2rescore.parse_psms import parse_psms
 from ms2rescore.report import generate
 from ms2rescore.rescoring_engines import mokapot, percolator
 
 logger = logging.getLogger(__name__)
 
-id_file_parser = None
 
-
-def rescore(configuration: Dict, psm_list: PSMList = None) -> None:
+def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
     """
     Run full MS²Rescore workflow with passed configuration.
 
     Parameters
     ----------
     configuration
-        Dictionary containing general ms2rescore configuration.
+        Dictionary containing ms2rescore configuration.
+    psm_list
+        PSMList object containing PSMs. If None, PSMs will be read from configuration ``psm_file``.
 
     """
     config = configuration["ms2rescore"]  # TODO: Remove top-level key?
@@ -36,91 +35,32 @@ def rescore(configuration: Dict, psm_list: PSMList = None) -> None:
 
     logger.debug("Using %i of %i available CPUs.", int(config["processes"]), int(cpu_count()))
 
-    # Read PSMs
-    logger.info("Reading PSMs...")
-    if not psm_list:
-        try:
-            psm_list = psm_utils.io.read_file(
-                config["psm_file"],
-                filetype=config["psm_file_type"],
-                show_progressbar=True,
-                **config["psm_reader_kwargs"],
-            )
-        except psm_utils.io.PSMUtilsIOException:
-            raise MS2RescoreConfigurationError(
-                "Error occurred while reading PSMs. Please check the `psm_file` and "
-                "`psm_file_type` settings. See "
-                "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
-                " for more information."
-            )
-
-    logger.debug("Finding decoys...")
-    if config["id_decoy_pattern"]:
-        psm_list.find_decoys(config["id_decoy_pattern"])
-    n_psms = len(psm_list)
-    percent_decoys = sum(psm_list["is_decoy"]) / n_psms * 100
-    logger.info(f"Found {n_psms} PSMs, of which {percent_decoys:.2f}% are decoys.")
-    if not any(psm_list["is_decoy"]):
-        raise MS2RescoreConfigurationError(
-            "No decoy PSMs found. Please check if decoys are present in the PSM file and that "
-            "the `id_decoy_pattern` option is correct. See "
-            "https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms"
-            " for more information."
-        )
-
-    # Calculate q-values if not present
-    if None in psm_list["qvalue"]:
-        logger.debug("Recalculating q-values...")
-        psm_list.calculate_qvalues(reverse=not config["lower_score_is_better"])
-
-    # Check #PSMs identified before rescoring
-    id_psms_before = (
-        (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False)  # noqa: E712
-    ).sum()
-    logger.info("Found %i identified PSMs at 1%% FDR before rescoring.", id_psms_before)
-
-    # Store scoring values for comparison later
-    for psm in psm_list:
-        psm.provenance_data.update(
-            {
-                "before_rescoring_score": psm.score,
-                "before_rescoring_qvalue": psm.qvalue,
-                "before_rescoring_pep": psm.pep,
-                "before_rescoring_rank": psm.rank,
-            }
-        )
-
-    logger.debug("Parsing modifications...")
-    psm_list.rename_modifications(config["modification_mapping"])
-    psm_list.add_fixed_modifications(config["fixed_modifications"])
-    psm_list.apply_fixed_modifications()
+    # Parse PSMs
+    psm_list = parse_psms(config, psm_list, output_file_root)
 
-    logger.debug("Applying `psm_id_pattern`...")
-    if config["psm_id_pattern"]:
-        pattern = re.compile(config["psm_id_pattern"])
-        new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
-        psm_list["spectrum_id"] = new_ids
+    # Log #PSMs identified before rescoring
+    id_psms_before = _log_id_psms_before(psm_list)
 
-    # TODO: Temporary fix until implemented in psm_utils
-    # Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str)
-    psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]]
-
-    # Add rescoring features
+    # Define feature names; get existing feature names from PSM file
     feature_names = dict()
     psm_list_feature_names = {
         feature_name
         for psm_list_features in psm_list["rescoring_features"]
         for feature_name in psm_list_features.keys()
     }
-
+    feature_names["psm_file"] = psm_list_feature_names
     logger.debug(
         f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
     )
-    feature_names["psm_file"] = psm_list_feature_names
 
+    # Add rescoring features
     for fgen_name, fgen_config in config["feature_generators"].items():
-        # TODO: Handle this somewhere else, more generally? Warning required?
+        # TODO: Handle this somewhere else, more generally?
         if fgen_name == "maxquant" and not (psm_list["source"] == "msms").all():
+            logger.warning(
+                "MaxQuant feature generator requires PSMs from a MaxQuant msms.txt file. Skipping "
+                "this feature generator."
+            )
             continue
         conf = config.copy()
         conf.update(fgen_config)
@@ -146,8 +86,8 @@ def rescore(configuration: Dict, psm_list: PSMList = None) -> None:
             f"Removed {psms_with_features.count(False)} PSMs that were missing one or more "
             f"rescoring feature(s), {missing_features}."
         )
-
     psm_list = psm_list[psms_with_features]
+
     # Write feature names to file
     _write_feature_names(feature_names, output_file_root)
 
@@ -186,29 +126,17 @@ def rescore(configuration: Dict, psm_list: PSMList = None) -> None:
     else:
         logger.info("No known rescoring engine specified. Skipping rescoring.")
 
-    # Compare results
-    id_psms_after = (
-        (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False)  # noqa: E712
-    ).sum()
-    diff = id_psms_after - id_psms_before
-    if id_psms_before > 0:
-        diff_perc = diff / id_psms_before
-        logger.info(f"Identified {diff} ({diff_perc:.2%}) more PSMs at 1% FDR after rescoring.")
-    else:
-        logger.info(f"Identified {diff} more PSMs at 1% FDR after rescoring.")
+    _log_id_psms_after(psm_list, id_psms_before)
 
     # Write output
     logger.info(f"Writing output to {output_file_root}.psms.tsv...")
-    psm_utils.io.write_file(
-        psm_list,
-        output_file_root + ".psms.tsv",
-        filetype="tsv",
-        show_progressbar=True,
-    )
+    psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv")
 
     # Write report
     if config["write_report"]:
-        generate.generate_report(output_file_root, psm_list=psm_list, feature_names=feature_names)
+        generate.generate_report(
+            output_file_root, psm_list=psm_list, feature_names=feature_names, use_txt_log=True
+        )
 
 
 def _write_feature_names(feature_names, output_file_root):
@@ -220,13 +148,25 @@ def _write_feature_names(feature_names, output_file_root):
                 f.write(f"{fgen}\t{feature}\n")
 
 
-def _match_psm_ids(old_id, regex_pattern):
-    """Match PSM IDs to regex pattern or raise Exception if no match present."""
-    match = re.search(regex_pattern, str(old_id))
-    try:
-        return match[1]
-    except (TypeError, IndexError):
-        raise MS2RescoreError(
-            "`psm_id_pattern` could not be matched to all PSM spectrum IDs."
-            " Ensure that the regex contains a capturing group?"
-        )
+def _log_id_psms_before(psm_list):
+    """Log #PSMs identified before rescoring."""
+    id_psms_before = (
+        (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False)  # noqa: E712
+    ).sum()
+    logger.info("Found %i identified PSMs at 1%% FDR before rescoring.", id_psms_before)
+    return id_psms_before
+
+
+def _log_id_psms_after(psm_list, id_psms_before):
+    """Log #PSMs identified after rescoring."""
+    id_psms_after = (
+        (psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False)  # noqa: E712
+    ).sum()
+    diff = id_psms_after - id_psms_before
+    diff_perc = diff / id_psms_before if id_psms_before > 0 else None
+
+    diff_numbers = f"{diff} ({diff_perc:.2%})" if diff_perc is not None else str(diff)
+    diff_word = "more" if diff > 0 else "less"
+    logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.")
+
+    return id_psms_after
diff --git a/ms2rescore/parse_psms.py b/ms2rescore/parse_psms.py
new file mode 100644
index 00000000..3eb3d19f
--- /dev/null
+++ b/ms2rescore/parse_psms.py
@@ -0,0 +1,119 @@
+import logging
+import re
+from typing import Dict, Union
+
+import psm_utils.io
+from psm_utils import PSMList
+
+from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError
+
+logger = logging.getLogger(__name__)
+
+
+def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: str) -> PSMList:
+    """
+    Parse PSMs and prepare for rescoring.
+
+    Parameters
+    ----------
+    config
+        Dictionary containing general ms2rescore configuration (everything under ``ms2rescore``
+        top-level key).
+    psm_list
+        PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``.
+    output_file_root
+        Path to output file root (without file extension).
+
+    """
+    # Read PSMs, find decoys, calculate q-values
+    psm_list = _read_psms(config, psm_list)
+    _find_decoys(config, psm_list)
+    _calculate_qvalues(config, psm_list)
+
+    # Store scoring values for comparison later
+    for psm in psm_list:
+        psm.provenance_data.update(
+            {
+                "before_rescoring_score": psm.score,
+                "before_rescoring_qvalue": psm.qvalue,
+                "before_rescoring_pep": psm.pep,
+                "before_rescoring_rank": psm.rank,
+            }
+        )
+
+    logger.debug("Parsing modifications...")
+    psm_list.rename_modifications(config["modification_mapping"])
+    psm_list.add_fixed_modifications(config["fixed_modifications"])
+    psm_list.apply_fixed_modifications()
+
+    logger.debug("Applying `psm_id_pattern`...")
+    if config["psm_id_pattern"]:
+        pattern = re.compile(config["psm_id_pattern"])
+        new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
+        psm_list["spectrum_id"] = new_ids
+
+    # TODO: Temporary fix until implemented in psm_utils
+    # Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str)
+    psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]]
+
+    return psm_list
+
+
+def _read_psms(config, psm_list):
+    logger.info("Reading PSMs...")
+    if isinstance(psm_list, PSMList):
+        return psm_list
+    else:
+        try:
+            return psm_utils.io.read_file(
+                config["psm_file"],
+                filetype=config["psm_file_type"],
+                show_progressbar=True,
+                **config["psm_reader_kwargs"],
+            )
+        except psm_utils.io.PSMUtilsIOException:
+            raise MS2RescoreConfigurationError(
+                "Error occurred while reading PSMs. Please check the `psm_file` and "
+                "`psm_file_type` settings. See "
+                "https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
+                " for more information."
+            )
+
+
+def _find_decoys(config, psm_list):
+    """Find decoys in PSMs, log amount, and raise error if none found."""
+    logger.debug("Finding decoys...")
+    if config["id_decoy_pattern"]:
+        psm_list.find_decoys(config["id_decoy_pattern"])
+
+    n_psms = len(psm_list)
+    percent_decoys = sum(psm_list["is_decoy"]) / n_psms * 100
+    logger.info(f"Found {n_psms} PSMs, of which {percent_decoys:.2f}% are decoys.")
+
+    if not any(psm_list["is_decoy"]):
+        raise MS2RescoreConfigurationError(
+            "No decoy PSMs found. Please check if decoys are present in the PSM file and that "
+            "the `id_decoy_pattern` option is correct. See "
+            "https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms"
+            " for more information."
+        )
+
+
+def _calculate_qvalues(config, psm_list):
+    """Calculate q-values for PSMs if not present."""
+    # Calculate q-values if not present
+    if None in psm_list["qvalue"]:
+        logger.debug("Recalculating q-values...")
+        psm_list.calculate_qvalues(reverse=not config["lower_score_is_better"])
+
+
+def _match_psm_ids(old_id, regex_pattern):
+    """Match PSM IDs to regex pattern or raise Exception if no match present."""
+    match = re.search(regex_pattern, str(old_id))
+    try:
+        return match[1]
+    except (TypeError, IndexError):
+        raise MS2RescoreError(
+            "`psm_id_pattern` could not be matched to all PSM spectrum IDs."
+            " Ensure that the regex contains a capturing group?"
+        )
diff --git a/ms2rescore/report/generate.py b/ms2rescore/report/generate.py
index 3472830a..8cf0d2ba 100644
--- a/ms2rescore/report/generate.py
+++ b/ms2rescore/report/generate.py
@@ -48,6 +48,7 @@ def generate_report(
     output_path_prefix: str,
     psm_list: Optional[psm_utils.PSMList] = None,
     feature_names: Optional[Dict[str, list]] = None,
+    use_txt_log: bool = False,
 ):
     """
     Generate the report.
@@ -63,9 +64,12 @@ def generate_report(
     feature_names
         Feature names to be used for the report. If not provided, the feature names will be
         read from the feature names file that matches the ``output_path_prefix``.
+    use_txt_log
+        If True, the log file will be read from ``output_path_prefix + ".log.txt"`` instead of
+        ``output_path_prefix + ".log.html"``.
 
     """
-    files = _collect_files(output_path_prefix)
+    files = _collect_files(output_path_prefix, use_txt_log=use_txt_log)
 
     # Read PSMs
     if not psm_list:
@@ -131,7 +135,7 @@ def generate_report(
     _render_and_write(output_path_prefix, **context)
 
 
-def _collect_files(output_path_prefix):
+def _collect_files(output_path_prefix, use_txt_log=False):
     """Collect all files generated by MS²Rescore."""
     logger.info("Collecting files...")
     files = {
@@ -139,9 +143,9 @@ def _collect_files(output_path_prefix):
         "configuration": Path(output_path_prefix + ".full-config.json").resolve(),
         "feature names": Path(output_path_prefix + ".feature_names.tsv").resolve(),
         "feature weights": Path(output_path_prefix + ".mokapot.weights.tsv").resolve(),
-        "log": Path(output_path_prefix + ".log.html").resolve()
-        if Path(output_path_prefix + ".log.html").is_file()
-        else Path(output_path_prefix + ".log.txt").resolve(),
+        "log": Path(output_path_prefix + ".log.txt").resolve()
+        if use_txt_log
+        else Path(output_path_prefix + ".log.html").resolve(),
     }
     for file, path in files.items():
         if Path(path).is_file():