Move parsing of PSMs to separate module
RalfG committed Sep 18, 2023
1 parent f437c60 commit 502b31c
Showing 3 changed files with 174 additions and 111 deletions.
152 changes: 46 additions & 106 deletions ms2rescore/core.py
@@ -1,30 +1,29 @@
import json
import logging
import re
from multiprocessing import cpu_count
from typing import Dict
from typing import Dict, Optional

import psm_utils.io
from psm_utils import PSMList

from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError
from ms2rescore.feature_generators import FEATURE_GENERATORS
from ms2rescore.parse_psms import parse_psms
from ms2rescore.report import generate
from ms2rescore.rescoring_engines import mokapot, percolator

logger = logging.getLogger(__name__)

id_file_parser = None


def rescore(configuration: Dict, psm_list: PSMList = None) -> None:
def rescore(configuration: Dict, psm_list: Optional[PSMList] = None) -> None:
"""
Run full MS²Rescore workflow with passed configuration.
Parameters
----------
configuration
Dictionary containing general ms2rescore configuration.
Dictionary containing ms2rescore configuration.
psm_list
PSMList object containing PSMs. If None, PSMs will be read from configuration ``psm_file``.
"""
config = configuration["ms2rescore"] # TODO: Remove top-level key?
@@ -36,91 +35,32 @@ def rescore(configuration: Dict, psm_list: PSMList = None) -> None:

logger.debug("Using %i of %i available CPUs.", int(config["processes"]), int(cpu_count()))

# Read PSMs
logger.info("Reading PSMs...")
if not psm_list:
try:
psm_list = psm_utils.io.read_file(
config["psm_file"],
filetype=config["psm_file_type"],
show_progressbar=True,
**config["psm_reader_kwargs"],
)
except psm_utils.io.PSMUtilsIOException:
raise MS2RescoreConfigurationError(
"Error occurred while reading PSMs. Please check the `psm_file` and "
"`psm_file_type` settings. See "
"https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
" for more information."
)

logger.debug("Finding decoys...")
if config["id_decoy_pattern"]:
psm_list.find_decoys(config["id_decoy_pattern"])
n_psms = len(psm_list)
percent_decoys = sum(psm_list["is_decoy"]) / n_psms * 100
logger.info(f"Found {n_psms} PSMs, of which {percent_decoys:.2f}% are decoys.")
if not any(psm_list["is_decoy"]):
raise MS2RescoreConfigurationError(
"No decoy PSMs found. Please check if decoys are present in the PSM file and that "
"the `id_decoy_pattern` option is correct. See "
"https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms"
" for more information."
)

# Calculate q-values if not present
if None in psm_list["qvalue"]:
logger.debug("Recalculating q-values...")
psm_list.calculate_qvalues(reverse=not config["lower_score_is_better"])

# Check #PSMs identified before rescoring
id_psms_before = (
(psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712
).sum()
logger.info("Found %i identified PSMs at 1%% FDR before rescoring.", id_psms_before)

# Store scoring values for comparison later
for psm in psm_list:
psm.provenance_data.update(
{
"before_rescoring_score": psm.score,
"before_rescoring_qvalue": psm.qvalue,
"before_rescoring_pep": psm.pep,
"before_rescoring_rank": psm.rank,
}
)

logger.debug("Parsing modifications...")
psm_list.rename_modifications(config["modification_mapping"])
psm_list.add_fixed_modifications(config["fixed_modifications"])
psm_list.apply_fixed_modifications()
# Parse PSMs
psm_list = parse_psms(config, psm_list, output_file_root)

logger.debug("Applying `psm_id_pattern`...")
if config["psm_id_pattern"]:
pattern = re.compile(config["psm_id_pattern"])
new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
psm_list["spectrum_id"] = new_ids
# Log #PSMs identified before rescoring
id_psms_before = _log_id_psms_before(psm_list)

# TODO: Temporary fix until implemented in psm_utils
# Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str)
psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]]

# Add rescoring features
# Define feature names; get existing feature names from PSM file
feature_names = dict()
psm_list_feature_names = {
feature_name
for psm_list_features in psm_list["rescoring_features"]
for feature_name in psm_list_features.keys()
}

feature_names["psm_file"] = psm_list_feature_names
logger.debug(
f"PSMs already contain the following rescoring features: {psm_list_feature_names}"
)
feature_names["psm_file"] = psm_list_feature_names

# Add rescoring features
for fgen_name, fgen_config in config["feature_generators"].items():
# TODO: Handle this somewhere else, more generally? Warning required?
# TODO: Handle this somewhere else, more generally?
if fgen_name == "maxquant" and not (psm_list["source"] == "msms").all():
logger.warning(
"MaxQuant feature generator requires PSMs from a MaxQuant msms.txt file. Skipping "
"this feature generator."
)
continue
conf = config.copy()
conf.update(fgen_config)
@@ -146,8 +86,8 @@ def rescore(configuration: Dict, psm_list: PSMList = None) -> None:
f"Removed {psms_with_features.count(False)} PSMs that were missing one or more "
f"rescoring feature(s), {missing_features}."
)

psm_list = psm_list[psms_with_features]

# Write feature names to file
_write_feature_names(feature_names, output_file_root)

@@ -186,29 +126,17 @@ def rescore(configuration: Dict, psm_list: PSMList = None) -> None:
else:
logger.info("No known rescoring engine specified. Skipping rescoring.")

# Compare results
id_psms_after = (
(psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712
).sum()
diff = id_psms_after - id_psms_before
if id_psms_before > 0:
diff_perc = diff / id_psms_before
logger.info(f"Identified {diff} ({diff_perc:.2%}) more PSMs at 1% FDR after rescoring.")
else:
logger.info(f"Identified {diff} more PSMs at 1% FDR after rescoring.")
_log_id_psms_after(psm_list, id_psms_before)

# Write output
logger.info(f"Writing output to {output_file_root}.psms.tsv...")
psm_utils.io.write_file(
psm_list,
output_file_root + ".psms.tsv",
filetype="tsv",
show_progressbar=True,
)
psm_utils.io.write_file(psm_list, output_file_root + ".psms.tsv", filetype="tsv")

# Write report
if config["write_report"]:
generate.generate_report(output_file_root, psm_list=psm_list, feature_names=feature_names)
generate.generate_report(
output_file_root, psm_list=psm_list, feature_names=feature_names, use_txt_log=True
)


def _write_feature_names(feature_names, output_file_root):
@@ -220,13 +148,25 @@ def _write_feature_names(feature_names, output_file_root):
f.write(f"{fgen}\t{feature}\n")


def _match_psm_ids(old_id, regex_pattern):
"""Match PSM IDs to regex pattern or raise Exception if no match present."""
match = re.search(regex_pattern, str(old_id))
try:
return match[1]
except (TypeError, IndexError):
raise MS2RescoreError(
"`psm_id_pattern` could not be matched to all PSM spectrum IDs."
" Ensure that the regex contains a capturing group?"
)
def _log_id_psms_before(psm_list):
"""Log #PSMs identified before rescoring."""
id_psms_before = (
(psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712
).sum()
logger.info("Found %i identified PSMs at 1%% FDR before rescoring.", id_psms_before)
return id_psms_before


def _log_id_psms_after(psm_list, id_psms_before):
"""Log #PSMs identified after rescoring."""
id_psms_after = (
(psm_list["qvalue"] <= 0.01) & (psm_list["is_decoy"] == False) # noqa: E712
).sum()
diff = id_psms_after - id_psms_before
diff_perc = diff / id_psms_before if id_psms_before > 0 else None

diff_numbers = f"{diff} ({diff_perc:.2%})" if diff_perc is not None else str(diff)
diff_word = "more" if diff > 0 else "less"
logger.info(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.")

return id_psms_after
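
To make the new logging helpers concrete, here is a small, self-contained sketch of the arithmetic that `_log_id_psms_after` performs. The PSM counts below are purely illustrative and not taken from any real run.

id_psms_before = 10_000  # hypothetical count of target PSMs at 1% FDR before rescoring
id_psms_after = 11_250   # hypothetical count of target PSMs at 1% FDR after rescoring

diff = id_psms_after - id_psms_before                               # 1250
diff_perc = diff / id_psms_before if id_psms_before > 0 else None   # 0.125

diff_numbers = f"{diff} ({diff_perc:.2%})" if diff_perc is not None else str(diff)
diff_word = "more" if diff > 0 else "less"
print(f"Identified {diff_numbers} {diff_word} PSMs at 1% FDR after rescoring.")
# Prints: Identified 1250 (12.50%) more PSMs at 1% FDR after rescoring.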
119 changes: 119 additions & 0 deletions ms2rescore/parse_psms.py
@@ -0,0 +1,119 @@
import logging
import re
from typing import Dict, Union

import psm_utils.io
from psm_utils import PSMList

from ms2rescore.exceptions import MS2RescoreConfigurationError, MS2RescoreError

logger = logging.getLogger(__name__)


def parse_psms(config: Dict, psm_list: Union[PSMList, None], output_file_root: str) -> PSMList:
"""
Parse PSMs and prepare for rescoring.
Parameters
----------
config
Dictionary containing general ms2rescore configuration (everything under ``ms2rescore``
top-level key).
psm_list
PSMList object containing PSMs. If None, PSMs will be read from ``psm_file``.
output_file_root
Path to output file root (without file extension).
"""
# Read PSMs, find decoys, calculate q-values
psm_list = _read_psms(config, psm_list)
_find_decoys(config, psm_list)
_calculate_qvalues(config, psm_list)

# Store scoring values for comparison later
for psm in psm_list:
psm.provenance_data.update(
{
"before_rescoring_score": psm.score,
"before_rescoring_qvalue": psm.qvalue,
"before_rescoring_pep": psm.pep,
"before_rescoring_rank": psm.rank,
}
)

logger.debug("Parsing modifications...")
psm_list.rename_modifications(config["modification_mapping"])
psm_list.add_fixed_modifications(config["fixed_modifications"])
psm_list.apply_fixed_modifications()

logger.debug("Applying `psm_id_pattern`...")
if config["psm_id_pattern"]:
pattern = re.compile(config["psm_id_pattern"])
new_ids = [_match_psm_ids(old_id, pattern) for old_id in psm_list["spectrum_id"]]
psm_list["spectrum_id"] = new_ids

# TODO: Temporary fix until implemented in psm_utils
# Ensure that spectrum IDs are strings (Pydantic 2.0 does not coerce int to str)
psm_list["spectrum_id"] = [str(spec_id) for spec_id in psm_list["spectrum_id"]]

return psm_list


def _read_psms(config, psm_list):
logger.info("Reading PSMs...")
if isinstance(psm_list, PSMList):
return psm_list
else:
try:
return psm_utils.io.read_file(
config["psm_file"],
filetype=config["psm_file_type"],
show_progressbar=True,
**config["psm_reader_kwargs"],
)
except psm_utils.io.PSMUtilsIOException:
raise MS2RescoreConfigurationError(
"Error occurred while reading PSMs. Please check the `psm_file` and "
"`psm_file_type` settings. See "
"https://ms2rescore.readthedocs.io/en/latest/userguide/input-files/"
" for more information."
)


def _find_decoys(config, psm_list):
"""Find decoys in PSMs, log amount, and raise error if none found."""
logger.debug("Finding decoys...")
if config["id_decoy_pattern"]:
psm_list.find_decoys(config["id_decoy_pattern"])

n_psms = len(psm_list)
percent_decoys = sum(psm_list["is_decoy"]) / n_psms * 100
logger.info(f"Found {n_psms} PSMs, of which {percent_decoys:.2f}% are decoys.")

if not any(psm_list["is_decoy"]):
raise MS2RescoreConfigurationError(
"No decoy PSMs found. Please check if decoys are present in the PSM file and that "
"the `id_decoy_pattern` option is correct. See "
"https://ms2rescore.readthedocs.io/en/latest/userguide/configuration/#selecting-decoy-psms"
" for more information."
)


def _calculate_qvalues(config, psm_list):
"""Calculate q-values for PSMs if not present."""
# Calculate q-values if not present
if None in psm_list["qvalue"]:
logger.debug("Recalculating q-values...")
psm_list.calculate_qvalues(reverse=not config["lower_score_is_better"])


def _match_psm_ids(old_id, regex_pattern):
"""Match PSM IDs to regex pattern or raise Exception if no match present."""
match = re.search(regex_pattern, str(old_id))
try:
return match[1]
except (TypeError, IndexError):
raise MS2RescoreError(
"`psm_id_pattern` could not be matched to all PSM spectrum IDs."
" Ensure that the regex contains a capturing group?"
)
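
Since `_match_psm_ids` requires the configured regex to contain a capturing group, the following standalone sketch shows how `psm_id_pattern` rewrites spectrum IDs. The example pattern and spectrum IDs are assumptions chosen for illustration, not values from the repository.

import re

# Assumed example pattern; group 1 becomes the new spectrum ID.
pattern = re.compile(r"scan=(\d+)")
old_ids = ["controllerType=0 controllerNumber=1 scan=1234", "scan=5678"]

# Mirrors _match_psm_ids: if a pattern has no capturing group or does not match,
# indexing the match object raises, which the helper converts to an MS2RescoreError.
new_ids = [re.search(pattern, old_id)[1] for old_id in old_ids]
print(new_ids)  # ['1234', '5678']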
14 changes: 9 additions & 5 deletions ms2rescore/report/generate.py
@@ -48,6 +48,7 @@ def generate_report(
output_path_prefix: str,
psm_list: Optional[psm_utils.PSMList] = None,
feature_names: Optional[Dict[str, list]] = None,
use_txt_log: bool = False,
):
"""
Generate the report.
@@ -63,9 +64,12 @@ def generate_report(
feature_names
Feature names to be used for the report. If not provided, the feature names will be
read from the feature names file that matches the ``output_path_prefix``.
use_txt_log
If True, the log file will be read from ``output_path_prefix + ".log.txt"`` instead of
``output_path_prefix + ".log.html"``.
"""
files = _collect_files(output_path_prefix)
files = _collect_files(output_path_prefix, use_txt_log=use_txt_log)

# Read PSMs
if not psm_list:
@@ -131,17 +135,17 @@ def generate_report(
_render_and_write(output_path_prefix, **context)


def _collect_files(output_path_prefix):
def _collect_files(output_path_prefix, use_txt_log=False):
"""Collect all files generated by MS²Rescore."""
logger.info("Collecting files...")
files = {
"PSMs": Path(output_path_prefix + ".psms.tsv").resolve(),
"configuration": Path(output_path_prefix + ".full-config.json").resolve(),
"feature names": Path(output_path_prefix + ".feature_names.tsv").resolve(),
"feature weights": Path(output_path_prefix + ".mokapot.weights.tsv").resolve(),
"log": Path(output_path_prefix + ".log.html").resolve()
if Path(output_path_prefix + ".log.html").is_file()
else Path(output_path_prefix + ".log.txt").resolve(),
"log": Path(output_path_prefix + ".log.txt").resolve()
if use_txt_log
else Path(output_path_prefix + ".log.html").resolve(),
}
for file, path in files.items():
if Path(path).is_file():
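
The effect of the new `use_txt_log` flag, reduced to a minimal standalone sketch: `resolve_log_path` is a hypothetical helper written only to mirror the conditional added to `_collect_files`, not a function that exists in the package.

from pathlib import Path

def resolve_log_path(output_path_prefix: str, use_txt_log: bool = False) -> Path:
    # Prefer the plain-text log when requested (as core.rescore now does via
    # generate_report(..., use_txt_log=True)); otherwise keep the HTML log.
    suffix = ".log.txt" if use_txt_log else ".log.html"
    return Path(output_path_prefix + suffix).resolve()

# e.g. resolve_log_path("results/run1", use_txt_log=True) -> <cwd>/results/run1.log.txt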
