From c6073e1a6ccf7114bbad19e70c128e186c485550 Mon Sep 17 00:00:00 2001
From: rodvrees <robbe.devreese@hotmail.com>
Date: Tue, 7 May 2024 17:33:08 +0200
Subject: [PATCH] Functional .msp output

---
 ms2pip/__main__.py        |   6 +-
 ms2pip/result.py          |   2 +-
 ms2pip/search_space.py    |   4 +-
 ms2pip/spectrum_output.py | 268 ++++++++++++++++++++++----------------
 4 files changed, 164 insertions(+), 116 deletions(-)

diff --git a/ms2pip/__main__.py b/ms2pip/__main__.py
index 929a489b..7268d25d 100644
--- a/ms2pip/__main__.py
+++ b/ms2pip/__main__.py
@@ -20,7 +20,7 @@
     UnresolvableModificationError,
 )
 from ms2pip.result import correlations_to_csv, results_to_csv
-from ms2pip.spectrum_output import write_single_spectrum_csv, write_single_spectrum_png
+from ms2pip.spectrum_output import write_single_spectrum_csv, write_single_spectrum_png, SpectrumOutput
 
 console = Console()
 logger = logging.getLogger(__name__)
@@ -125,6 +125,10 @@ def predict_library(*args, **kwargs):
     logger.info(f'Writing output to {output_name_csv}')
     results_to_csv(predictions, output_name_csv)
     logger.info(f'Finished writing output to {output_name_csv}')
+    #TODO: add support for other output formats
+    # Initial implementation of writing to MSP format
+    so = SpectrumOutput(predictions, normalization="basepeak_10000")
+    so.write_msp()
 
 
 
diff --git a/ms2pip/result.py b/ms2pip/result.py
index 6c5fe235..e9fe2504 100644
--- a/ms2pip/result.py
+++ b/ms2pip/result.py
@@ -138,7 +138,7 @@ def results_to_csv(results: List["ProcessingResult"], output_file: str) -> None:
                         writer.writerow(
                             {
                                 "psm_index": result.psm_index,
-                                "peptidoform": result.psm.peptidoform,
+                                "peptidoform": result.psm.peptidoform, #TODO Remove this, was for checking correctness
                                 "ion_type": ion_type,
                                 "ion_number": i + 1,
                                 "mz": "{:.6g}".format(result.theoretical_mz[ion_type][i]),
diff --git a/ms2pip/search_space.py b/ms2pip/search_space.py
index 0e282c71..a06fd512 100644
--- a/ms2pip/search_space.py
+++ b/ms2pip/search_space.py
@@ -69,7 +69,7 @@ def into_psm_list(self, min_precursor_mz = 0, max_precursor_mz = np.Inf) -> PSML
         for modifications, charge in product(self.modification_options, self.charge_options):
             offset = 0
             if not modifications:
-                psm = PSM(peptidoform=(self.sequence+'/{}'.format(charge)), spectrum_id=spectrum_id)
+                psm = PSM(peptidoform=(self.sequence+'/{}'.format(charge)), spectrum_id=spectrum_id, protein_list=self.proteins)
                 spectrum_id += 1
             else:
                 modded_sequence = list(self.sequence)
@@ -86,7 +86,7 @@ def into_psm_list(self, min_precursor_mz = 0, max_precursor_mz = np.Inf) -> PSML
                         modded_sequence.append(f"-[{mod}]")
 
                 modded_sequence = "".join(modded_sequence)
-                psm = PSM(peptidoform=(modded_sequence+'/{}'.format(charge)), spectrum_id=spectrum_id)
+                psm = PSM(peptidoform=(modded_sequence+'/{}'.format(charge)), spectrum_id=spectrum_id, protein_list=self.proteins)
                 spectrum_id += 1
             if psm.peptidoform.theoretical_mz >= min_precursor_mz and psm.peptidoform.theoretical_mz <= max_precursor_mz:
                 psms.append(psm)
diff --git a/ms2pip/spectrum_output.py b/ms2pip/spectrum_output.py
index a8db9168..2c7c9b81 100644
--- a/ms2pip/spectrum_output.py
+++ b/ms2pip/spectrum_output.py
@@ -62,6 +62,7 @@ def __init__(
         write_mode="wt+",
         return_stringbuffer=False,
         is_log_space=True,
+        normalization="basepeak_10000",
     ):
         """
         Write MS2PIP predictions to various output formats.
@@ -82,6 +83,9 @@ def __init__(
         is_log_space: bool, optional
             Set to true if predicted intensities in `all_preds` are in log-space. In that
             case, intensities will first be transformed to "normal"-space.
+        normalization: str, optional
+            Normalization method to use. Options are "basepeak_10000", "basepeak_1", and
+            "tic" (default: "basepeak_10000")
 
         Example
         -------
@@ -98,11 +102,14 @@ def __init__(
         self.write_mode = write_mode
         self.return_stringbuffer = return_stringbuffer
         self.is_log_space = is_log_space
+        self.normalization = normalization
+        self.preds_dict = None
+        # self.peprec_dict = None #TODO: Check if needed
 
         self.diff_modification_mapping = {}
+        self.has_rt = hasattr(self.results[0], "predicted_rt") # Assuming all results have the same attributes
+        self.has_protein_list = hasattr(self.results[0].psm, "protein_list") # Assuming all results have the same attributes
 
-        self.has_rt = "rt" in self.peprec.columns
-        self.has_protein_list = "protein_list" in self.peprec.columns
 
         if self.write_mode not in ["wt+", "wt", "at", "w", "a"]:
             raise InvalidWriteModeError(self.write_mode)
@@ -110,83 +117,112 @@ def __init__(
         if "a" in self.write_mode and self.return_stringbuffer:
             raise InvalidWriteModeError(self.write_mode)
 
-    # def _generate_peprec_dict(self, rt_to_seconds=True):
-    #     """
-    #     Create easy to access dict from all_preds and peprec dataframes
-    #     """
-    #     peprec_tmp = self.peprec.copy()
-
-    #     if self.has_rt and rt_to_seconds:
-    #         peprec_tmp["rt"] = peprec_tmp["rt"] * 60
-
-    #     peprec_tmp.index = peprec_tmp["spec_id"]
-    #     peprec_tmp.drop("spec_id", axis=1, inplace=True)
-
-    #     self.peprec_dict = peprec_tmp.to_dict(orient="index")
-
-    # def _generate_preds_dict(self):
-    #     """
-    #     Create easy to access dict from peprec dataframes
-    #     """
-    #     self.preds_dict = {}
-    #     preds_list = self.all_preds[
-    #         ["spec_id", "charge", "ion", "ionnumber", "mz", "prediction"]
-    #     ].values.tolist()
-
-    #     for row in preds_list:
-    #         spec_id = row[0]
-    #         if spec_id in self.preds_dict.keys():
-    #             if row[2] in self.preds_dict[spec_id]["peaks"]:
-    #                 self.preds_dict[spec_id]["peaks"][row[2]].append(tuple(row[3:]))
-    #             else:
-    #                 self.preds_dict[spec_id]["peaks"][row[2]] = [tuple(row[3:])]
-    #         else:
-    #             self.preds_dict[spec_id] = {
-    #                 "charge": row[1],
-    #                 "peaks": {row[2]: [tuple(row[3:])]},
-    #             }
-
-    # def _normalize_spectra(self, method="basepeak_10000"):
-    #     """
-    #     Normalize spectra
-    #     """
-    #     if self.is_log_space:
-    #         self.all_preds["prediction"] = ((2 ** self.all_preds["prediction"]) - 0.001).clip(
-    #             lower=0
-    #         )
-    #         self.is_log_space = False
-
-    #     if method == "basepeak_10000":
-    #         if self.normalization == "basepeak_10000":
-    #             pass
-    #         elif self.normalization == "basepeak_1":
-    #             self.all_preds["prediction"] *= 10000
-    #         else:
-    #             self.all_preds["prediction"] = self.all_preds.groupby(
-    #                 ["spec_id"], group_keys=False
-    #             )["prediction"].apply(lambda x: (x / x.max()) * 10000)
-    #         self.normalization = "basepeak_10000"
-
-    #     elif method == "basepeak_1":
-    #         if self.normalization == "basepeak_1":
-    #             pass
-    #         elif self.normalization == "basepeak_10000":
-    #             self.all_preds["prediction"] /= 10000
-    #         else:
-    #             self.all_preds["prediction"] = self.all_preds.groupby(
-    #                 ["spec_id"], group_keys=False
-    #             )["prediction"].apply(lambda x: (x / x.max()))
-    #         self.normalization = "basepeak_1"
-
-    #     elif method == "tic":
-    #         if self.normalization != "tic":
-    #             self.all_preds["prediction"] = self.all_preds.groupby(
-    #                 ["spec_id"], group_keys=False
-    #             )["prediction"].apply(lambda x: x / x.sum())
-    #         self.normalization = "tic"
-
-    #     else:
-    #         raise NotImplementedError
+    def _generate_preds_dict(self):
+        """
+        Create easy to access dict from ProcessingResult objects
+        """
+        self.preds_dict = {}
+
+        for result in self.results:
+            spec_id = result.psm_index
+            if spec_id in self.preds_dict.keys():
+                for ion_type in result.theoretical_mz.keys():
+                    if ion_type in self.preds_dict[spec_id]["peaks"]:
+                        for i in range(len(result.theoretical_mz[ion_type])):
+                            self.preds_dict[spec_id]["peaks"][ion_type].append(
+                                (
+                                    result.theoretical_mz[ion_type][i],
+                                    result.predicted_intensity[ion_type][i],
+                                )
+                            )
+                    else:
+                        self.preds_dict[spec_id]["peaks"][ion_type] = [
+                            (
+                                result.theoretical_mz[ion_type][i],
+                                result.predicted_intensity[ion_type][i],
+                            )
+                        ]
+            else:
+                self.preds_dict[spec_id] = {
+                    "peptidoform": result.psm.peptidoform,
+                    "charge": result.psm.peptidoform.precursor_charge,
+                    "peaks": {
+                        ion_type: [
+                            (
+                                result.theoretical_mz[ion_type][i],
+                                result.predicted_intensity[ion_type][i],
+                            )
+                            for i in range(len(result.theoretical_mz[ion_type]))
+                        ]
+                        for ion_type in result.theoretical_mz.keys()
+                    },
+                    "proteins": result.psm.protein_list
+                }
+
+    #TODO: Implement normalization
+    def _normalize_spectra(self, method="basepeak_10000"):
+        """
+        Normalize spectra
+        """
+        if self.is_log_space:
+            for result in self.results:
+                result.predicted_intensity = {
+                    ion_type: [
+                    max(((2 ** intensity) - 0.001), 0) for intensity in intensities
+                    ]
+                    for ion_type, intensities in result.predicted_intensity.items()
+                }
+            self.is_log_space = False
+
+        if method == "basepeak_10000":
+            if self.normalization == "basepeak_10000":
+                pass
+            elif self.normalization == "basepeak_1":
+                for result in self.results:
+                    result.predicted_intensity = {
+                        ion_type: [
+                        intensity * 10000 for intensity in intensities
+                        ]
+                        for ion_type, intensities in result.predicted_intensity.items()
+                    }
+            else:
+                for result in self.results:
+                    for ion_type, intensities in result.predicted_intensity.items():
+                        result.predicted_intensity[ion_type] = [
+                            (intensity / max(intensities)) * 10000 for intensity in intensities
+                        ]
+            self.normalization = "basepeak_10000" #needed?
+
+        elif method == "basepeak_1":
+            if self.normalization == "basepeak_1":
+                pass
+            elif self.normalization == "basepeak_10000":
+                for result in self.results:
+                    result.predicted_intensity = {
+                        ion_type: [
+                        intensity / 10000 for intensity in intensities
+                        ]
+                        for ion_type, intensities in result.predicted_intensity.items()
+                    }
+            else:
+                for result in self.results:
+                    for ion_type, intensities in result.predicted_intensity.items():
+                        result.predicted_intensity[ion_type] = [
+                            intensity / max(intensities) for intensity in intensities
+                        ]
+            self.normalization = "base_peak_1" #needed?
+
+        elif method == "tic":
+            if self.normalization != "tic":
+                for result in self.results:
+                    for ion_type, intensities in result.predicted_intensity.items():
+                        result.predicted_intensity[ion_type] = [
+                            intensity / sum(intensities) for intensity in intensities
+                        ]
+            self.normalization = "tic" #needed?
+
+        else:
+            raise NotImplementedError
 
     def _get_msp_peak_annotation(
         self,
@@ -201,44 +237,51 @@ def _get_msp_peak_annotation(
         """
         all_peaks = []
         for ion_type, peaks in peak_dict.items():
-            for peak in peaks:
-                if not include_zero and peak[2] == 0:
+
+            for peak_number, peak in enumerate(peaks):
+                if not include_zero and peak[1] == 0:
                     continue
                 if include_annotations:
                     all_peaks.append(
                         (
-                            peak[1],
-                            f'{peak[1]:.6f}{sep}{intensity_type(peak[2])}{sep}"{ion_type.lower()}{peak[0]}/0.0"',
+                            peak[0],
+                            f'{peak[0]:.6f}{sep}{intensity_type(peak[1])}{sep}"{ion_type.lower()}{peak_number+1}/0.0"',
                         )
                     )
                 else:
-                    all_peaks.append((peak[1], f"{peak[1]:.6f}{sep}{peak[2]}"))
+                    all_peaks.append((peak[0], f"{peak[0]:.6f}{sep}{peak[1]}"))
 
         all_peaks = sorted(all_peaks, key=itemgetter(0))
         peak_string = "\n".join([peak[1] for peak in all_peaks])
 
         return peak_string
 
-    def _get_msp_modifications(self, sequence, modifications):
+    def _get_msp_modifications(self, parsed_sequence, pep_properties):
         """
-        Format modifications in MSP-style, e.g. "1/0,E,Glu->pyro-Glu"
+        Format modifications in MSP-style, e.g. "1/0,E,Glu->pyro-Glu" where 1 is the number of modifications,
+        0 the position, E the amino acid, and Glu->pyro-Glu the modification.
         """
-
-        if isinstance(modifications, str):
-            if not modifications or modifications == "-":
-                msp_modifications = "0"
+        mods = []
+        counter = 0
+        for index, (aa, mod) in enumerate(parsed_sequence):
+            if not mod:
+                continue
             else:
-                mods = modifications.split("|")
-                mods = [(int(mods[i]), mods[i + 1]) for i in range(0, len(mods), 2)]
-                mods = [(x, y) if x == 0 else (x - 1, y) for (x, y) in mods]
-                mods = sorted(mods)
-                mods = [(str(x), sequence[x], y) for (x, y) in mods]
-                msp_modifications = "/".join([",".join(list(x)) for x in mods])
-                msp_modifications = f"{len(mods)}/{msp_modifications}"
+                counter += 1
+                mods.append("{},{},{}".format(index+1, aa, mod[0].name)) #Assuming only one mod per amino acid, which I guess is okay?
+        n_term_mod = pep_properties['n_term']
+        c_term_mod = pep_properties['c_term']
+        if n_term_mod:
+            counter += 1
+            mods.append("0,{},{}".format(parsed_sequence[0][0], n_term_mod))
+        if c_term_mod:
+            counter += 1
+            mods.append("-1,{},{}".format(parsed_sequence[-1][0], c_term_mod))
+
+        if counter == 0:
+            return "0"
         else:
-            msp_modifications = "0"
-
-        return msp_modifications
+            return f"{counter}/{'/'.join(sorted(mods))}"
 
     def _parse_protein_string(self, protein_list):
         """
@@ -317,12 +360,15 @@ def write_msp(self, file_object):
         Construct MSP string and write to file_object.
         """
 
-        for spec_id in sorted(self.peprec_dict.keys()):
-            seq = self.peprec_dict[spec_id]["peptide"]
-            mods = self.peprec_dict[spec_id]["modifications"]
-            charge = self.peprec_dict[spec_id]["charge"]
-            prec_mass, prec_mz = self.mods.calc_precursor_mz(seq, mods, charge)
-            msp_modifications = self._get_msp_modifications(seq, mods)
+        for spec_id in sorted(self.preds_dict.keys()):
+            seq = self.preds_dict[spec_id]["peptidoform"].sequence
+            pep_parsed_seq = self.preds_dict[spec_id]["peptidoform"].parsed_sequence
+            pep_properties = self.preds_dict[spec_id]["peptidoform"].properties
+
+            charge = self.preds_dict[spec_id]["charge"]
+            prec_mz = self.preds_dict[spec_id]["peptidoform"].theoretical_mz
+            prec_mass = self.preds_dict[spec_id]["peptidoform"].theoretical_mass
+            msp_modifications = self._get_msp_modifications(pep_parsed_seq, pep_properties)
             num_peaks = sum(
                 [len(peaklist) for _, peaklist in self.preds_dict[spec_id]["peaks"].items()]
             )
@@ -330,12 +376,12 @@ def write_msp(self, file_object):
             comment_line = f" Mods={msp_modifications} Parent={prec_mz}"
 
             if self.has_protein_list:
-                protein_list = self.peprec_dict[spec_id]["protein_list"]
+                protein_list = self.preds_dict[spec_id]["proteins"]
                 protein_string = self._parse_protein_string(protein_list)
                 comment_line += f' Protein="{protein_string}"'
 
             if self.has_rt:
-                rt = self.peprec_dict[spec_id]["rt"]
+                rt = self.preds_dict[spec_id]['predicted_rt']
                 comment_line += f" RetentionTime={rt}"
 
             comment_line += f' MS2PIP_ID="{spec_id}"'
@@ -555,10 +601,8 @@ def _write_general(
     ):
         """
         General write function to call core write functions.
-
         Note: Does not work for write_bibliospec and write_dlib functions.
         """
-
         # Normalize if necessary and make dicts
         if not self.normalization == normalization_method:
             self._normalize_spectra(method=normalization_method)
@@ -566,8 +610,8 @@ def _write_general(
                 self._generate_preds_dict()
         elif requires_dicts and not self.preds_dict:
             self._generate_preds_dict()
-        if requires_dicts and not self.peprec_dict:
-            self._generate_peprec_dict()
+        # if requires_dicts and not self.peprec_dict:
+        #     self._generate_peprec_dict()
 
         if (
             requires_diff_modifications