From ed0eaaad5318dd90613e1364c6eb7d6803025d2a Mon Sep 17 00:00:00 2001 From: tmcgowan Date: Mon, 28 Sep 2020 14:59:15 -0500 Subject: [PATCH] initial --- .gitignore | 6 + README.md | 32 ++ psmfragmentation/__init__.py | 0 psmfragmentation/psmfragmentation.py | 517 +++++++++++++++++++++++++++ requirements.txt | 19 + setup.py | 28 ++ tests/data/peptide_sequences.txt | 9 + 7 files changed, 611 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 psmfragmentation/__init__.py create mode 100644 psmfragmentation/psmfragmentation.py create mode 100644 requirements.txt create mode 100644 setup.py create mode 100644 tests/data/peptide_sequences.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a260780 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.vscode/* +venv/* +LargeDataSet.mz.sqlite +*.png +*.svg +*.html diff --git a/README.md b/README.md new file mode 100644 index 0000000..667e539 --- /dev/null +++ b/README.md @@ -0,0 +1,32 @@ +PSM Validation Tool +=================== + +The PSM validation tool verifies PSM scans against the ion fragmentation of the associated peptide squence. + +Inputs +------ + +- MZSQLite database generated in a Galaxy workflow +- Tabular file of peptide sequences + +By default the tool will fragment peptde into + +- b and y ions +- b-H2O, b-NH3, y-H2O, y-NH3 +- Internals + +You can disable internals and neutral loss ions. + +Output +------ + +A self-contained HTML report that includes: + +- All PSMs for peptides of interest with PSM identifiers +- PSM PeptideShaker scores are presented +- MSMS interactive graphs with + - PSM scan mz and intensities + - All matching ion fragments +- Ion fragment table + - Color coded for matching fragments + - Internal values and color coded for internal matches \ No newline at end of file diff --git a/psmfragmentation/__init__.py b/psmfragmentation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/psmfragmentation/psmfragmentation.py b/psmfragmentation/psmfragmentation.py new file mode 100644 index 0000000..90e2ec3 --- /dev/null +++ b/psmfragmentation/psmfragmentation.py @@ -0,0 +1,517 @@ +import datetime +from datetime import timezone +import logging +import os +from os import POSIX_FADV_NOREUSE +import re +import sqlite3 +import sys + +import plotly.io +import plotly.graph_objects as go + +from pyteomics import mass + +logger = logging.getLogger(__name__) +logger.setLevel(level=os.environ.get("LOGLEVEL", "INFO")) +handler = logging.StreamHandler() +formatter = logging.Formatter("%(asctime)s %(name)-12s %(levelname)-8s %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +this = sys.modules[__name__] + +#for plotly +this.plotly_touched = False + +#for html report - includes bulma css in header. +# region +this.html_slug = """ + + + + + + PSM/Ion Fragment Report + + + +
+
+
+

+ PSM - Ion Fragment Report +

+

+ ##RUNDATE## +

+
+
+
+ +
+
+ ##PEPTIDE## +
+
+ + + """ + # endregion + +#Pyteomics config for modifications +this.unimod_db = mass.Unimod() +this.custom_aa_comp = dict(mass.std_aa_comp) + +def _to_unimod(sequence): + """ + Convert encoded peptide to Unimod style + + Parameters + ---------- + - sequence: str peptide sequence + + Returns + ------ + + - dictionary + key = modified aa offset + value = unimod str + 'unimod': {0:'mO'}, + 'text': {'mO': 'Oxidation'} + """ + r_val = { + 'unimod': {}, + 'text': {} + } + #populate the custom db + mods = re.findall("{([A-Z]:\w*)}", sequence) # n is list of ['N:Deamidated', 'M:Oxidation'] + tmp = None + for mod in mods: + tmp = mod.split(':') + this.custom_aa_comp[tmp[0].lower()+ tmp[1][:1].upper()] = this.unimod_db.by_title(tmp[1])['composition'] + r_val['text'][tmp[0].lower()+ tmp[1][:1].upper()] = tmp[1] + pattern = re.compile('{([A-Z]):\w*}') + match = re.search(pattern, sequence) + t1 = sequence[0:match.span()[0]] + match.groups()[0] + sequence[match.span()[1]:] + r_val['unimod'][match.span()[0]] = tmp[0].lower()+ tmp[1][:1].upper() + + return r_val + + +def _fragment_ions(peptide, itype='b', maxcharge=1, encodedsequence=None): + """ + Fragment peptide sequence + + Parameters + ---------- + - peptide: str sequence + - itype: str ion type defaults to 'b' Can be one of ('a', 'b', 'c', 'x', 'y', 'z', 'M') + - maxcharge: int ion charge + - encodedsequence: str peptide sequence with modifications ELVIS{M:Oxidation}LIVES + NB: to specify a neutral loss, add the loss to the itype 'b' goes to 'b-H2O' or 'b-NH3' + + NB: Ion type set to 'M' will produce internal fragmentation + + Returns + ------- + List of ion mz values + """ + r_val = [] + pep_list = list(peptide) + + if encodedsequence: + e_sequence = _to_unimod(encodedsequence)['unimod'] + for key in e_sequence.keys(): + pep_list[key] = e_sequence[key] + + if itype == 'M': + # Internals + for aa in pep_list: + r_val.append(mass.calculate_mass(aa, ion_type="M", charge=maxcharge, aa_comp=this.custom_aa_comp)) + return r_val + + for i in range(1, len(peptide) + 1): + if re.findall('[abc]', itype): + r_val.append(mass.calculate_mass("".join(pep_list[:i]), ion_type=itype, charge=maxcharge, aa_comp=this.custom_aa_comp)) + else: + r_val.append(mass.calculate_mass("".join(pep_list[(i-1):]), ion_type=itype, charge=maxcharge, aa_comp=this.custom_aa_comp)) + return r_val + + +def _generate_ion_plot(peptide, psm_values, fragment_mz, matched_mz): + """ + Use plotly to create an interactive fragmentation plot + + Parameters + ---------- + - peptide: str + - psm values: list of PSM mz and intensities + - fragment mz: list of ion fragment mz + - matched mz: list of mz values that match between PSM and Ion + + Returns + ------- + HTML for interactive plotly plot + + """ + colors = {'y': "rgba(255,0,0,#OP#)", 'y-H2O': 'rgb(204, 0, 0,#OP#)', 'y-NH3': 'rgb(210, 85, 85,#OP#)', + 'b': 'rgba(0,0,255,#OP#)', 'b-H2O': 'rgb(102, 127, 153,#OP#)', 'b-NH3': 'rgb(69, 86, 104,#OP#)', 'M': 'rgb(0, 255, 0, #OP#)'} + + layout = go.Layout(xaxis=({'title':'MZ Values'}), title=peptide, margin=dict( + l=10, + r=10, + b=10, + t=50, + pad=4 + ), height=500, width=800) + + fig = go.Figure(layout=layout) + + max_psm_intensity = max(psm_values['intensity'])/ 2.0 + + for ion_type in fragment_mz.keys(): + data_y = [max_psm_intensity, ] * len(fragment_mz[ion_type]) + data_x = fragment_mz[ion_type] + #data_x.sort() + frag_colors = [] + bar_width = [] + hover_text = [] + + truncated_data = [] + for idx in range(len(data_x)): + if data_x[idx] in matched_mz[ion_type]: + frag_colors.append(colors[ion_type].replace("#OP#", "1")) + bar_width.append(1) + hover_text.append('Fragment Match') + truncated_data.append(data_x[idx]) + else: + #Do not draw fragments with no match + pass + + ion_name = ion_type + ' Ion' + if ion_type == 'M': + ion_name = "Internals" + i_trace = go.Bar(x=truncated_data, y=data_y, marker_color=frag_colors, name=ion_name, hovertext=hover_text, width=bar_width) + fig.add_trace(i_trace) + + psm_color = ['black'] * len(psm_values['mzvalues']) + psm_trace = go.Bar(x=psm_values['mzvalues'], y=psm_values['intensity'], + marker_color=psm_color, width=1, opacity=1.0, name="PSM Value") + fig.add_trace(psm_trace) + + if this.plotly_touched: + return plotly.io.to_html(fig, full_html=False, include_plotlyjs=False) + else: + this.plotly_touched = True + return plotly.io.to_html(fig, full_html=False) + + +def _match_mzs(scan, fragment, delta=0.5): + """ + Matches MZ values in fragmentation and PSM scans + + Parameters + ---------- + - scan: list of PSM scans + - fragment: list ion fragments + - delta: float acceptable epsilon + + Returns + ------- + - matched mz: list of matching mz values + """ + matched_mz = [] + for s in scan: + for f in fragment: + diff = abs(s - f) + if (diff <= delta): + matched_mz.append(f) + return matched_mz + + +def _q_quote(str): + return '"' + str + '"' + + +def _peptides(db, seq_list): + """ + Query MZSQLite db for peptides + + Parameters + ---------- + - db: path to database + - sql list: list of peptide sequences + + Returns + ------- + - d dict of key: pep sequence value: tuple (peptide id, pep sequence, seqeunce with mods) + """ + logger.info("Querying for peptide entries based on target sequences") + conn = sqlite3.connect(db) + cursor = conn.cursor() + in_clause = "(" + ",".join(list(map(_q_quote, seq_list))) + ")" + sql_q = f"SELECT * FROM peptides WHERE sequence IN {in_clause}" + peps = cursor.execute(sql_q).fetchall() + d = dict() + for pep in peps: + d.setdefault(pep[1], pep) + return d + + +def _psm(db, pep_seq): + """ + Query MZSQLite db for PSMs + + Parameters + ---------- + - db: path to database + - pep seq: str peptide sequence + + Returns + ------- + - list of psms associated with the target sequence""" + logger.info(f"Query for PSM objects based on sequence {pep_seq}") + conn = sqlite3.connect(db) + cursor = conn.cursor() + sql_q = f"SELECT id, passThreshold, spectrumID, spectrumTitle, \"PeptideShaker PSM score\", \"PeptideShaker PSM confidence\", passThreshold FROM psm_entries WHERE sequence = {_q_quote(pep_seq)}" + psms = cursor.execute(sql_q).fetchall() + r_val = [] + for psm in psms: + o = dict() + o.setdefault("ID", psm[0]) + o.setdefault("passThreshold", psm[1]) + o.setdefault("spectrumID", psm[2]) + o.setdefault("spectrumTitle", psm[3]) + o.setdefault("PSPSMScore", psm[4]) + o.setdefault("PSPSMConfidence", psm[5]) + o.setdefault("PassThreshold", psm[6]) + r_val.append(o) + return r_val + + +def _scans(db, spectrumID): + """ + Query MZSQLite db for scans + + Parameters + ---------- + - db: path to database + - spectrunID: str scan identifier + + Returns + ------- + - dict of scan""" + logger.info(f"Query for scans based on {spectrumID}") + conn = sqlite3.connect(db) + cursor = conn.cursor() + sql_q = f"SELECT * from scans WHERE spectrumID = {_q_quote(spectrumID)}" + scan = cursor.execute(sql_q).fetchone() #SB only one + s = dict() + s.setdefault("spectrumID", scan[0]) + s.setdefault("spectrumTitle", scan[1]) + s.setdefault("mzValues", scan[4]) + s.setdefault("intensities", scan[5]) + return s + + +def _to_float(s): + tmp = s.lstrip('[').rstrip(']').split(',') + num_vals = list(map(float, tmp)) + return num_vals + + +def write_report(s): + with open('index.html', 'w') as f: + f.write(s) + + +def _create_table(sequence, matchedpeaks, ionfragments, maxcharge, encodedsequence=None): + """ + Create ion table for HTML page + + Parameters + ---------- + - sequence: str peptide sequence + - matchedpeaks: list of matching peaks + - ionfragments: list + - maxcharge: ion charge + - encodedsequence: str if peptide is modified + + Returns + ------- + HTML string + + """ + b_re = re.compile('^b.*') + y_re = re.compile('^y.*') + + b_keys = list(filter(lambda key: b_re.match(key), ionfragments.keys())) + y_keys = list(filter(lambda key: y_re.match(key), ionfragments.keys())) + + r_val = "" + + for k in b_keys: + r_val += f"" + r_val += "" + + for k in y_keys: + r_val += f"" + + r_val += "" + + pep_seq = list(sequence) + mod_text = None + if encodedsequence: + tmp_m = _to_unimod(encodedsequence) + mods = tmp_m['unimod'] + mod_text = tmp_m['text'] + for key in mods.keys(): + pep_seq[key] = mods[key] + + for i in range(0, len(sequence)): + table_line = "" + for bk in b_keys: + if ionfragments[bk][i] in matchedpeaks[bk]: + table_line += f"" + else: + table_line += f"" + + m_wt = None + b_color = "style='background-color: hsl(0, 0%, 96%)';" + if 'M' in ionfragments.keys(): + m_wt = ionfragments['M'][i] + if ionfragments['M'][i] in matchedpeaks['M']: + b_color = "style='background-color: rgb(0, 255, 0);'" + + if mod_text and pep_seq[i] in mod_text: + tip_text = f"MOD: {mod_text[pep_seq[i]]}" + if m_wt: + tip_text += f" MZ: {m_wt:.3f}" + table_line += f"" + else: + if m_wt: + table_line += f"" + else: + table_line += f"" + + for yk in y_keys: + if ionfragments[yk][i] in matchedpeaks[yk]: + table_line += f"" + else: + table_line += f"" + table_line += "" + r_val += table_line + r_val += f"
{k}{k}
{ionfragments[bk][i]:.4f}{ionfragments[bk][i]:.4f}{pep_seq[i]}{pep_seq[i]}{pep_seq[i]}{ionfragments[yk][i]:.4f}{ionfragments[yk][i]:.4f}

Charge: {maxcharge}

" + + return r_val + + +def _write_psm_details(psm): + r_val= "

PSM Details

" + r_val += f"
Spectrum Index
{psm['spectrumID']}
Spectrum Title
{psm['spectrumTitle']}
" + r_val += f"
PeptideShaker PSM Score
{psm['PSPSMScore']}
PeptideShaker PSM Confidence
{psm['PSPSMConfidence']}
Pass Threshold
" + if psm['PassThreshold'].upper() == 'TRUE': + r_val += f"
{psm['PassThreshold'].upper()}
" + else: + r_val += f"
{psm['PassThreshold'].upper()}
" + r_val += "
##PSM##" + + return r_val + + +def score_psms(db_path, sequence_file, ion_types=('b', 'b-H2O', 'b-NH3','y', 'y-H2O', 'y-NH3', 'M'), epsilon=0.5, maxcharge=1): + # region + """ + Scores PSMs found on mzsqlite db by comparing observed mz peaks against + ion fragmentation of target peptide sequences. + + Parameters + ---------- + db_path: string + Full path to the mzSQLite database + sequence_file: string + Full path to tabular file of target peptide sequences. + One sequence to a line. + ion_types: tuple of fragment types to be used. Defaults to ('b', 'y') + epsilon: float, acceptable difference when comparing fragments to scan mz + + Returns + ------- + An HTML report. + + """ + # endregion + + #Read sequence file for target sequences + target_sequences = [] + with open(sequence_file, "r") as f: + for line in f: + if (len(line.strip()) > 0): + target_sequences.append(line.strip()) + found_peptides = _peptides(db_path, target_sequences) + + html_code = this.html_slug + #get all psm entries for each target sequence and work with each set of psms + for pep_seq in found_peptides.keys(): + pep_html = f"

{pep_seq}

##PSM##" + encoded_sequence = None + + logger.info(f"Processing PSMs associated with {pep_seq}") + + if (found_peptides.get(pep_seq)[2] is not None): + encoded_sequence = found_peptides.get(pep_seq)[2] + + target_psms = _psm(db_path, pep_seq) + for tp in target_psms: + pep_html = pep_html.replace("##PSM##", _write_psm_details(tp)) + scan = _scans(db_path, tp['spectrumID']) + if (len(scan['mzValues']) > 0): + matched_peaks = dict() + frags = dict() + for ion_type in ion_types: + frags[ion_type] = _fragment_ions(pep_seq, itype=ion_type, encodedsequence=encoded_sequence) + matched_peaks[ion_type] = _match_mzs(_to_float(scan['mzValues']), frags[ion_type], delta=epsilon) + + plot_html = _generate_ion_plot( + pep_seq, + {"mzvalues": _to_float(scan['mzValues']), "intensity": _to_float(scan["intensities"])}, + frags, matched_peaks) + + pep_html = pep_html.replace("##PSM##", f"
{pep_seq}_{tp['spectrumID']}_{tp['spectrumTitle']}.svg

##FS##

##PSM##") + frag_table = _create_table(pep_seq, matched_peaks, frags, maxcharge, encodedsequence=encoded_sequence) + pep_html = pep_html.replace("##FS##", frag_table) + pep_html = pep_html.replace(f"{pep_seq}_{tp['spectrumID']}_{tp['spectrumTitle']}.svg", plot_html) + else: + pep_html = pep_html.replace("##PSM##", "

No scan mz/intensity values are available for this PSM

##PSM##") + #Populate report + pep_html += """##PEPTIDE##""" + html_code = html_code.replace("##PEPTIDE##", pep_html) + html_code = html_code.replace("##PEP_NAVIGATION##", f"{pep_seq}##PEP_NAVIGATION##") + html_code = html_code.replace("##PEP_NAVIGATION##", "") + html_code = html_code.replace("##PEPTIDE##", "") + html_code = html_code.replace("##PSM##", "") + html_code = html_code.replace("##RUNDATE##", str(datetime.datetime.now())) + write_report(html_code) + +if __name__ == "__main__": + score_psms(sys.argv[1], sys.argv[2]) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..15936b5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,19 @@ +appdirs==1.4.4 +certifi==2020.6.20 +cycler==0.10.0 +jedi==0.17.2 +kiwisolver==1.2.0 +lxml==4.5.2 +numpy==1.19.2 +parso==0.7.1 +Pillow==7.2.0 +plotly==4.10.0 +prompt-toolkit==3.0.7 +ptpython==3.0.5 +Pygments==2.7.0 +pyparsing==2.4.7 +pyteomics==4.3.2 +python-dateutil==2.8.1 +retrying==1.3.3 +six==1.15.0 +wcwidth==0.2.5 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..7e379c3 --- /dev/null +++ b/setup.py @@ -0,0 +1,28 @@ +import setuptools + +_pkg_version = "1.0.0" +_author = "Thomas McGowan" +_author_email = "mcgo0092@umn.edu" +_license = "MIT" +_repo = "https://github.umn.edu/mcgo0092/psm_fragments.git" + + +setuptools.setup( + name="psm_fragments", + version=f"{_pkg_version}", + packages=["psmfragmentation"], + install_requires=["pyteomics", "plotly"], + description="Score PSMs with fragmentation quality check", + author=f"{_author}", + author_email=f"{_author_email}", + url="f{_repo}", + download_url=f"{_repo}/archive/v_{_pkg_version}.tar.gz", + license=f"{_license}", + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + ], + python_requires=">=3.6", +) diff --git a/tests/data/peptide_sequences.txt b/tests/data/peptide_sequences.txt new file mode 100644 index 0000000..9c8bc42 --- /dev/null +++ b/tests/data/peptide_sequences.txt @@ -0,0 +1,9 @@ +YLGTGPEAGLPYGANK +MAGNGGDAALALLLLDR +RGPEQTQGNFGDQELIR +KQQTVTLLPAADLDDFSK +IGMEVTPSGTWLTYTGAIK +GQGVPINTNSSPDDQIGYYRR +RPQGLPNNTASWFTALTQHGK +MAGNGGDAALALLLLDRLNQLESK +RPQGLPNNTASWFTALTQHGKEDLKFPR