diff --git a/.github/workflows/databases-ci.yaml b/.github/workflows/databases-ci.yaml index 3a544c2..8636e99 100644 --- a/.github/workflows/databases-ci.yaml +++ b/.github/workflows/databases-ci.yaml @@ -26,7 +26,7 @@ jobs: strategy: matrix: os: [macOS-latest, ubuntu-latest, windows-latest] - python-version: [3.9, "3.10", "3.11"] + python-version: ["3.10", "3.11"] steps: - uses: actions/checkout@v3 diff --git a/data/3. PKIS Nanosyn Assay Heatmaps.xlsx b/data/3. PKIS Nanosyn Assay Heatmaps.xlsx new file mode 100644 index 0000000..44bc24b Binary files /dev/null and b/data/3. PKIS Nanosyn Assay Heatmaps.xlsx differ diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index 9d93805..eba3192 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -21,3 +21,5 @@ dependencies: - beautifulsoup4 - numpy - biopython + - pydantic + - GitPython diff --git a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/aligners.py b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/aligners.py index f480e31..ed4323c 100644 --- a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/aligners.py +++ b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/aligners.py @@ -1,20 +1,67 @@ +from abc import ABC, abstractmethod from dataclasses import dataclass -from Bio import Align + +class CustomAligner(ABC): + """Custom aligner class for aligning sequences.""" + + substitution_matrix: str = "BLOSUM62" + """str: Substitution matrix used. Default is BLOSUM62.""" + + @abstractmethod + def align(self, *args, **kwargs): + """Abstract method for aligning sequences.""" + ... 
+ + +@dataclass +class ClustalOmegaAligner(CustomAligner): + """ClustalOmega aligner class for multiple sequence alignments (need to initialize with list of sequences).""" + + list_sequences: list[str] + """list[str]: List of sequences to align.""" + path_bin: str = "/usr/local/bin/clustalo" + """str: Path to clustalo binary. Default is "/usr/local/bin/clustalo".""" + + def __post_init__(self): + from biotite.sequence import ProteinSequence, align + + self.alphabet = ProteinSequence.alphabet + self.matrix_substitution = align.SubstitutionMatrix( + self.alphabet, self.alphabet, self.substitution_matrix + ) + self.list_sequences = [ProteinSequence(seq) for seq in self.list_sequences] + self.align() + + def align(self) -> str: + from biotite.application import clustalo + + app = clustalo.ClustalOmegaApp( + self.list_sequences, self.path_bin, self.matrix_substitution + ) + + app.start() + app.join() + self.alignments = app.get_alignment() + self.list_alignments = self.alignments.get_gapped_sequences() @dataclass -class CustomAligner: +class BioAligner(CustomAligner): + """BioPython aligner class for aligning sequences. Initialized without sequences""" + + from Bio import Align + mode: str = "local" """str: Alignment mode. Default is "local".""" - substitution_matrix: str = "BLOSUM62" - """str: Substitution matrix. Default is BLOSUM62.""" gap_score: int = -5 """int: Gap score. Default is -5.""" extend_gap_score: int = -1 """int: Gap extension score. Default is -1.""" def __post_init__(self): + from Bio import Align + self.aligner = Align.PairwiseAligner() self.aligner.mode = self.mode self.aligner.substitution_matrix = Align.substitution_matrices.load( @@ -28,7 +75,7 @@ def align(self, seq1: str, seq2: str) -> Align.MultipleSeqAlignment: @dataclass -class BL2UniProtAligner(CustomAligner): +class BL2UniProtAligner(BioAligner): mode: str = "global" """str: Alignment mode. 
Default is "global.""" @@ -37,7 +84,7 @@ def __post_init__(self): @dataclass -class Kincore2UniProtAligner(CustomAligner): +class Kincore2UniProtAligner(BioAligner): mode: str = "local" """str: Alignment mode. Default is "local.""" diff --git a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/cbioportal.py b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/cbioportal.py index 55bb03b..701d2bc 100644 --- a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/cbioportal.py +++ b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/cbioportal.py @@ -166,3 +166,10 @@ def get_mutations(self): # TODO: implement clinical annotations class + + +def try_except_middle_int(str_in): + try: + return int(str_in[1:-1]) + except ValueError: + return None diff --git a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/kinase_schema.py b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/kinase_schema.py index c28ec93..910469a 100644 --- a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/kinase_schema.py +++ b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/kinase_schema.py @@ -1,12 +1,14 @@ import logging import os from enum import Enum, StrEnum +from itertools import chain import pandas as pd from pydantic import BaseModel, ValidationError, constr, model_validator from typing_extensions import Self from missense_kinase_toolkit.databases import klifs +from missense_kinase_toolkit.databases.aligners import ClustalOmegaAligner from missense_kinase_toolkit.databases.kincore import ( align_kincore2uniprot, extract_pk_fasta_info_as_dict, @@ -85,10 +87,9 @@ class Family(Enum): KinaseDomainName = StrEnum( "KinaseDomainName", {"KD" + str(idx + 1): kd for idx, kd in enumerate(LIST_PFAM_KD)} ) - -UniProtSeq = constr(pattern=r"^[ACDEFGHIKLMNPQRSTVWXY]+$") +SeqUniProt = constr(pattern=r"^[ACDEFGHIKLMNPQRSTVWXY]+$") """Pydantic model for UniProt sequence 
constraints.""" -KLIFSPocket = constr(pattern=r"^[ACDEFGHIKLMNPQRSTVWY\-]{85}$") +SeqKLIFS = constr(pattern=r"^[ACDEFGHIKLMNPQRSTVWY\-]{85}$") """Pydantic model for KLIFS pocket sequence constraints.""" UniProtID = constr(pattern=r"^[A-Z][0-9][A-Z0-9]{3}[0-9]$") """Pydantic model for UniProt ID constraints.""" @@ -107,7 +108,7 @@ class KinHub(BaseModel): class UniProt(BaseModel): """Pydantic model for UniProt information.""" - canonical_seq: UniProtSeq + canonical_seq: SeqUniProt class KLIFS(BaseModel): @@ -120,7 +121,7 @@ class KLIFS(BaseModel): family: Family iuphar: int kinase_id: int - pocket_seq: KLIFSPocket | None + pocket_seq: SeqKLIFS | None class Pfam(BaseModel): @@ -137,7 +138,7 @@ class Pfam(BaseModel): class KinCore(BaseModel): """Pydantic model for KinCore information.""" - seq: UniProtSeq + seq: SeqUniProt start: int | None end: int | None mismatch: list[int] | None @@ -154,7 +155,8 @@ class KinaseInfo(BaseModel): Pfam: Pfam | None KinCore: KinCore | None bool_offset: bool = True - KLIFS2UniProt: dict[str, int] | None = None + KLIFS2UniProtIdx: dict[str, int | None] | None = None + KLIFS2UniProtSeq: dict[str, str | None] | None = None # https://docs.pydantic.dev/latest/examples/custom_validators/#validating-nested-model-fields @model_validator(mode="after") @@ -174,17 +176,14 @@ def change_wrong_klifs_pocket_seq(self) -> Self: # https://stackoverflow.com/questions/68082983/validating-a-nested-model-in-pydantic # skip if other validation errors occur in nested models first @model_validator(mode="after") - @classmethod - def validate_uniprot_length(cls, values): + def validate_uniprot_length(self) -> Self: """Validate canonical UniProt sequence length matches Pfam length if Pfam not None.""" - pfam = values.Pfam - uniprot = values.UniProt - if pfam is not None: - if len(uniprot.canonical_seq) != pfam.protein_length: + if self.Pfam is not None: + if len(self.UniProt.canonical_seq) != self.Pfam.protein_length: raise ValidationError( "UniProt sequence 
length does not match Pfam protein length." ) - return values + return self @model_validator(mode="after") def generate_klifs2uniprot_dict(self) -> Self: @@ -204,9 +203,8 @@ def generate_klifs2uniprot_dict(self) -> Self: ) if temp_obj.list_align is not None: - self.KLIFS2UniProt = dict( - zip(klifs.LIST_KLIFS_REGION, temp_obj.list_align) - ) + self.KLIFS2UniProtIdx = temp_obj.KLIFS2UniProtIdx + self.KLIFS2UniProtSeq = temp_obj.KLIFS2UniProtSeq return self @@ -528,6 +526,111 @@ def create_kinase_models_from_df( return dict_kinase_models +def get_sequence_max_with_exception(list_in: list[int | None]) -> int: + """Get maximum sequence length from dictionary of dictionaries. + + Parameters + ---------- + dict_in : dict[str, dict[str, str | None]] + Dictionary of dictionaries. + + Returns + ------- + int + Maximum sequence length. + """ + try: + return max(list_in) + except ValueError: + return 0 + + +def replace_none_with_max_len(dict_in): + dict_max_len = { + key1: get_sequence_max_with_exception( + [len(val2) for val2 in val1.values() if val2 is not None] + ) + for key1, val1 in dict_in.items() + } + + for region, length in dict_max_len.items(): + for hgnc, seq in dict_in[region].items(): + if seq is None: + dict_in[region][hgnc] = "-" * length + + return dict_in + + +def align_inter_intra_region( + dict_in: dict[str, KinaseInfo], +) -> dict[str, dict[str, str]]: + """Align inter and intra region sequences. 
+ + Parameters + ---------- + dict_in : dict[str, KinaseInfo] + Dictionary of kinase information models + + Returns + ------- + dict[str, dict[str, str]] + Dictionary of aligned inter and intra region + """ + + list_inter_intra = klifs.LIST_INTER_REGIONS + klifs.LIST_INTRA_REGIONS + + dict_align = { + region: {hgnc: None for hgnc in dict_in.keys()} for region in list_inter_intra + } + + for region in list_inter_intra: + list_hgnc, list_seq = [], [] + for hgnc, kinase_info in dict_in.items(): + try: + seq = kinase_info.KLIFS2UniProtSeq[region] + except TypeError: + seq = None + if seq is not None: + list_hgnc.append(hgnc) + list_seq.append(seq) + if len(list_seq) > 2: + aligner_temp = ClustalOmegaAligner(list_seq) + dict_align[region].update( + dict(zip(list_hgnc, aligner_temp.list_alignments)) + ) + else: + # hinge:linker - {'ATR': 'N', 'CAMKK1': 'L'} + # αE:VI - {'MKNK1': 'DKVSLCHLGWSAMAPSGLTAAPTSLGSSDPPTSASQVAGTT'} + dict_align[region].update(dict(zip(list_hgnc, list_seq))) + + replace_none_with_max_len(dict_align) + + return dict_align + + +def reverse_order_dict_of_dict( + dict_in: dict[str, dict[str, str | int | None]], +) -> dict[str, dict[str, str | int | None]]: + """Reverse order of dictionary of dictionaries. 
+ + Parameters + ---------- + dict_in : dict[str, dict[str, str | int | None]] + Dictionary of dictionaries + + Returns + ------- + dict_out : dict[str, dict[str, str | int | None]] + Dictionary of dictionaries with reversed order + + """ + dict_out = { + key1: {key2: dict_in[key2][key1] for key2 in dict_in.keys()} + for key1 in set(chain(*[list(j.keys()) for j in dict_in.values()])) + } + return dict_out + + # # NOT IN USE - USE TO GENERATE ABOVE # import numpy as np @@ -554,7 +657,7 @@ def create_kinase_models_from_df( # df_pivot = pd.DataFrame(df_kinhub[["Family", "SubFamily"]].value_counts()).reset_index().pivot(columns="Family", index="SubFamily", values="count") # df_pivot.loc[df_pivot.index.isin([key for key, val in dict_subfamily.items() if val >= 5]),].dropna(axis=1, how="all") -# # kinase_schema.UniProtSeq +# # kinase_schema.SeqUniProt # "".join(sorted(list(set(chain.from_iterable(df_uniprot["canonical_sequence"].apply(lambda x: list(x)).tolist()))))) # # kinase_schema.KLIFSPocket @@ -594,3 +697,51 @@ def create_kinase_models_from_df( # .apply(lambda x: "".join(x) == "domain"), "name"] # .tolist() # ) +# +# USED FOR INTER MAPPING ASSESSMENT +# dict_kinase = create_kinase_models_from_df() + +# dict_klifs = {i: j for i, j in dict_kinase.items() if \ +# (j.KLIFS is not None and j.KLIFS.pocket_seq is not None)} +# df_klifs_idx = pd.DataFrame([list(j for i, j in val.KLIFS2UniProt.items()) for key, val in dict_klifs.items()], +# columns=klifs.LIST_KLIFS_REGION, index=dict_klifs.keys()) + +# list_region = list(klifs.DICT_POCKET_KLIFS_REGIONS.keys()) + +# dict_start_end = {list_region[i-1]:list_region[i] for i in range(1, len(list_region)-1)} +# dict_cols = {key: list(i for i in df_klifs_idx.columns.tolist() \ +# if i.split(":")[0] == key) for key in list_region} + +# list_inter = [] +# for key, val in dict_start_end.items(): + +# list_temp = [] +# for idx, row in df_klifs_idx.iterrows(): + +# cols_start, cols_end = dict_cols[key], dict_cols[val] + +# start = 
row.loc[cols_start].values +# if np.all(np.isnan(start)): +# max_start = None +# else: +# max_start = np.nanmax(start) + 1 + +# end = row.loc[cols_end].values +# if np.all(np.isnan(end)): +# min_end = None +# else: +# min_end = np.nanmin(end) + +# list_temp.append((max_start, min_end)) + +# list_inter.append(list_temp) + +# df_inter = pd.DataFrame(list_inter, +# index=[f"{key}:{val}" for key, val in dict_start_end.items()], +# columns=df_klifs_idx.index).T +# df_length = df_inter.map(lambda x: try_except_substraction(x[0], x[1])) + +# df_multi = df_length.loc[:, df_length.apply(lambda x: any(x > 0))] +# # BUB1B has 1 residue in b.l intra region that was +# # previously captured in αC:b.l since flanked by None +# list_cols = [i for i in df_multi.columns if i != "αC:b.l"] diff --git a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/klifs.py b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/klifs.py index b455bdc..37d9c78 100644 --- a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/klifs.py +++ b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/klifs.py @@ -148,6 +148,26 @@ """list[str]: List of string of all KLIFS pocket regions in format region:idx.""" +# αC:b.l - {'BUB1B': 'E'} - need to skip this, seems to be in b.l gap region +LIST_INTER_REGIONS = [ + "II:III", + "III:αC", + "IV:V", + "hinge:linker", + "αD:αE", + "αE:VI", + "VII:VIII", +] +"""list[str]: List of inter-region region gaps that exist given analysis.""" + + +LIST_INTRA_REGIONS = [ + "b.l_intra", + "linker_intra", +] +"""list[str]: List of intra-region region gaps that exist given analysis.""" + + class KLIFS(SwaggerAPIClient): """Class to interact with the KLIFS API.""" @@ -340,10 +360,15 @@ class KLIFSPocket: list_klifs_substr_match: list[str | None] = field(default_factory=list) list_substring_idxs: list[list[int | None] | None] = field(default_factory=list) list_align: list[str | None] | None = None + KLIFS2UniProtIdx: 
dict[str, int | None] = field(default_factory=dict) + KLIFS2UniProtSeq: dict[str, str | None] = field(default_factory=dict) def __post_init__(self): self.iterate_klifs_alignment() self.generate_alignment_list(bool_offset=self.offset_bool) + if self.list_align is not None: + self.KLIFS2UniProtIdx.update(dict(zip(LIST_KLIFS_REGION, self.list_align))) + self.generate_alignment_list_including_gaps() @staticmethod def remove_gaps_from_klifs(klifs_string: str) -> str: @@ -445,7 +470,7 @@ def select_correct_alignment( ] if bool_bl: - # manual review showed 2 matches + gap + 5 matches + # manual review showed 2 matches + gap + 5 matches in b.l region list_idx = [ idx for idx, i in enumerate(list_alignments) @@ -453,7 +478,7 @@ def select_correct_alignment( ] region = "b.l" else: - # manual review showed 1 matches + gap + 3 matches + # manual review showed 1 matches + gap + 3 matches in linker region list_idx = [ idx for idx, i in enumerate(list_alignments) @@ -779,3 +804,282 @@ def generate_alignment_list(self, bool_offset: bool) -> list[str | None]: return None self.list_align = list_align + + def get_inter_region(self): + """Get inter-region sequences.""" + + list_region = list(DICT_POCKET_KLIFS_REGIONS.keys()) + dict_start_end = { + list_region[i - 1]: list_region[i] for i in range(1, len(list_region) - 1) + } + dict_cols = { + key: list(i for i in LIST_KLIFS_REGION if i.split(":")[0] == key) + for key in list_region + } + + list_inter = [] + for key1, val1 in dict_start_end.items(): + keys_start, keys_end = dict_cols[key1], dict_cols[val1] + + start = [ + val for key, val in self.KLIFS2UniProtIdx.items() if key in keys_start + ] + if all(v is None for v in start): + max_start = None + else: + max_start = np.nanmax(np.array(start, dtype=float)) + 1 + + end = [val for key, val in self.KLIFS2UniProtIdx.items() if key in keys_end] + if all(v is None for v in end): + min_end = None + else: + min_end = np.nanmin(np.array(end, dtype=float)) + + list_inter.append((max_start, 
min_end)) + + dict_inter = dict( + zip([f"{key}:{val}" for key, val in dict_start_end.items()], list_inter) + ) + + dict_fasta = {i: {} for i in LIST_INTER_REGIONS} + for region in LIST_INTER_REGIONS: + start, end = dict_inter[region][0], dict_inter[region][1] + if start is not None and end is not None: + if end - start == 0: + dict_fasta[region] = None + else: + dict_fasta[region] = self.uniprotSeq[int(start) - 1 : int(end) - 1] + else: + dict_fasta[region] = None + + return dict_fasta + + def recursive_idx_search( + self, + idx: int, + in_dict: dict[str, int], + decreasing: bool, + ): + """Recursively search for index in dictionary. + + Parameters + ---------- + idx : int + Index to start search + in_dict : dict[str, int] + Dictionary to search + decreasing : bool + If True, search in decreasing order; if False, search in increasing order + + Returns + ------- + idx : int + Index in dictionary + + """ + if idx == 0: + return "NONE" + list_keys = list(in_dict.keys()) + if in_dict[list_keys[idx]] is None: + if decreasing: + idx = self.recursive_idx_search(idx - 1, in_dict, True) + else: + idx = self.recursive_idx_search(idx + 1, in_dict, False) + return idx + + def find_intra_gaps( + self, + dict_in: dict[str, int], + bool_bl: bool = True, + ) -> tuple[int, str] | None: + """Find intra-pocket gaps in KLIFS pocket region. 
+ + Parameters + ---------- + dict_in : dict[str, int] + Dictionary of KLIFS regions and their corresponding indices + bool_bl : bool + If True, find intra-region gaps for b.l region; if False, find intra-region gaps for linker region + + Returns + ------- + tuple[str, str] | None + Tuple of intra-region gaps + + """ + if bool_bl: + region, idx_in, idx_out = "b.l", 1, 2 + region, idx_in, idx_out = "b.l", 1, 2 + else: + region, idx_in, idx_out = "linker", 0, 1 + + list_keys = list(dict_in.keys()) + list_idx = [idx for idx, i in enumerate(dict_in.keys()) if region in i] + + # TODO: ATR and CAMKK1 have inter hinge:linker region + start = list_idx[idx_in] + end = list_idx[idx_out] + + if dict_in[list_keys[start]] is None: + start = self.recursive_idx_search(start - 1, dict_in, True) + if dict_in[list_keys[end]] is None: + end = self.recursive_idx_search(end + 1, dict_in, False) + + # STK40 has no b.l region or preceding + if start == "NONE": + return None + + return (dict_in[list_keys[start]], dict_in[list_keys[end]]) + + def return_intra_gap_substr(self, bl_bool) -> str | None: + """Return intra-region gap substring. 
+ + Parameters + ---------- + bl_bool : bool + If True, find intra-region gaps for b.l region; if False, find intra-region gaps for linker region + + Returns + ------- + str | None + Intra-region gap substring + + """ + tuple_idx = self.find_intra_gaps(self.KLIFS2UniProtIdx, bl_bool) + if tuple_idx is None: + return None + else: + start, end = tuple_idx[0], tuple_idx[1] + if end - start == 1: + return None + else: + return self.uniprotSeq[start : end - 1] + + def get_intra_region(self): + """Get intra-region sequences.""" + list_seq = [] + for region in LIST_INTRA_REGIONS: + if region.split("_")[0] == "b.l": + list_seq.append(self.return_intra_gap_substr(True)) + else: + list_seq.append(self.return_intra_gap_substr(False)) + return dict(zip(LIST_INTRA_REGIONS, list_seq)) + + def generate_alignment_list_including_gaps(self): + """Return fully aligned KLIFS pocket.""" + list_region = list(DICT_POCKET_KLIFS_REGIONS.keys()) + + # inter region + dict_inter = self.get_inter_region() + + list_inter_regions = list(dict_inter.keys()) + list_idx_inter = list( + chain( + *[ + list( + idx for idx, j in enumerate(list_region) if j == i.split(":")[0] + ) + for i in list_inter_regions + ] + ) + ) + + list_region_combo = list(list_region) + i = 0 + for idx, val in zip(list_idx_inter, list_inter_regions): + list_region_combo.insert(idx + i + 1, val) + i += 1 + + # intra region + dict_intra = self.get_intra_region() + + idx = list_region_combo.index("b.l") + list_region_combo[idx : idx + 1] = "b.l_1", "b.l_intra", "b.l_2" + + idx = list_region_combo.index("linker") + list_region_combo[idx : idx + 1] = "linker_1", "linker_intra", "linker_2" + + dict_full_klifs_region = {region: None for region in list_region_combo} + + dict_actual = dict(zip(list_region, self.list_klifs_substr_actual)) + # for region in list_region_combo:KL + for region, seq in dict_actual.items(): + if region == "b.l": + dict_full_klifs_region["b.l_1"] = seq[0:2] + dict_full_klifs_region["b.l_2"] = seq[2:] + pass + 
elif region == "linker": + dict_full_klifs_region["linker_1"] = seq[0:1] + dict_full_klifs_region["linker_2"] = seq[1:] + else: + dict_full_klifs_region[region] = seq + + for region, seq in dict_inter.items(): + dict_full_klifs_region[region] = seq + + for region, seq in dict_intra.items(): + dict_full_klifs_region[region] = seq + + self.KLIFS2UniProtSeq = dict_full_klifs_region + + +# # NOT IN USE - USE TO GENERATE ABOVE + +# list_multi = [list(val.KLIFSPocket.list_klifs_region[idx] for idx, entry \ +# in enumerate(val.KLIFSPocket.list_substring_idxs) if entry is not None and len(entry)>1)\ +# for key, val in dict_klifs.items()] +# set(chain(*list_multi)) # {'b.l:b.l', 'linker:linker'} + +# df_klifs_idx = pd.DataFrame([list(j for i, j in val.KLIFS2UniProt.items()) for key, val in dict_klifs.items()], +# columns=klifs.LIST_KLIFS_REGION, index=dict_klifs.keys()) + +# # dict_temp = klifs.DICT_POCKET_KLIFS_REGIONS +# list_region = list(klifs.DICT_POCKET_KLIFS_REGIONS.keys()) + +# dict_start_end = {list_region[i-1]:list_region[i] for i in range(1, len(list_region)-1)} +# dict_cols = {key: list(i for i in df_klifs_idx.columns.tolist() \ +# if i.split(":")[0] == key) for key in list_region} + +# list_inter = [] +# for key, val in dict_start_end.items(): + +# list_temp = [] +# for idx, row in df_klifs_idx.iterrows(): + +# cols_start, cols_end = dict_cols[key], dict_cols[val] + +# start = row.loc[cols_start].values +# if np.all(np.isnan(start)): +# max_start = None +# else: +# max_start = np.nanmax(start) + 1 + +# end = row.loc[cols_end].values +# if np.all(np.isnan(end)): +# min_end = None +# else: +# min_end = np.nanmin(end) + +# list_temp.append((max_start, min_end)) + +# list_inter.append(list_temp) + +# df_inter = pd.DataFrame(list_inter, +# index=[f"{key}:{val}" for key, val in dict_start_end.items()], +# columns=df_klifs_idx.index).T +# df_length = df_inter.map(lambda x: try_except_substraction(x[0], x[1])) + +# # df_one = df_length.loc[:, df_length.apply(lambda x: 
any(x == 1))] +# # df_multi = df_length.loc[:, df_length.apply(lambda x: any(x > 1))] +# df_multi = df_length.loc[:, df_length.apply(lambda x: any(x > 0))] +# # αC:b.l - {'BUB1B': 'E'} - need to skip this, seems to be in b.l gap region + +# df_bl = pd.DataFrame([list(j for i, j in val.KLIFS2UniProt.items() if i in list_idx_dict[0]) \ +# for key, val in dict_klifs.items()], +# columns=list_idx_dict[0], index=dict_klifs.keys()) +# df_bl[df_bl.isnull().any(axis=1)] + +# df_linker = pd.DataFrame([list(j for i, j in val.KLIFS2UniProt.items() if i in list_idx_dict[1]) \ +# for key, val in dict_klifs.items()], +# columns=list_idx_dict[1], index=dict_klifs.keys()) +# df_linker[df_linker.isnull().any(axis=1)] diff --git a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/plot.py b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/plot.py new file mode 100644 index 0000000..29c7563 --- /dev/null +++ b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/plot.py @@ -0,0 +1,158 @@ +import numpy as np +from bokeh.layouts import gridplot + +# from bokeh.models import ColumnDataSource, Plot, Grid, Range1d +from bokeh.models import ColumnDataSource, Range1d +from bokeh.models.glyphs import Rect, Text +from bokeh.plotting import figure +from pydantic.dataclasses import dataclass + + +@dataclass +class SequenceAlignment: + list_sequences: list[str] + """List of sequences to show in aligner.""" + list_ids: list[str] + """List of sequence IDs.""" + dict_colors: dict[str, str] + """Dictionary of colors for each sequence.""" + font_size: int = 9 + """Font size for alignment.""" + plot_width: int = 800 + """Width of the plot.""" + + def __post_init__(self): + self.generate_alignment() + + @staticmethod + def get_colors( + list_str: str, + dict_colors: dict[str, str], + ) -> list[str]: + """Get colors for residue in a given sequence. + + Parameters + ---------- + list_str : str + List of residues in a sequence. 
+ dict_colors : dict[str, str] + Dictionary of colors for each residue. + + Returns + ------- + list[str] + List of colors for each residue. + """ + list_colors = [dict_colors[i] for i in list_str] + return list_colors + + def generate_alignment(self) -> None: + """Generate sequence alignment plot adapted from https://dmnfarrell.github.io/bioinformatics/bokeh-sequence-aligner.""" + + # reverse text and colors so A-Z is top-bottom not bottom-top + list_text = [i for s in self.list_sequences[::-1] for i in s] + colors = self.get_colors(list_text, self.dict_colors) + + N = len(self.list_sequences[0]) + S = len(self.list_sequences) + + x = np.arange(1, N + 1) + y = np.arange(0, S, 1) + # creates a 2D grid of coords from the 1D arrays + xx, yy = np.meshgrid(x, y) + # flattens the arrays + gx = xx.ravel() + gy = yy.flatten() + # use recty for rect coords with an offset + recty = gy + 0.5 + # now we can create the ColumnDataSource with all the arrays + source = ColumnDataSource( + dict( + x=gx, + y=gy, + recty=recty, + text=list_text, + colors=colors, + ) + ) + x_range = Range1d(0, N + 1, bounds="auto") + if N > 100: + viewlen = 100 + else: + viewlen = N + + # entire sequence view (no text, with zoom) + p = figure( + title=None, + frame_width=self.plot_width, + frame_height=50, + x_range=x_range, + y_range=(0, S), + tools="xpan, xwheel_zoom, reset, save", + min_border=0, + toolbar_location="below", + ) + rects = Rect( + x="x", + y="recty", + width=1, + height=1, + fill_color="colors", + line_color=None, + fill_alpha=0.6, + ) + p.add_glyph(source, rects) + p.yaxis.visible = False + p.grid.visible = False + + # sequence text view with ability to scroll along x axis + # view_range is for the close up view + view_range = (0, viewlen) + plot_height = S * 15 + 50 + p1 = figure( + title=None, + frame_width=self.plot_width, + frame_height=plot_height, + x_range=view_range, + y_range=self.list_ids[::-1], + tools="xpan,reset", + min_border=0, + toolbar_location="below", + ) + glyph 
= Text( + x="x", + y="y", + text="text", + text_align="center", + text_color="black", + # text_font = "monospace", + text_font_size=f"{str(self.font_size)}pt", + ) + rects = Rect( + x="x", + y="recty", + width=1, + height=1, + fill_color="colors", + line_color=None, + fill_alpha=0.4, + ) + p1.add_glyph(source, glyph) + p1.add_glyph(source, rects) + p1.grid.visible = False + p1.xaxis.major_label_text_font_style = "bold" + p1.yaxis.minor_tick_line_width = 0 + p1.yaxis.major_tick_line_width = 0 + + self.plot = gridplot([[p], [p1]], toolbar_location="below") + + def show_plot(self) -> None: + """Show sequence alignment plot via Bokeh.""" + from bokeh.plotting import show + + # notebook alternative + # import panel as pn + # pn.extension() + # pn.pane.Bokeh(alignment_klifs_min.plot) + + show(self.plot) diff --git a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/protvar.py b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/protvar.py new file mode 100644 index 0000000..1aa7074 --- /dev/null +++ b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/protvar.py @@ -0,0 +1,66 @@ +import json +import logging +from enum import Enum + +from pydantic.dataclasses import dataclass + +from missense_kinase_toolkit.databases import requests_wrapper +from missense_kinase_toolkit.databases.api_schema import RESTAPIClient + +logger = logging.getLogger(__name__) + + +class ScoreDatabase(str, Enum): + """Enum class to define the score database.""" + + Conservation = "CONSERV" + EVE = "EVE" + ESM1b = "ESM" + AlphaMissense = "AM" + + +@dataclass +class ProtvarScore(RESTAPIClient): + """Class to interact with Protvar API.""" + + database: ScoreDatabase + """Database to query for score: Conservation (CONSERV), EVE (EVE), ESM1b (ESM) and AlphaMissense (AM) scores.""" + uniprot_id: str + """Uniprot ID.""" + pos: int + """Position in the protein where mutation resides.""" + mut: str | None = None + """Mutant residue (1 or 3 letter code); 
disregarded for Conservation score and optional for the other scores; + if None will provide all .""" + + def __post_init__(self): + self.url = "https://www.ebi.ac.uk/ProtVar/api/score//?mt=&name=" + self.create_query_url() + self.query_api() + + def create_query_url(self): + """Create URL for Protvar score API query.""" + + if self.mut is None: + mut_old = "mt=&" + mut_new = "" + else: + mut_old = "" + mut_new = self.mut + + self.url_query = ( + self.url.replace("", self.uniprot_id) + .replace("", str(self.pos)) + .replace(mut_old, mut_new) + .replace("", self.database) + ) + + def query_api(self) -> dict: + header = {"Accept": "application/json"} + res = requests_wrapper.get_cached_session().get(self.url_query, headers=header) + + if res.ok: + self._protvar_score = json.loads(res.text) + else: + print(f"Error: {res.status_code}") + self._protvar_scores = None diff --git a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/utils.py b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/utils.py index 97a6072..b605178 100644 --- a/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/utils.py +++ b/missense_kinase_toolkit/databases/missense_kinase_toolkit/databases/utils.py @@ -183,3 +183,10 @@ def get_repo_root(): return repo.working_tree_dir except git.InvalidGitRepositoryError: return None + + +def try_except_substraction(a, b): + try: + return b - a + except TypeError: + return None diff --git a/missense_kinase_toolkit/databases/poetry.lock b/missense_kinase_toolkit/databases/poetry.lock index ee60261..16c901e 100644 --- a/missense_kinase_toolkit/databases/poetry.lock +++ b/missense_kinase_toolkit/databases/poetry.lock @@ -265,6 +265,68 @@ files = [ [package.dependencies] numpy = "*" +[[package]] +name = "biotite" +version = "1.0.1" +description = "A comprehensive library for computational molecular biology" +optional = false +python-versions = ">=3.10" +files = [ + {file = 
"biotite-1.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b1e64bde3c21c7140318c4a917f350dd569211edb84265f9b4c4d6f06a8e861e"}, + {file = "biotite-1.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7ccd34d524318cd1f5991b795c8c7c1cabc3f95cce8d68d64bb27071b18168f7"}, + {file = "biotite-1.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7d6fef30a95c99951eb7d15dd7dd75123118ce4a1aa4becec8c6cd52462775b8"}, + {file = "biotite-1.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:9f2edcc80672849f751c6efae6a4a9fe711300bdc523aa057ec1f11e6636135c"}, + {file = "biotite-1.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ebc28860990c8be2a41b94c25b1956ee89e034954cdeff2e6731b55608e16358"}, + {file = "biotite-1.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7a0babf7d8f15c39c905e1b61afb05569262b90f323f7f90ca6f0ae2bf5c0759"}, + {file = "biotite-1.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c2c6facb30d8ac348a5f816c9c043a3a7986e8849f11a6de4eec9938cbe9f3c4"}, + {file = "biotite-1.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:abf66c2bc3395ef629a1e2675e29ac2e76dce892a6b6e1c54b5df0ad9d83631e"}, + {file = "biotite-1.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a00d3f5ddb870d2b5231392712edc523f7a08c86048d8710d8583996b7b66579"}, + {file = "biotite-1.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8574396ec38372d91aeda690e7b683c7214fcda1e5ecbb645922a628794750fb"}, + {file = "biotite-1.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e112e515adb7f7eebb91dd9fb59fac354561c16079061610eb0a86605cb949d"}, + {file = "biotite-1.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:3cc6e834eee0e6a9b19fbf8caa1316ff432b613587fbadb8ea60e34fe80b8fc5"}, + {file = "biotite-1.0.1.tar.gz", hash = "sha256:7012158431fd488c26d78d33032550eea1d7af7afd01b48549a7fd239f63dab5"}, +] + +[package.dependencies] +biotraj = ">=1.0,<2.0" +msgpack = ">=0.5.6" +networkx = ">=2.0" +numpy = 
">=1.25" +requests = ">=2.12" + +[package.extras] +test = ["pytest", "pytest-codspeed"] + +[[package]] +name = "biotraj" +version = "1.2.1" +description = "Basic trajectory file format functionality for Biotite; forked from MDTraj" +optional = false +python-versions = ">=3.10" +files = [ + {file = "biotraj-1.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3b121539736477bae20b66d7745be7b16986c69524477a4c10c163281e74285e"}, + {file = "biotraj-1.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:7e4e0c768dd5c530b83cb9611445d0e3c8c2a485a6730ab5ff0ecb03e097b4f0"}, + {file = "biotraj-1.2.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4c8c6c01133c276fcd2d381723f901b4ee29ab644f81e7656ea7aa28851bdac7"}, + {file = "biotraj-1.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:4f1960f32532380433c1d85ad07899c4f63922567edb95e66a7c40112a89afde"}, + {file = "biotraj-1.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:faa0b19eb1f28dce1c4e7fdc230a0867fbf8d4504cf77cf8a2f655625d512e97"}, + {file = "biotraj-1.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0fc14eafa9f65b0b010e766dc67eda028d965b2812b0b702cdcc03b101b14e24"}, + {file = "biotraj-1.2.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:05b02737d68ea11953629af05e2e19bf1c4040ccbf60d4a6fb8b6c7140538d52"}, + {file = "biotraj-1.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:2efdc58a3773830fefee3c0f0fcc8a7f44f545305892304ecf2806017a6b2654"}, + {file = "biotraj-1.2.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:73aec4170beeaea92b48f616341be8162ca528610067f4b10af0f87df9de5907"}, + {file = "biotraj-1.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:77eec3e585e6dd095f75f99d93fc3ed01f8d8f8146fce350c5cbd219bba0251f"}, + {file = "biotraj-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b77c5e4eba4f0eb129be7a45f6298f6b3ec54f7f1ed96ffd32d9fea462c5bd8"}, + {file = "biotraj-1.2.1-cp312-cp312-win_amd64.whl", hash = 
"sha256:b15a58d38dc7cd81e059b0cd5a1832fe8e2b74f533aed8aab6a9039ccddbdec5"}, + {file = "biotraj-1.2.1.tar.gz", hash = "sha256:4d7ad33ad940dbcfb3c2bd228a18f33f88e04657786a9562173b58dc2dd05349"}, +] + +[package.dependencies] +numpy = ">=1.25" +scipy = ">=1.13" + +[package.extras] +lint = ["ruff (==0.6.1)"] +test = ["netcdf4 (>=1.7.1)", "psutil", "pytest"] + [[package]] name = "black" version = "24.8.0" @@ -342,13 +404,13 @@ files = [ [[package]] name = "bokeh" -version = "3.4.3" +version = "3.6.0" description = "Interactive plots and applications in the browser from Python" optional = false -python-versions = ">=3.9" +python-versions = ">=3.10" files = [ - {file = "bokeh-3.4.3-py3-none-any.whl", hash = "sha256:c6f33817f866fc67fbeb5df79cd13a8bb592c05c591f3fd7f4f22b824f7afa01"}, - {file = "bokeh-3.4.3.tar.gz", hash = "sha256:b7c22fb0f7004b04f12e1b7b26ee0269a26737a08ded848fb58f6a34ec1eb155"}, + {file = "bokeh-3.6.0-py3-none-any.whl", hash = "sha256:699e0df76cdfe54b5f574738647bd0ce230fa44fa0fcda5923e1f0f550f83d74"}, + {file = "bokeh-3.6.0.tar.gz", hash = "sha256:0032dc1e76ad097b07626e51584685ff48c65481fbaaad105663b1046165867a"}, ] [package.dependencies] @@ -830,6 +892,17 @@ files = [ {file = "docutils-0.20.1.tar.gz", hash = "sha256:f08a4e276c3a1583a86dce3e34aba3fe04d02bba2dd51ed16106244e8a923e3b"}, ] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +description = "An implementation of lxml.xmlfile for the standard library" +optional = false +python-versions = ">=3.8" +files = [ + {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"}, + {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, +] + [[package]] name = "exceptiongroup" version = "1.2.2" @@ -902,7 +975,6 @@ files = [ [package.dependencies] blinker = ">=1.6.2" click = ">=8.1.3" -importlib-metadata = {version = ">=3.6.0", markers = "python_version < \"3.10\""} itsdangerous = 
">=2.1.2" Jinja2 = ">=3.1.2" Werkzeug = ">=3.0.0" @@ -1214,29 +1286,6 @@ files = [ {file = "imagesize-1.4.1.tar.gz", hash = "sha256:69150444affb9cb0d5cc5a92b3676f0b2fb7cd9ae39e947a5e11a36b4497cd4a"}, ] -[[package]] -name = "importlib-metadata" -version = "8.5.0" -description = "Read metadata from Python packages" -optional = false -python-versions = ">=3.8" -files = [ - {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"}, - {file = "importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7"}, -] - -[package.dependencies] -zipp = ">=3.20" - -[package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] -cover = ["pytest-cov"] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -enabler = ["pytest-enabler (>=2.2)"] -perf = ["ipython"] -test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] -type = ["pytest-mypy"] - [[package]] name = "importlib-resources" version = "6.4.5" @@ -1248,9 +1297,6 @@ files = [ {file = "importlib_resources-6.4.5.tar.gz", hash = "sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065"}, ] -[package.dependencies] -zipp = {version = ">=3.1.0", markers = "python_version < \"3.10\""} - [package.extras] check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] cover = ["pytest-cov"] @@ -1325,7 +1371,6 @@ prompt-toolkit = ">=3.0.41,<3.1.0" pygments = ">=2.4.0" stack-data = "*" traitlets = ">=5" -typing-extensions = {version = "*", markers = "python_version < \"3.10\""} [package.extras] all = ["black", "curio", "docrepr", "exceptiongroup", "ipykernel", "ipyparallel", "ipywidgets", "matplotlib", "matplotlib (!=3.2.0)", "nbconvert", "nbformat", "notebook", "numpy (>=1.22)", "pandas", "pickleshare", "pytest 
(<7)", "pytest (<7.1)", "pytest-asyncio (<0.22)", "qtconsole", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "stack-data", "testpath", "trio", "typing-extensions"] @@ -1544,7 +1589,6 @@ files = [ ] [package.dependencies] -importlib-metadata = {version = ">=4.8.3", markers = "python_version < \"3.10\""} jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0" python-dateutil = ">=2.8.2" pyzmq = ">=23.0" @@ -1636,7 +1680,6 @@ files = [ ] [package.dependencies] -importlib-metadata = {version = ">=4.8.3", markers = "python_version < \"3.10\""} jupyter-server = ">=1.1.2" [[package]] @@ -1708,7 +1751,6 @@ files = [ [package.dependencies] async-lru = ">=1.0.0" httpx = ">=0.25.0" -importlib-metadata = {version = ">=4.8.3", markers = "python_version < \"3.10\""} ipykernel = ">=6.5.0" jinja2 = ">=3.0.3" jupyter-core = "*" @@ -1753,7 +1795,6 @@ files = [ [package.dependencies] babel = ">=2.10" -importlib-metadata = {version = ">=4.8.3", markers = "python_version < \"3.10\""} jinja2 = ">=3.0.3" json5 = ">=0.9.0" jsonschema = ">=4.18.0" @@ -1900,6 +1941,65 @@ files = [ {file = "kiwisolver-1.4.7.tar.gz", hash = "sha256:9893ff81bd7107f7b685d3017cc6583daadb4fc26e4a888350df530e41980a60"}, ] +[[package]] +name = "linkify-it-py" +version = "2.0.3" +description = "Links recognition library with FULL unicode support." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "linkify-it-py-2.0.3.tar.gz", hash = "sha256:68cda27e162e9215c17d786649d1da0021a451bdc436ef9e0fa0ba5234b9b048"}, + {file = "linkify_it_py-2.0.3-py3-none-any.whl", hash = "sha256:6bcbc417b0ac14323382aef5c5192c0075bf8a9d6b41820a2b66371eac6b6d79"}, +] + +[package.dependencies] +uc-micro-py = "*" + +[package.extras] +benchmark = ["pytest", "pytest-benchmark"] +dev = ["black", "flake8", "isort", "pre-commit", "pyproject-flake8"] +doc = ["myst-parser", "sphinx", "sphinx-book-theme"] +test = ["coverage", "pytest", "pytest-cov"] + +[[package]] +name = "markdown" +version = "3.7" +description = "Python implementation of John Gruber's Markdown." +optional = false +python-versions = ">=3.8" +files = [ + {file = "Markdown-3.7-py3-none-any.whl", hash = "sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803"}, + {file = "markdown-3.7.tar.gz", hash = "sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2"}, +] + +[package.extras] +docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"] +testing = ["coverage", "pyyaml"] + +[[package]] +name = "markdown-it-py" +version = "3.0.0" +description = "Python port of markdown-it. Markdown parsing, done right!" 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, +] + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + [[package]] name = "markupsafe" version = "2.1.5" @@ -2022,7 +2122,6 @@ files = [ contourpy = ">=1.0.1" cycler = ">=0.10" fonttools = ">=4.22.0" -importlib-resources = {version = ">=3.2.0", markers = "python_version < \"3.10\""} kiwisolver = ">=1.3.1" numpy = ">=1.23" packaging = ">=20.0" @@ -2076,6 +2175,36 @@ files = [ {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, ] +[[package]] +name = "mdit-py-plugins" +version = "0.4.2" +description = "Collection of plugins for markdown-it-py" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mdit_py_plugins-0.4.2-py3-none-any.whl", hash = "sha256:0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636"}, + {file = "mdit_py_plugins-0.4.2.tar.gz", hash = "sha256:5f2cd1fdb606ddf152d37ec30e46101a60512bc0e5fa1a7002c36647b09e26b5"}, +] + +[package.dependencies] +markdown-it-py = ">=1.0.0,<4.0.0" + +[package.extras] +code-style = ["pre-commit"] +rtd = ["myst-parser", "sphinx-book-theme"] +testing = ["coverage", "pytest", 
"pytest-cov", "pytest-regressions"] + +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + [[package]] name = "mistune" version = "3.0.2" @@ -2219,7 +2348,6 @@ files = [ beautifulsoup4 = "*" bleach = "!=5.0.0" defusedxml = "*" -importlib-metadata = {version = ">=3.6", markers = "python_version < \"3.10\""} jinja2 = ">=3.0" jupyter-core = ">=4.7" jupyterlab-pygments = "*" @@ -2274,6 +2402,25 @@ files = [ {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, ] +[[package]] +name = "networkx" +version = "3.4.2" +description = "Python package for creating and manipulating graphs and networks" +optional = false +python-versions = ">=3.10" +files = [ + {file = "networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f"}, + {file = "networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1"}, +] + +[package.extras] +default = ["matplotlib (>=3.7)", "numpy (>=1.24)", "pandas (>=2.0)", "scipy (>=1.10,!=1.11.0,!=1.11.1)"] +developer = ["changelist (==0.5)", "mypy (>=1.1)", "pre-commit (>=3.2)", "rtoml"] +doc = ["intersphinx-registry", "myst-nb (>=1.1)", "numpydoc (>=1.8.0)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.15)", "sphinx (>=7.3)", "sphinx-gallery (>=0.16)", "texext (>=0.6.7)"] +example = ["cairocffi (>=1.7)", "contextily (>=1.6)", "igraph (>=0.11)", "momepy (>=0.7.2)", "osmnx (>=1.9)", "scikit-learn (>=1.5)", "seaborn (>=0.13)"] +extra = ["lxml (>=4.6)", "pydot (>=3.0.1)", "pygraphviz (>=1.14)", "sympy (>=1.10)"] +test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"] + 
[[package]] name = "notebook" version = "7.2.2" @@ -2368,6 +2515,20 @@ files = [ {file = "numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78"}, ] +[[package]] +name = "openpyxl" +version = "3.1.5" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = false +python-versions = ">=3.8" +files = [ + {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"}, + {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"}, +] + +[package.dependencies] +et-xmlfile = "*" + [[package]] name = "overrides" version = "7.7.0" @@ -2474,6 +2635,60 @@ files = [ {file = "pandocfilters-1.5.1.tar.gz", hash = "sha256:002b4a555ee4ebc03f8b66307e287fa492e4a77b4ea14d3f934328297bb4939e"}, ] +[[package]] +name = "panel" +version = "1.5.3" +description = "The powerful data exploration & web app framework for Python." 
+optional = false +python-versions = ">=3.10" +files = [ + {file = "panel-1.5.3-py3-none-any.whl", hash = "sha256:c2f278d775589198931bd442afeff90bd4f001fff6e353903dd792adf28ab1d7"}, + {file = "panel-1.5.3.tar.gz", hash = "sha256:1280ac768fa88b3bc19282be875c9ad5229cb08089540e9d800dcde62ac5effb"}, +] + +[package.dependencies] +bleach = "*" +bokeh = ">=3.5.0,<3.7.0" +linkify-it-py = "*" +markdown = "*" +markdown-it-py = "*" +mdit-py-plugins = "*" +packaging = "*" +pandas = ">=1.2" +param = ">=2.1.0,<3.0" +pyviz-comms = ">=2.0.0" +requests = "*" +tqdm = "*" +typing-extensions = "*" + +[package.extras] +dev = ["watchfiles"] +fastapi = ["bokeh-fastapi (>=0.1.2)", "fastapi[standard]"] +mypy = ["mypy", "pandas-stubs", "types-bleach", "types-croniter", "types-markdown", "types-psutil", "types-requests", "types-tqdm", "typing-extensions"] +recommended = ["holoviews (>=1.18.0)", "jupyterlab", "matplotlib", "pillow", "plotly"] +tests = ["psutil", "pytest", "pytest-asyncio", "pytest-rerunfailures", "pytest-xdist"] + +[[package]] +name = "param" +version = "2.1.1" +description = "Make your Python code clearer and more reliable by declaring Parameters." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "param-2.1.1-py3-none-any.whl", hash = "sha256:81066d040526fbaa44b6419f3e92348fa8856ea44c8d3915e9245937ddabe2d6"}, + {file = "param-2.1.1.tar.gz", hash = "sha256:3b1da14abafa75bfd908572378a58696826b3719a723bc31b40ffff2e9a5c852"}, +] + +[package.extras] +all = ["aiohttp", "cloudpickle", "coverage[toml]", "flake8", "gmpy", "ipython", "jsonschema", "nbsite (==0.8.4)", "nbval", "nest-asyncio", "numpy", "odfpy", "openpyxl", "pandas", "panel", "pre-commit", "pyarrow", "pytest", "pytest-asyncio", "pytest-xdist", "sphinx-remove-toctrees", "tables", "xlrd"] +doc = ["aiohttp", "nbsite (==0.8.4)", "pandas", "panel", "sphinx-remove-toctrees"] +examples = ["aiohttp", "pandas", "panel"] +lint = ["flake8", "pre-commit"] +tests = ["coverage[toml]", "pytest", "pytest-asyncio"] +tests-deser = ["odfpy", "openpyxl", "pyarrow", "tables", "xlrd"] +tests-examples = ["aiohttp", "nbval", "pandas", "panel", "pytest", "pytest-asyncio", "pytest-xdist"] +tests-full = ["aiohttp", "cloudpickle", "coverage[toml]", "gmpy", "ipython", "jsonschema", "nbval", "nest-asyncio", "numpy", "odfpy", "openpyxl", "pandas", "panel", "pyarrow", "pytest", "pytest-asyncio", "pytest-xdist", "tables", "xlrd"] + [[package]] name = "parso" version = "0.8.4" @@ -2672,28 +2887,27 @@ wcwidth = "*" [[package]] name = "psutil" -version = "6.0.0" +version = "5.9.8" description = "Cross-platform lib for process and system monitoring in Python." 
optional = false -python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ - {file = "psutil-6.0.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a021da3e881cd935e64a3d0a20983bda0bb4cf80e4f74fa9bfcb1bc5785360c6"}, - {file = "psutil-6.0.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:1287c2b95f1c0a364d23bc6f2ea2365a8d4d9b726a3be7294296ff7ba97c17f0"}, - {file = "psutil-6.0.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:a9a3dbfb4de4f18174528d87cc352d1f788b7496991cca33c6996f40c9e3c92c"}, - {file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6ec7588fb3ddaec7344a825afe298db83fe01bfaaab39155fa84cf1c0d6b13c3"}, - {file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:1e7c870afcb7d91fdea2b37c24aeb08f98b6d67257a5cb0a8bc3ac68d0f1a68c"}, - {file = "psutil-6.0.0-cp27-none-win32.whl", hash = "sha256:02b69001f44cc73c1c5279d02b30a817e339ceb258ad75997325e0e6169d8b35"}, - {file = "psutil-6.0.0-cp27-none-win_amd64.whl", hash = "sha256:21f1fb635deccd510f69f485b87433460a603919b45e2a324ad65b0cc74f8fb1"}, - {file = "psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0"}, - {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0"}, - {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd"}, - {file = "psutil-6.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2e8d0054fc88153ca0544f5c4d554d42e33df2e009c4ff42284ac9ebdef4132"}, - {file = "psutil-6.0.0-cp36-cp36m-win32.whl", hash = 
"sha256:fc8c9510cde0146432bbdb433322861ee8c3efbf8589865c8bf8d21cb30c4d14"}, - {file = "psutil-6.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:34859b8d8f423b86e4385ff3665d3f4d94be3cdf48221fbe476e883514fdb71c"}, - {file = "psutil-6.0.0-cp37-abi3-win32.whl", hash = "sha256:a495580d6bae27291324fe60cea0b5a7c23fa36a7cd35035a16d93bdcf076b9d"}, - {file = "psutil-6.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:33ea5e1c975250a720b3a6609c490db40dae5d83a4eb315170c4fe0d8b1f34b3"}, - {file = "psutil-6.0.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ffe7fc9b6b36beadc8c322f84e1caff51e8703b88eee1da46d1e3a6ae11b4fd0"}, - {file = "psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2"}, + {file = "psutil-5.9.8-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:26bd09967ae00920df88e0352a91cff1a78f8d69b3ecabbfe733610c0af486c8"}, + {file = "psutil-5.9.8-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:05806de88103b25903dff19bb6692bd2e714ccf9e668d050d144012055cbca73"}, + {file = "psutil-5.9.8-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:611052c4bc70432ec770d5d54f64206aa7203a101ec273a0cd82418c86503bb7"}, + {file = "psutil-5.9.8-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:50187900d73c1381ba1454cf40308c2bf6f34268518b3f36a9b663ca87e65e36"}, + {file = "psutil-5.9.8-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:02615ed8c5ea222323408ceba16c60e99c3f91639b07da6373fb7e6539abc56d"}, + {file = "psutil-5.9.8-cp27-none-win32.whl", hash = "sha256:36f435891adb138ed3c9e58c6af3e2e6ca9ac2f365efe1f9cfef2794e6c93b4e"}, + {file = "psutil-5.9.8-cp27-none-win_amd64.whl", hash = "sha256:bd1184ceb3f87651a67b2708d4c3338e9b10c5df903f2e3776b62303b26cb631"}, + {file = "psutil-5.9.8-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:aee678c8720623dc456fa20659af736241f575d79429a0e5e9cf88ae0605cc81"}, + {file = "psutil-5.9.8-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:8cb6403ce6d8e047495a701dc7c5bd788add903f8986d523e3e20b98b733e421"}, + {file = "psutil-5.9.8-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d06016f7f8625a1825ba3732081d77c94589dca78b7a3fc072194851e88461a4"}, + {file = "psutil-5.9.8-cp36-cp36m-win32.whl", hash = "sha256:7d79560ad97af658a0f6adfef8b834b53f64746d45b403f225b85c5c2c140eee"}, + {file = "psutil-5.9.8-cp36-cp36m-win_amd64.whl", hash = "sha256:27cc40c3493bb10de1be4b3f07cae4c010ce715290a5be22b98493509c6299e2"}, + {file = "psutil-5.9.8-cp37-abi3-win32.whl", hash = "sha256:bc56c2a1b0d15aa3eaa5a60c9f3f8e3e565303b465dbf57a1b730e7a2b9844e0"}, + {file = "psutil-5.9.8-cp37-abi3-win_amd64.whl", hash = "sha256:8db4c1b57507eef143a15a6884ca10f7c73876cdf5d51e713151c1236a0e68cf"}, + {file = "psutil-5.9.8-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8"}, + {file = "psutil-5.9.8.tar.gz", hash = "sha256:6be126e3225486dff286a8fb9a06246a5253f4c7c53b475ea5f5ac934e64194c"}, ] [package.extras] @@ -2895,6 +3109,45 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pyhmmer" +version = "0.10.15" +description = "Cython bindings and Python interface to HMMER3." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "pyhmmer-0.10.15-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:07dc418d9c1d6a97f8f802e94b93a195f6d3ce8bdc552ac52ef16940f4951db3"}, + {file = "pyhmmer-0.10.15-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c2345e46291f2a6f34a470378ab029d04a012dc2571df599236f7c40c6fa7c16"}, + {file = "pyhmmer-0.10.15-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:320a4a3206d203e52ee53c18245d419c39a467ec3096cf3f9c9fb82f570867be"}, + {file = "pyhmmer-0.10.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:02381f7f08b67b727c9179aedda7643daf9c307ff9684f3e8ea2a35d1cc74b02"}, + {file = "pyhmmer-0.10.15-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1eb007ed89e8ab5c50c2093e9e626de9fb670459c25821ead220ed8b860ee8be"}, + {file = "pyhmmer-0.10.15-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ce588a088375a98487b99edfd088b744844f5e8ea8dec5015f74b57bf542aff1"}, + {file = "pyhmmer-0.10.15-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e277af9830e045aff7834fc0d020dd02b62ed38a85b8dd17245f571f819dbb2"}, + {file = "pyhmmer-0.10.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7c060ebaf729b0946ce564a8747dbde440767c2118523f542500762a293d9f00"}, + {file = "pyhmmer-0.10.15-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:40217c90c7c511076f6ce058352d73aa6e46dad12235dda68d365bc6d8ec7fc1"}, + {file = "pyhmmer-0.10.15-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:946713d711201824993b29d5de65b1c9e42b1e8c7ed54dfbac52ea8cae22c5f5"}, + {file = "pyhmmer-0.10.15-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e1bc19978b9a3168bbf78276b4a3015916fa40fb76378e7a8e56686567c986c3"}, + {file = "pyhmmer-0.10.15-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3eab956d8f41d71074477b5124c8ec17cf28cc353d3023b6ed713c9f71e81ca"}, + {file = 
"pyhmmer-0.10.15-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:470e2899026133a584097be6488adbabed71bf050574de22df88d1f65e365a22"}, + {file = "pyhmmer-0.10.15-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:51ee4820c7afb2d902c7d277924ab13da71e16ba8eef3ed086c62f3aa30449e9"}, + {file = "pyhmmer-0.10.15-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ba573e2791c12b4b5772e93b9023a6557d7e963a2070a3c2dc522e236bed536"}, + {file = "pyhmmer-0.10.15-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1953514d0849e4a4d2e55476b1569350341d470817e4f37c25d24dde6779f1a7"}, + {file = "pyhmmer-0.10.15-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cbd00ee33298f317a4e62d05eaa83b2facc04234374633a11836d1e75a847bfc"}, + {file = "pyhmmer-0.10.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fbdd4a60db4cd4df7958bba89c56c1a252d301a8f01991bae0beb3851b4c80e9"}, + {file = "pyhmmer-0.10.15-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8c31811c61bf3785ec3e50590302d221e5be8ab3c3e529c29b3a2f82794fb5b9"}, + {file = "pyhmmer-0.10.15-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4102d4c37b6004687ebb3cf79403900ac2ed5c854f8b830b6cd4be04bac579c3"}, + {file = "pyhmmer-0.10.15-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4e773bb20a7f787918439cbdfd1cabb8d40b3e2021be93982c30da5e3d3813ce"}, + {file = "pyhmmer-0.10.15-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9fdc2064d135458510a0c7517fc8fc0681fcc9064d47871b873b55734765c17e"}, + {file = "pyhmmer-0.10.15-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa5fa4c957724cf1c9c7cd4d38e0a359aa93817850c7d4fb80277462d5f1cc8e"}, + {file = "pyhmmer-0.10.15-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b877eff56f9a9c22f30c165f6a65dfcbcea651083e4cf8622e5623669265f88a"}, + {file = "pyhmmer-0.10.15-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:58de16aafe9c1a15f02567805ae8519074b6ce4532854c61556b396ee9301aca"}, + {file = "pyhmmer-0.10.15-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dac03e78302161f80fad63416ab3c230330cdc47e5574196b4e59af3b0a63475"}, + {file = "pyhmmer-0.10.15.tar.gz", hash = "sha256:bf8e97ce8da6fb5850298f3074640f3e998d5a655877f865c1592eb057dc7921"}, +] + +[package.dependencies] +psutil = ">=5.8,<6.0" + [[package]] name = "pyparsing" version = "3.1.4" @@ -2982,6 +3235,25 @@ files = [ {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"}, ] +[[package]] +name = "pyviz-comms" +version = "3.0.3" +description = "A JupyterLab extension for rendering HoloViz content." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pyviz_comms-3.0.3-py3-none-any.whl", hash = "sha256:fd26951eebc7950106d481655d91ba06296d4cf352dffb1d03f88f959832448e"}, + {file = "pyviz_comms-3.0.3.tar.gz", hash = "sha256:fde4a017c2213ecee63a9a6741431c845e42a5c7b1588e4a7ba2e4370c583728"}, +] + +[package.dependencies] +param = "*" + +[package.extras] +all = ["flake8", "jupyterlab (>=4.0,<5.0)", "keyring", "pytest", "rfc3986", "setuptools (>=40.8.0)", "twine"] +build = ["jupyterlab (>=4.0,<5.0)", "keyring", "rfc3986", "setuptools (>=40.8.0)", "twine"] +tests = ["flake8", "pytest"] + [[package]] name = "pywin32" version = "306" @@ -3680,7 +3952,6 @@ babel = ">=2.13" colorama = {version = ">=0.4.6", markers = "sys_platform == \"win32\""} docutils = ">=0.20,<0.22" imagesize = ">=1.3" -importlib-metadata = {version = ">=6.0", markers = "python_version < \"3.10\""} Jinja2 = ">=3.1" packaging = ">=23.0" Pygments = ">=2.17" @@ -4000,6 +4271,37 @@ files = [ {file = "tzdata-2024.1.tar.gz", hash = "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd"}, ] +[[package]] +name = "uc-micro-py" +version = "1.0.3" +description = "Micro subset of unicode data files for linkify-it-py projects." 
+optional = false +python-versions = ">=3.7" +files = [ + {file = "uc-micro-py-1.0.3.tar.gz", hash = "sha256:d321b92cff673ec58027c04015fcaa8bb1e005478643ff4a500882eaab88c48a"}, + {file = "uc_micro_py-1.0.3-py3-none-any.whl", hash = "sha256:db1dffff340817673d7b466ec86114a9dc0e9d4d9b5ba229d9d60e5c12600cd5"}, +] + +[package.extras] +test = ["coverage", "pytest", "pytest-cov"] + +[[package]] +name = "upsetplot" +version = "0.9.0" +description = "Draw Lex et al.'s UpSet plots with Pandas and Matplotlib" +optional = false +python-versions = "*" +files = [ + {file = "UpSetPlot-0.9.0.tar.gz", hash = "sha256:95b76ac38c624c9dfb1eca1de1a37e30e07e83678b1c57839c943184247b8592"}, +] + +[package.dependencies] +matplotlib = ">=2.0" +pandas = ">=0.23" + +[package.extras] +testing = ["pytest (>=2.7)", "pytest-cov (<2.6)"] + [[package]] name = "uri-template" version = "1.3.0" @@ -4148,25 +4450,6 @@ files = [ {file = "xyzservices-2024.9.0.tar.gz", hash = "sha256:68fb8353c9dbba4f1ff6c0f2e5e4e596bb9e1db7f94f4f7dfbcb26e25aa66fde"}, ] -[[package]] -name = "zipp" -version = "3.20.2" -description = "Backport of pathlib-compatible object wrapper for zip files" -optional = false -python-versions = ">=3.8" -files = [ - {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"}, - {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"}, -] - -[package.extras] -check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] -cover = ["pytest-cov"] -doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] -enabler = ["pytest-enabler (>=2.2)"] -test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] -type = ["pytest-mypy"] - [[package]] name = "zope-event" version = "5.0" @@ -4238,5 +4521,5 @@ testing = ["coverage 
#!/usr/bin/env python
"""Plot loss/RMSE curves and label histograms for the 5-fold ESM-2 Km,ATP runs.

Reads the per-fold trainer logs written by main.py, the z-scored dataset CSV,
and the raw PKIS2 table, then saves a handful of PNG figures under
``<path>/images``.
"""

import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# os.chdir("/data1/tanseyw/projects/whitej/esm_km_atp/src")
from utils import invert_zscore, load_csv2dataset, parse_stats_dataframes

path = "/data1/tanseyw/projects/whitej/esm_km_atp/"
df = pd.read_csv(os.path.join(path, "assets/pkis2_km_atp.csv"))
# log10 transform before z-scoring
labels = df["ATP Conc.(uM)"].apply(np.log10)

list_runs = [
    "5CV-KinCore-esm2_t6_8M_UR50D",
    "5CV-KLIFS_MIN-esm2_t6_8M_UR50D",
    "5CV-KLIFS_FULL-esm2_t6_8M_UR50D",
]

# glob returns files in arbitrary order, and the position in each list is what
# becomes the fold number below, so sort to keep "fold-1" at index 0, etc.
# NOTE(review): lexicographic order only matches fold order for < 10 folds.
dict_logs = {
    run: sorted(glob.glob(os.path.join(path, run, "*/logs/fold-*.csv")))
    for run in list_runs
}

# Load and process training and evaluation loss #

frames_train, frames_eval, frames_final = [], [], []
for exp, list_file in dict_logs.items():
    for idx, file in enumerate(list_file):
        parsed = parse_stats_dataframes(file, idx)
        for frames, df_temp in zip((frames_train, frames_eval, frames_final), parsed):
            df_temp["exp"] = exp
            frames.append(df_temp)
df_train = pd.concat(frames_train, ignore_index=True)
df_eval = pd.concat(frames_eval, ignore_index=True)
df_final = pd.concat(frames_final, ignore_index=True)

# Load validation and training datasets #

# TODO: Look at all files
ds_val, ds_train = load_csv2dataset(path, 5, "KLIFS_FULL_data.csv")
df_val = pd.concat(
    [ds.to_pandas().assign(fold=idx + 1) for idx, ds in enumerate(ds_val)],
    ignore_index=True,
)


def fold_titles(folds, training=False):
    """Map fold numbers to facet titles carrying the fold's sample count.

    With ``training=True`` the count is the number of samples *outside* the
    fold (i.e. the training split), otherwise the held-out count.
    """
    titles = {}
    for fold in folds:
        in_fold = df_val["fold"] == fold
        # The count is computed outside the f-string: reusing double quotes
        # inside an f-string is a SyntaxError before Python 3.12.
        n = int((~in_fold).sum() if training else in_fold.sum())
        titles[fold] = f"Fold: {fold}\n(n = {n})"
    return titles


# Plot training loss #

list_fold = df_train["fold"].unique().tolist()
df_train["fold_label"] = df_train["fold"].map(fold_titles(list_fold, training=True))
g = sns.FacetGrid(
    df_train, col="fold_label", row="exp", hue="exp", sharey=True, sharex=True
)
g.map(sns.lineplot, "step", "loss")
g.add_legend()
g.set_axis_labels("Steps", "Training Loss")
g.set_titles("{col_name}")
plt.savefig(os.path.join(path, "images/train_loss_2024.10.30.png"))

# Plot evaluation RMSE #

list_fold = df_eval["fold"].unique().tolist()
df_eval["fold_label"] = df_eval["fold"].map(fold_titles(list_fold))
# RMSE is a spread, not a location: undoing the z-score only rescales it by the
# label standard deviation. Adding the label mean back (as invert_zscore does)
# would shift every curve by a constant. pandas' std() uses n - 1, matching
# utils.calc_zscore.
df_eval["log_rmse"] = df_eval["eval_rmse"] * labels.std()
# 10**RMSE(log10) reads as a multiplicative (fold) error on the original scale.
df_eval["orig_rmse"] = 10 ** df_eval["log_rmse"]

# RMSE in units of log10(Km, ATP)

sns.set(font_scale=1.5)
df_eval["exp_label"] = df_eval["exp"].map(
    dict(zip(list_runs, ["KinCore KD", "KLIFS Pocket", "KLIFS Full Region"]))
)
g = sns.FacetGrid(df_eval, col="fold_label", hue="exp_label", sharey=True, sharex=True)
# The grid is switched off per axes rather than via a call on the FacetGrid.
# NOTE(review): FacetGrid does not appear to expose a grid() method; confirm.
for ax in g.axes.flat:
    ax.grid(False)
g.map(sns.lineplot, "step", "log_rmse")
g.set_axis_labels("Steps", "Held-Out RMSE\n" + r"$(log_{10} K_{M, ATP})$")
g.set_titles("{col_name}")
g.add_legend(title="Input sequence")
plt.savefig(os.path.join(path, "images/eval_rmse_unconverted_2024.10.30.png"))

# Convert RMSE to original scale

g = sns.FacetGrid(df_eval, col="fold_label", hue="fold", sharey=False, sharex=False)
for ax in g.axes.flat:
    ax.axvline(500, color="r", linestyle="dashed", linewidth=1)
g.map(sns.lineplot, "step", "orig_rmse")
g.set_axis_labels("Steps", "RMSE, Eval (Converted)")
g.set_titles("{col_name}")
plt.savefig(os.path.join(path, "images/eval_rmse_converted.png"))

# Plot histogram of labels for validation set #

list_fold = df_val["fold"].unique().tolist()
df_val["fold_label"] = df_val["fold"].map(fold_titles(list_fold))
df_val["orig_label"] = [10**x for x in invert_zscore(df_val["label"], labels)]


def plot_fold_histograms(col, xlabel, file_name):
    """Save one histogram of ``df_val[col]`` per fold with its mean marked."""
    g = sns.FacetGrid(df_val, col="fold_label", hue="fold")
    g.map(plt.hist, col)
    g.set_axis_labels(xlabel, "Frequency")
    # Horizontal offset of the annotation: 10% of the overall data range. It is
    # taken from the data directly instead of calling plt.hist, which would draw
    # an extra all-folds histogram on the last facet.
    offset = (df_val[col].max() - df_val[col].min()) * 0.1
    for idx, ax in enumerate(g.axes.flat):
        # The mean must come from the same column that is plotted.
        loc = df_val.loc[df_val["fold"] == idx + 1, col].mean()
        ax.axvline(loc, color="r", linestyle="dashed", linewidth=1)
        ax.text(
            loc + offset,
            0.9,
            "Mean: " + str(round(loc, 2)),
            color="r",
            # x in data units, y as a fraction of the axes height
            transform=ax.get_xaxis_transform(),
        )
    g.set_titles("{col_name}")
    plt.savefig(os.path.join(path, file_name), bbox_inches="tight")


# Leave labels in units of z-score log10(Km, ATP)
plot_fold_histograms(
    "label", "z-score, $log_{10}$Km, ATP", "images/val_label_hist_zscore.png"
)

# Convert labels to original scale
plot_fold_histograms("orig_label", "Km, ATP", "images/val_label_hist_orig.png")
"""Score wild-type and mutant KLIFS pockets with a fine-tuned ESM-2 regressor.

For every row of the input table the model's single regression output is
recorded; mutants (names containing "_") are then expressed as a percent change
relative to their wild-type prediction and plotted against AlphaMissense scores.
"""

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from transformers import AutoTokenizer, EsmForSequenceClassification

path_model = "/data1/tanseyw/projects/whitej/esm_km_atp/5CV-KLIFS_MIN-esm2_t6_8M_UR50D/full/results/checkpoint-12500"

device = "cpu"

model = EsmForSequenceClassification.from_pretrained(path_model).to(device)
tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")

df_klifs_zehir_muts_alphamissense = pd.read_csv(
    "/data1/tanseyw/projects/whitej/esm_km_atp/assets/klifs_zehir_muts_alphamissense.csv"
)

list_outputs = []
# Pure inference: no_grad avoids building an autograd graph for every sequence.
with torch.no_grad():
    for seq in df_klifs_zehir_muts_alphamissense["klifs"]:
        inputs = tokenizer.encode(seq, return_tensors="pt").to(device)
        # Call the module (not .forward) so registered hooks run; .cpu() keeps
        # the conversion valid if `device` is ever switched to a GPU.
        outputs = model(inputs).logits.cpu().numpy()[0][0]
        list_outputs.append(outputs)

dict_outputs = dict(zip(df_klifs_zehir_muts_alphamissense["hgnc_name"], list_outputs))

# Relative change of each mutant versus its wild-type; the wild-type key is the
# part of the name before the first "_".
dict_muts = {}
for key, value in dict_outputs.items():
    if "_" in key:
        wt = key.split("_")[0]
        dict_muts[key] = (value - dict_outputs[wt]) / dict_outputs[wt]

df_klifs_zehir_muts_alphamissense["zscore_percent_change"] = (
    df_klifs_zehir_muts_alphamissense["hgnc_name"].apply(
        lambda x: dict_muts[x] * 100 if x in dict_muts else None
    )
)
# Signed log10 so that increases and decreases stay on opposite sides of zero.
df_klifs_zehir_muts_alphamissense["zscore_percent_change_log"] = (
    df_klifs_zehir_muts_alphamissense["zscore_percent_change"].apply(
        lambda x: np.sign(x) * np.log10(np.abs(x))
    )
)


sns.set(font_scale=2)
sns.set_style(style="white")
plt.figure(figsize=(20, 7))
ax = sns.scatterplot(
    data=df_klifs_zehir_muts_alphamissense,
    x="alphamissense_score",
    y="zscore_percent_change_log",
    hue="alphamissense_class",
)
plt.legend(title="Alphamissense Class")
plt.xlabel("Alphamissense Score")
plt.ylabel(r"$log_{10}$" + " Predicted Z-score\n% Change vs. Wild-Type")
plt.savefig(
    "/data1/tanseyw/projects/whitej/esm_km_atp/images/zscore_percent_change_vs_alphamissense_score_log.png",
    bbox_inches="tight",
)
def main():
    """Fine-tune ESM-2 as a regressor on every CV fold and on the full dataset.

    Steps: parse CLI arguments, write the shuffled/z-scored dataset CSV, build
    the k validation/training splits plus one "full" entry (train == val == all
    data), tokenize, then train one model per entry and dump the trainer's log
    history to ``<run dir>/<entry>/logs/<entry>_trainer_state_log.csv``.
    """
    args = parsearg_utils()

    # The dataset CSV is named after the second "-"-separated field of the run
    # name (e.g. "5CV-KinCore-..." -> "KinCore_data.csv"). Without such a field
    # save_csv2csv falls back to a timestamped name.
    run_fields = args.wandbRun.split("-")
    csv_out = f"{run_fields[1]}_data.csv" if len(run_fields) > 1 else None

    # load data
    df = pd.read_csv(os.path.join(args.path, args.inputData))
    csv_out = save_csv2csv(
        df=df,
        path=args.path,
        csv_name=csv_out,
        seed=args.seed,
        col_seq=args.columnSeq,
    )

    # split data
    ds_val, ds_train = load_csv2dataset(
        args.path,
        args.kFold,
        csv_name=csv_out,
    )

    # full dataset: appended as the last entry of both lists
    file_path = os.path.join(args.path, "assets", csv_out)

    ds_full = load_dataset("csv", data_files=file_path)
    ds_val.append(ds_full["train"])
    ds_train.append(ds_full["train"])

    # encode data
    # NOTE(review): max_length equals the longest raw sequence; if the tokenizer
    # adds special tokens, the longest sequences get truncated — confirm.
    max_len = df[args.columnSeq].str.len().max()
    dict_token_args = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": max_len,
    }
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    # https://huggingface.co/docs/datasets/v1.5.0/processing.html
    col_seq = "seq"

    def encode(list_datasets):
        """Tokenize the sequence column of each dataset and drop the raw text."""
        return [
            dataset.map(
                lambda x: tokenizer(x[col_seq], **dict_token_args), batched=True
            ).remove_columns([col_seq])
            for dataset in list_datasets
        ]

    encode_val = encode(ds_val)
    # Training encodings must come from the training splits; building them from
    # ds_val would train every fold on its own validation data.
    encode_train = encode(ds_train)

    # create dictionary for encoding
    dict_names = [f"fold-{i + 1}" for i in range(args.kFold)]
    dict_names.append("full")

    dict_encode = {
        name: {"val": val, "train": train}
        for name, val, train in zip(dict_names, encode_val, encode_train)
    }

    if args.wandbProject != "":
        os.environ["WANDB_PROJECT"] = args.wandbProject
        os.environ["WANDB_LOG_MODEL"] = "checkpoint"

    for key, dataset in dict_encode.items():
        # create sub-directories for each fold
        if args.wandbRun != "":
            path_main = os.path.join(args.path, args.wandbRun, str(key))
        else:
            path_main = os.path.join(args.path, str(key))
        path_results = os.path.join(path_main, "results")
        path_logs = os.path.join(path_main, "logs")
        for path in (path_results, path_logs):
            # makedirs also creates the run/fold parents and tolerates reruns
            os.makedirs(path, exist_ok=True)

        # training arguments
        dict_training_args = {
            "learning_rate": args.learningRate,
            "num_train_epochs": args.epochs,
            "per_device_train_batch_size": args.tBatch,
            "per_device_eval_batch_size": args.vBatch,
            "warmup_steps": args.warmup,
            "weight_decay": args.weightDecay,
            "output_dir": path_results,
            "overwrite_output_dir": args.overwrite,
            "save_total_limit": args.saveLim,
            "evaluation_strategy": args.evalStrategy,
            "save_strategy": args.saveStrategy,
            "load_best_model_at_end": args.loadBest,
            "logging_dir": path_logs,
            "logging_steps": args.loggingSteps,
        }
        if args.wandbProject != "":
            dict_training_args["report_to"] = "wandb"
        if args.wandbRun != "":
            dict_training_args["run_name"] = args.wandbRun
        training_args = TrainingArguments(**dict_training_args)

        # load a fresh model per fold; num_labels=1 for regression
        model = EsmForSequenceClassification.from_pretrained(
            args.model, num_labels=1, problem_type="regression"
        )

        # set trainer
        trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=dataset["train"].with_format("torch"),
            eval_dataset=dataset["val"].with_format("torch"),
        )
        trainer.train()

        pd.DataFrame(trainer.state.log_history).to_csv(
            os.path.join(path_logs, f"{key}_trainer_state_log.csv"), index=False
        )
def _str2bool(value: str) -> bool:
    """Parse a command-line boolean such as "true"/"False"/"0"/"yes"."""
    lowered = value.strip().lower()
    if lowered in {"true", "t", "yes", "y", "1"}:
        return True
    if lowered in {"false", "f", "no", "n", "0", ""}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parsearg_utils(argv=None):
    """Argument parser to finetune ESM-2 model from HuggingFace.

    Parameters
    ----------
    argv : list[str] | None
        Arguments to parse; ``None`` (the default) reads ``sys.argv[1:]``.

    Returns
    -------
    argparse.Namespace
        Parsed arguments.
    """

    parser = argparse.ArgumentParser(
        description="Run ESM-2 model from transformer library on PKIS2 Km, ATP data."
    )

    # Boolean options use _str2bool: with type=bool every non-empty string,
    # including "False", would be parsed as True.
    parser.add_argument(
        "-b",
        "--loadBest",
        help="Load best model at end (bool; default: True)",
        default=True,
        type=_str2bool,
    )

    parser.add_argument(
        "-c",
        "--columnSeq",
        help="Column containing sequence (str; default: seq_kincore)",
        default="seq_kincore",
        type=str,
    )

    parser.add_argument(
        "-d",
        "--weightDecay",
        help="Weight decay (float; default: 0.1)",
        default=0.1,
        type=float,
    )

    parser.add_argument(
        "--inputData",
        help="Path to csv file to load (str; default: assets/pkis2_km_atp.csv)",
        default="assets/pkis2_km_atp.csv",
        type=str,
    )

    parser.add_argument(
        "-e",
        "--epochs",
        help="Number of training epochs (int; default: 500)",
        default=500,
        type=int,
    )

    parser.add_argument(
        "-g",
        "--loggingSteps",
        help="Logging steps (int; default: 1)",
        default=1,
        type=int,
    )

    parser.add_argument(
        "-k",
        "--kFold",
        help="K-fold (int; default: 5)",
        default=5,
        type=int,
    )

    parser.add_argument(
        "-l",
        "--saveLim",
        help="Save total limit (int; default: 2)",
        default=2,
        type=int,
    )

    parser.add_argument(
        "-m",
        "--model",
        help="Model name (str; default: facebook/esm2_t6_8M_UR50D)",
        default="facebook/esm2_t6_8M_UR50D",
        type=str,
    )

    parser.add_argument(
        "-n",
        "--noSplit",
        help="No-split flag (store_true; default: False)",
        action="store_true",
    )

    parser.add_argument(
        "-o",
        "--overwrite",
        help="Overwrite output directory (bool; default: True)",
        default=True,
        type=_str2bool,
    )

    parser.add_argument(
        "-p",
        "--path",
        help="Path to save data model and data, if applicable (str)",
        default="/data1/tanseyw/projects/whitej/esm_km_atp",
        type=str,
    )

    parser.add_argument(
        "-r",
        "--learningRate",
        help="Learning rate (float; default: 0.000001)",
        default=0.000001,
        type=float,
    )

    parser.add_argument(
        "-s",
        "--seed",
        help="Random seed (int; default: 42)",
        default=42,
        type=int,
    )

    parser.add_argument(
        "-t",
        "--tBatch",
        help="Training batch size (int; default: 8)",
        default=8,
        type=int,
    )

    parser.add_argument(
        "-v",
        "--vBatch",
        help="Validation batch size (int; default: 8)",
        default=8,
        type=int,
    )

    parser.add_argument(
        "-w",
        "--warmup",
        help="Number of warm-up steps (int; default: 500)",
        default=500,
        type=int,
    )

    parser.add_argument(
        "--evalStrategy",
        help="Evaluation strategy (str; default: steps)",
        default="steps",
        type=str,
    )

    parser.add_argument(
        "--saveStrategy",
        help="Save strategy (str; default: steps)",
        default="steps",
        type=str,
    )

    parser.add_argument(
        "--wandbProject",
        help="Weights and Biases project (str; default: seq_atp_affinity)",
        default="seq_atp_affinity",
        type=str,
    )

    parser.add_argument(
        "--wandbRun",
        help='Weights and Biases run (str; default: "")',
        default="",
        type=str,
    )

    args = parser.parse_args(argv)

    return args
default: 42)", + default=42, + type=int, + ) + + parser.add_argument( + "-t", + "--tBatch", + help="Training batch size (int; default: 16)", + default=8, + type=int, + ) + + parser.add_argument( + "-v", + "--vBatch", + help="Validation batch size (int; default: 16)", + default=8, + type=int, + ) + + parser.add_argument( + "-w", + "--warmup", + help="Number of warm-up steps (int; default: 500)", + default=500, + type=int, + ) + + parser.add_argument( + "--evalStrategy", + help="Evaluation strategy (str; default: steps)", + default="steps", + type=str, + ) + + parser.add_argument( + "--saveStrategy", + help="Save strategy (str; default: steps)", + default="steps", + type=str, + ) + + parser.add_argument( + "--wandbProject", + help="Weights and Biases project (str; default: seq_atp_affinity)", + default="seq_atp_affinity", + type=str, + ) + + parser.add_argument( + "--wandbRun", + help='Weights and Biases run (str; default: "")', + default="", + type=str, + ) + + args = parser.parse_args() + + return args + + +def calc_zscore( + list_in: list[float | int], +) -> list[float]: + """Calculate z-scores for a list of values.""" + mean = sum(list_in) / len(list_in) + std = (sum([(x - mean) ** 2 for x in list_in]) / (len(list_in) - 1)) ** 0.5 + list_out = [(x - mean) / std for x in list_in] + return list_out + + +def invert_zscore( + list_zscore: list[float], + list_orig: list[float], +): + """Convert back to original scale from z-scores.""" + mean = sum(list_orig) / len(list_orig) + std = (sum([(x - mean) ** 2 for x in list_orig]) / (len(list_orig) - 1)) ** 0.5 + list_out = [(z * std) + mean for z in list_zscore] + return list_out + + +def save_csv2csv( + df: pd.DataFrame, + path: str, + csv_name: str | None = None, + seed: int = 42, + col_seq: str = "kd", + col_lab: str = "ATP Conc.(uM)", +): + """ + Process data for ESM-2 model from HuggingFace. + Extracts sequence and labels and saves as Dataset in assets sub-dir. 
+ + Parameters + ---------- + df : pd.DataFrame + Dataframe with columns for sequence and label. + path : str + Path to save data. + csv_name : str + Name of csv file; default is None. + seed : int + Random seed. + col_seq : str + Column name for sequence; default is "kd". + col_lab : str + Column name for label; default is "ATP Conc.(uM)". + + Returns + ------- + None + """ + df = df.loc[df["Mutant"].apply(lambda x: x is False),].reset_index(drop=True) + df_shuffle = df.copy().sample(frac=1, random_state=seed).reset_index(drop=True) + df_out = df_shuffle[[col_seq, col_lab]] + df_out.columns = ["seq", "label"] + # df_out["label"] = df_out["label"].astype(float) + df_out["label"] = calc_zscore(df_out["label"].apply(np.log10)) + + if csv_name is None: + x = datetime.datetime.now() + csv_name = f"{x.strftime('%Y%m%d_%H%M%S')}_data.csv" + + data = Dataset.from_pandas(df_out) + data.to_csv(os.path.join(path, "assets", csv_name), index=False) + + return csv_name + + +def load_csv2dataset( + path: str, + k_fold: int, + csv_name: str, +): + """Load data from csv file to dataset.""" + k_interval = int(100 / k_fold) + file_path = os.path.join(path, "assets", csv_name) + list_val = [f"train[{k}%:{k+k_interval}%]" for k in range(0, 100, k_interval)] + list_train = [ + f"train[:{k}%]+train[{k+k_interval}%:]" for k in range(0, 100, k_interval) + ] + + ds_val = load_dataset("csv", data_files=file_path, split=list_val) + ds_train = load_dataset("csv", data_files=file_path, split=list_train) + + return ds_val, ds_train + + +def compute_metrics(eval_pred): + predictions, labels = eval_pred + rmse = mean_squared_error(labels, predictions, squared=False) + return {"rmse": rmse} + + +def parse_stats_dataframes( + file: str, + idx: int | None = None, +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """Save training, evaluation, and final stats from trainer state log to dataframes. + + Parameters + ---------- + file : str + File name. 
def plot_label_histogram(
    val_df: pd.DataFrame,
    bool_orig: bool = True,
    labels: list[float] | None = None,
    path: str = "/data1/tanseyw/projects/whitej/esm_km_atp/",
):
    """Plot histograms of labels for validation set.

    One facet per fold, each annotated with the fold mean of the plotted
    column. The figure is saved under ``<path>/images``. Note that helper
    columns ("fold_label" and, when computed, "orig_label") are added to
    ``val_df`` in place.

    Parameters
    ----------
    val_df : pd.DataFrame
        Validation dataframe with "fold" and (z-scored) "label" columns.
    bool_orig : bool
        If True, plot labels in original scale.
    labels : list[float] | None
        log10 labels the z-scores were computed from; used to convert
        "label" back to the original scale when bool_orig is True.
    path : str
        Project directory containing the images sub-dir.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If bool_orig is True but neither ``labels`` nor a precomputed
        "orig_label" column is available.
    """
    fold_titles = {}
    for fold in val_df["fold"].unique().tolist():
        # Counted outside the f-string: reusing double quotes inside an
        # f-string is a SyntaxError before Python 3.12.
        n_fold = sum(val_df["fold"] == fold)
        fold_titles[fold] = f"Fold: {fold}\n(n = {n_fold})"
    val_df["fold_label"] = val_df["fold"].map(fold_titles)

    if bool_orig:
        if labels is not None:
            val_df["orig_label"] = [
                10**x for x in invert_zscore(val_df["label"], labels)
            ]
        elif "orig_label" not in val_df.columns:
            raise ValueError(
                "labels is required to plot in original scale when val_df has "
                "no 'orig_label' column"
            )
        col, xlabel = "orig_label", "Km, ATP"
        file_name = "images/val_label_hist_orig.png"
    else:
        col, xlabel = "label", "z-score, $log_{10}$Km, ATP"
        file_name = "images/val_label_hist_zscore.png"

    g = sns.FacetGrid(val_df, col="fold_label", hue="fold")
    g.map(plt.hist, col)
    g.set_axis_labels(xlabel, "Frequency")

    # Horizontal offset of the annotation: 10% of the overall data range, taken
    # from the data itself. Calling plt.hist for the bin edges would draw an
    # extra all-folds histogram on the last facet.
    offset = (val_df[col].max() - val_df[col].min()) * 0.1

    for idx, ax in enumerate(g.axes.flat):
        # The mean must come from the same column that is plotted.
        loc = val_df.loc[val_df["fold"] == idx + 1, col].mean()
        ax.axvline(loc, color="r", linestyle="dashed", linewidth=1)
        ax.text(
            loc + offset,
            0.9,
            "Mean: " + str(round(loc, 2)),
            color="r",
            # x in data units, y as a fraction of the axes height
            transform=ax.get_xaxis_transform(),
        )

    g.set_titles("{col_name}")

    plt.savefig(os.path.join(path, file_name), bbox_inches="tight")
"execution_count": 8, "id": "c61efdee-43bf-47d0-993f-cf5a1bcee962", "metadata": {}, "outputs": [], @@ -121,7 +132,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "7a0a522f-3adb-4d37-9f82-2c5f3b4d8ec0", "metadata": {}, "outputs": [ @@ -272,6 +283,65 @@ " plt.title(f\"{cols} Color Scheme\")\n", " plt.savefig(f\"KLIFS_pocket_{cols}.pdf\", bbox_inches=\"tight\");" ] + }, + { + "cell_type": "markdown", + "id": "ee97336c-6a6a-4f43-9592-50fc626fcb14", + "metadata": {}, + "source": [ + "## Assess inter-KLIFS region" + ] + }, + { + "cell_type": "markdown", + "id": "4ce20718-12d5-430b-9c16-6663e2c00a4f", + "metadata": {}, + "source": [ + "### In progress" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "23a9b967-16eb-4290-ba8d-ff597504da0b", + "metadata": {}, + "outputs": [], + "source": [ + "dict_aligned = kinase_schema.align_inter_intra_region(dict_klifs)\n", + "dict_replace = kinase_schema.reverse_order_dict_of_dict(dict_aligned)\n", + "\n", + "for key in dict_klifs.keys():\n", + " dict_klifs[key].KLIFS2UniProtSeq.update(dict_replace[key])" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "8eb48a13-29dc-4ea5-ac73-d8a028e0af8f", + "metadata": {}, + "outputs": [], + "source": [ + "# TEST\n", + "for hgnc, klifs_pocket in dict_klifs.items():\n", + " idx1 = min([i for i in klifs_pocket.KLIFS2UniProtIdx.values() if i is not None])\n", + " idx2 = max([i for i in klifs_pocket.KLIFS2UniProtIdx.values() if i is not None])\n", + " str1 = klifs_pocket.UniProt.canonical_seq[idx1-1:idx2]\n", + " str2 = \"\".join([*klifs_pocket.KLIFS2UniProtSeq.values()]).replace(\"-\", \"\")\n", + " # str2 = \"\".join([i for i in dict_klifs[hgnc].KLIFS2UniProtSeq.values() if i is not None]).replace(\"-\", \"\")\n", + " if str1 != str2:\n", + " print(hgnc)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "01832b58-5a0f-4d4f-b8de-e4f567757945", + "metadata": {}, + "outputs": [], + "source": [ + "dict_seq = 
{hgnc: \"\".join([*klifs_pocket.KLIFS2UniProtSeq.values()])\\\n", + " for hgnc, klifs_pocket in dict_klifs.items()}" + ] } ], "metadata": { diff --git a/notebooks/pkis2_km_atp.ipynb b/notebooks/pkis2_km_atp.ipynb new file mode 100644 index 0000000..151ca6d --- /dev/null +++ b/notebooks/pkis2_km_atp.ipynb @@ -0,0 +1,1336 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "bad4e7d7-78ea-4f11-aa61-6778a26a74cb", + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "390a7844-83f1-4ac6-86a6-eb94d9f96426", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "from upsetplot import from_contents, plot\n", + "from tqdm.notebook import tqdm\n", + "\n", + "from missense_kinase_toolkit.databases import (\n", + " cbioportal, \n", + " hgnc, \n", + " kinase_schema,\n", + ")\n", + "\n", + "from missense_kinase_toolkit.databases.utils import (\n", + " create_strsplit_list,\n", + " return_list_out,\n", + " try_except_match_str2dict,\n", + " replace_string_using_dict,\n", + " try_except_convert_str2int,\n", + ")\n", + "\n", + "from missense_kinase_toolkit.databases import (\n", + " uniprot,\n", + " pfam,\n", + ")\n", + "\n", + "from missense_kinase_toolkit.databases.colors import DICT_COLORS\n", + "from missense_kinase_toolkit.databases.plot import SequenceAlignment\n", + "from missense_kinase_toolkit.databases.protvar import ProtvarScore" + ] + }, + { + "cell_type": "markdown", + "id": "46d77af1-541c-4a25-a254-8b1fcc0deb40", + "metadata": {}, + "source": [ + "# Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c77dca44-03b4-4bf8-b621-ee8037f98b7d", + "metadata": {}, + "outputs": [], + "source": [ + "df_pkis = pd.read_excel(\"../data/3. 
PKIS Nanosyn Assay Heatmaps.xlsx\",\n", + " sheet_name=\"Assay and Panel information\")\n", + "\n", + "df_kinhub = pd.read_csv(\"../data/kinhub.csv\")\n", + "df_uniprot = pd.read_csv(\"../data/kinhub_uniprot.csv\")\n", + "# not currently in use\n", + "# df_klifs = pd.read_csv(\"../data/kinhub_klifs.csv\")\n", + "df_pfam = pd.read_csv(\"../data/kinhub_pfam.csv\")" + ] + }, + { + "cell_type": "markdown", + "id": "fb3641e3-0fa1-4747-843f-412dd6cc56c6", + "metadata": {}, + "source": [ + "# Extract $K_{M, ATP}$ from PKIS2 data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "6d140b9f-29c8-48e8-aaab-15b5df50a983", + "metadata": {}, + "outputs": [], + "source": [ + "df_uniprot_kinhub = df_uniprot.merge(df_kinhub, \n", + " left_on=\"uniprot_id\", \n", + " right_on=\"UniprotID\", \n", + " how=\"left\")\n", + "df_pkis_copy = df_pkis.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "37fbb0d4-b686-48fd-bba8-da7972cebfeb", + "metadata": {}, + "outputs": [], + "source": [ + "list_mut = [\n", + " \"ABL1-H396P\",\n", + " \"ABL1-M351T\",\n", + " \"ABL1-Q252H\",\n", + " \"ABL1-T315I\",\n", + " \"ABL1-Y253F\",\n", + " \"BRAF-V599E\",\n", + " \"EGFR-L858R\",\n", + " \"EGFR-L861Q\",\n", + " \"EGFR-T790M\",\n", + " \"EGFR-T790M-L858R\",\n", + " \"FLT-3-D835Y\",\n", + " \"KIT-D816V\",\n", + " \"KIT-T6701\",\n", + " \"KIT-V560G\",\n", + " \"LRRK2-G2019S\",\n", + " \"PDGFR-ALPHA-D842V\",\n", + " \"PDGFR-ALPHA-T674I\",\n", + " \"PDGFR-ALPHA-V561D\",\n", + " \"RET-V804L\",\n", + " \"RET-Y791F\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "a471f0b1-1dbe-4888-982b-9d6f83a4b8e9", + "metadata": {}, + "outputs": [], + "source": [ + "# drop lipid kinases and those whose identities we can't resolve\n", + "list_drop = [\n", + " \"PI3-KINASE-ALPHA\", # lipid kinase\n", + " \"PI3-KINASE-DELTA\", # lipid kinase\n", + " \"PI3-KINASE-GAMMA\", # lipid kinase\n", + " \"PI4-K-BETA\", # lipid kinase\n", + " \"SPHK1\", # lipid 
kinase\n", + " \"SPHK2\", # lipid kinase\n", + " \"AMP-A1B1G1\", # B1G1 subunits not kinase entries\n", + " \"AMP-A2B1G1\", # B1G1 subunits not kinase entries\n", + " \"CK2\", # not sure if CSNK2A1 or CSNK2A2; catalog not available\n", + " \"CDK2-CYCLINE\", # complexed with cycline - cannot account for\n", + " \"PKCB\", # not sure how this differs from PKC-BETA1\n", + " \"TRKB-L\", # not sure how this differs from TRKB\n", + " \"PKD2\", # cannot tell difference between PRKD2 using UniProt ID\n", + "]\n", + "\n", + "df_pkis_copy = df_pkis_copy.loc[~df_pkis_copy[\"Assay Name\"].isin(list_drop), ].reset_index(drop=True)\n", + "\n", + "# two minor manual adjustments causing issues\n", + "df_pkis_copy.loc[df_pkis_copy[\"Assay Name\"].apply(lambda x: x == \"EGFR-002\"), \"Assay Name\"] = \"EGFR\"\n", + "df_pkis_copy[\"Assay Name\"] = df_pkis_copy[\"Assay Name\"].apply(lambda x: x.replace(\"ABL-\", \"ABL1-\"))\n", + "\n", + "list_assay_set = create_strsplit_list(df_pkis_copy[\"Assay Name\"].tolist())\n", + "\n", + "list_hgnc = df_uniprot_kinhub[\"hgnc_name\"].tolist()\n", + "list_xname = df_uniprot_kinhub[\"xName\"].apply(lambda x: x.split(\", \")).tolist()\n", + "list_manning = df_uniprot_kinhub[\"Manning Name\"].apply(lambda x: x.split(\", \")).tolist()\n", + "list_combo = [[x] + y + z for x, y, z in zip(list_hgnc, list_xname, list_manning)]\n", + "\n", + "# check if assay set in HGNC name, xName, or Manning Name\n", + "list_out, set_out = return_list_out(list_combo, list_assay_set)\n", + "\n", + "# [f\"{df_pkis_copy.assay_set.iloc[idx]}: {i}\" for idx, i in enumerate(list(set_out)) if len(i) > 1]\n", + "\n", + "dict_exact = {\n", + " \"ARK5\" : \"NUAK1\",\n", + " \"CK1\" : \"CK1a\",\n", + " \"CRAF\" : \"RAF1\",\n", + " \"LRRK\" : \"LRRK2\",\n", + " \"LYNA\" : \"LYN\",\n", + " \"MEK1\" : \"MAP2K1\",\n", + " \"P70S6K1\" : \"RPS6KB1\",\n", + " \"PKA\" : \"PKACa\",\n", + " \"PAR-1B-ALPHA\" : \"MARK2\",\n", + " \"PKC-BETA1\" : \"PKC-B\",\n", + " \"PRAK\" : \"MAPKAPK5\",\n", 
+ " \"PTK5\" : \"FRK\",\n", + " \"SNF1LK2\" : \"QIK\"\n", + "}\n", + "\n", + "dict_partial = {\n", + " \"AURORA\" : \"AURK\",\n", + " \"ALPHA\" : \"A\",\n", + " \"BETA\" : \"B\",\n", + " \"DELTA\" : \"D\",\n", + " \"EPSILON\" : \"E\",\n", + " \"GAMMA\" : \"G\",\n", + " \"-ETA\" : \"H\",\n", + " \"IOTA\" : \"I\",\n", + " \"THETA\" : \"T\"\n", + "}\n", + "\n", + "list_idx_nan = [idx for idx, i in enumerate(list_out) if i is np.nan]\n", + "list_name_nan = [i if idx in list_idx_nan else np.nan for idx, i in enumerate(df_pkis_copy[\"Assay Name\"])]\n", + "list_nan_rep1 = [try_except_match_str2dict(x, dict_exact) for x in list_name_nan]\n", + "list_nan_rep2 = [replace_string_using_dict(x, dict_partial) for x in list_nan_rep1]\n", + "\n", + "list_assay_nan = create_strsplit_list(list_nan_rep2)\n", + "list_out_nan, set_out_nan = return_list_out(list_combo, list_assay_nan)\n", + "# [f\"{df_pkis_copy['Assay Name'].iloc[idx]}: {i}\" for idx, i in enumerate(list(set_out_nan)) if len(i) > 1]\n", + "list_out_nan, set_out_nan = return_list_out(list_combo, list_assay_nan)\n", + "\n", + "list_concat = [i if i is not np.nan else j for i, j in zip(list_out, list_out_nan)]\n", + "# [f\"{df_pkis_copy['Assay Name'].iloc[idx]}: {i}\" for idx, i in enumerate(list(list_concat)) if i is np.nan]\n", + "\n", + "df_pkis_copy[\"uniprot\"] = [df_uniprot_kinhub[\"uniprot_id\"].iloc[idx] \\\n", + " if idx is not np.nan else np.nan for idx in list_concat]\n", + "\n", + "# manual fix\n", + "# df_pkis_copy.loc[df_pkis_copy[\"Assay Name\"] == \"PKD2\", \"uniprot\"] = \"Q13563\"\n", + "df_pkis_copy.loc[df_pkis_copy[\"Assay Name\"] == \"PRKD2\", \"uniprot\"] = \"Q9BZL6\"\n", + "df_pkis_copy.loc[df_pkis_copy[\"Assay Name\"] == \"RSK1\", \"uniprot\"] = \"Q15418\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4be36d49-98f0-474c-963c-9348844e35a0", + "metadata": {}, + "outputs": [], + "source": [ + "str_manual = \"P07948-2\"\n", + "\n", + "# duplicate LYN and change UniProtID to 
P07948-2\n", + "# df_temp = df_kinhub.loc[df_kinhub[\"HGNC Name\"] == \"LYN\", ].reset_index(drop=True)\n", + "# df_temp[\"UniprotID\"] = str_manual\n", + "# df_kinhub = pd.concat([df_kinhub, df_temp], axis=0).reset_index(drop=True)\n", + "\n", + "# manually replace LYNB in df_pkis_copy\n", + "df_pkis_copy.loc[df_pkis_copy[\"uniprot\"].isna(), \"uniprot\"] = str_manual" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "509a1eac-2364-43c7-944b-11e89783ef08", + "metadata": {}, + "outputs": [], + "source": [ + "df_pkis_copy[\"Mutant\"] = False\n", + "df_pkis_copy.loc[df_pkis_copy[\"Assay Name\"].isin(list_mut), \"Mutant\"] = True" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d683cb59-37b7-4264-839a-89597cdd7ea4", + "metadata": {}, + "outputs": [], + "source": [ + "df_pkis_wt = df_pkis_copy.loc[~df_pkis_copy[\"Assay Name\"].isin(list_mut), ].reset_index(drop=True)\n", + "df_pkis_wt = df_pkis_wt.loc[~(\n", + " df_pkis_wt[\"uniprot\"] == str_manual), ].reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0e2194e2-8a38-4e85-811f-c3d0393bb573", + "metadata": {}, + "outputs": [], + "source": [ + "dict_kinase = kinase_schema.create_kinase_models_from_df()\n", + "\n", + "dict_kinase_wt = {j.uniprot_id: j for j in dict_kinase.values() if j.uniprot_id in df_pkis_wt[\"uniprot\"].tolist()}\n", + "\n", + "dict_kinase_wt_narm = {i: j for i, j in dict_kinase_wt.items() if j.KLIFS.pocket_seq is not None}\n", + "\n", + "dict_aligned = kinase_schema.align_inter_intra_region(dict_kinase_wt_narm)\n", + "dict_replace = kinase_schema.reverse_order_dict_of_dict(dict_aligned)\n", + "\n", + "for key in dict_kinase_wt_narm.keys():\n", + " dict_kinase_wt[key].KLIFS2UniProtSeq.update(dict_replace[key])" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "738b8276-8076-44a3-91b5-b96093a54a71", + "metadata": {}, + "outputs": [], + "source": [ + "list_uniprot = []\n", + "list_kincore = []\n", + 
"list_klifs_min = []\n", + "list_klifs_full = []\n", + "list_group = []\n", + "\n", + "for _, row in df_pkis_wt.iterrows():\n", + " uniprot_id = row[\"uniprot\"]\n", + " if uniprot_id not in dict_kinase_wt.keys():\n", + " print(uniprot_id)\n", + " else:\n", + " temp_obj = dict_kinase_wt[uniprot_id]\n", + " list_uniprot.append(temp_obj.UniProt.canonical_seq)\n", + " if temp_obj.KinCore is None:\n", + " list_kincore.append(None)\n", + " else:\n", + " list_kincore.append(temp_obj.KinCore.seq)\n", + " list_klifs_min.append(temp_obj.KLIFS.pocket_seq)\n", + " if temp_obj.KLIFS.pocket_seq is None:\n", + " list_klifs_full.append(None)\n", + " else:\n", + " list_klifs_full.append(\"\".join([i for i in temp_obj.KLIFS2UniProtSeq.values()\\\n", + " if i is not None]))\n", + "\n", + " list_group.append([i.value for i in temp_obj.KinHub.group][0])\n", + "\n", + "df_pkis_wt[\"group\"] = list_group\n", + "df_pkis_wt[\"seq_uniprot\"] = list_uniprot\n", + "df_pkis_wt[\"seq_kincore\"] = list_kincore\n", + "df_pkis_wt[\"seq_klifs_min\"] = list_klifs_min\n", + "df_pkis_wt[\"seq_klifs_full\"] = list_klifs_full\n", + "\n", + "# uncomment to overwrite\n", + "# df_pkis_wt.to_csv(\"../data/pkis2_km_atp.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1da38c29-f5f5-4438-a49c-cba41fa5011e", + "metadata": {}, + "outputs": [], + "source": [ + "#TODO: \"P07948-2\" and mutants\n", + "#NOT IN USE\n", + "\n", + "# # extract kinase domain entries only in Pfam\n", + "# list_uniprot = df_pkis_copy.loc[~df_pkis_copy[\"uniprot\"].isna(), \"uniprot\"].tolist()\n", + "# list_kd = [\"Protein kinase domain\", \n", + "# \"Protein tyrosine and serine/threonine kinase\"]\n", + "# df_pfam_kd = df_pfam.loc[((df_pfam[\"name\"].isin(list_kd)) & \\\n", + "# (df_pfam[\"uniprot\"].isin(list_uniprot))), [\"uniprot\", \"name\", \"start\", \"end\"]]\n", + "\n", + "# # merge to add UniProt, Pfam, and KinHub annotations\n", + "# df_pkis_copy = df_pkis_copy.merge(df_uniprot, 
how=\"left\", left_on=\"uniprot\", right_on=\"uniprot_id\")\n", + "# df_pkis_copy = df_pkis_copy.drop(columns=[\"uniprot_id\"])\n", + "# df_pkis_copy = df_pkis_copy.merge(df_pfam_kd, how=\"left\", left_on=\"uniprot\", right_on=\"uniprot\")\n", + "# df_pkis_copy = df_pkis_copy.merge(df_kinhub, how=\"left\", left_on=\"uniprot\", right_on=\"UniprotID\")\n", + "# df_pkis_copy = df_pkis_copy.drop(columns=[\"UniprotID\"])\n", + "\n", + "# # add canonical UniProt sequence\n", + "# df_pkis_copy.loc[df_pkis_copy[\"uniprot\"] == \"P07948-2\", \"canonical_sequence\"] = uniprot.UniProt(str_manual)._sequence\n", + "# # added manually as InterPro API doesn't have isoform functionality\n", + "# # https://www.ebi.ac.uk/interpro/protein/reviewed/P07948/?isoform=P07948-2\n", + "# df_pkis_copy.loc[df_pkis_copy[\"uniprot\"] == \"P07948-2\", \"start\"] = 226\n", + "# df_pkis_copy.loc[df_pkis_copy[\"uniprot\"] == \"P07948-2\", \"end\"] = 480\n", + "# # df_pkis_copy.loc[df_pkis_copy[\"uniprot\"] == \"P07948-2\", ]\n", + "\n", + "# # extract kinase domain sequence\n", + "# # PDK1 has no Pfam KD\n", + "# df_pkis_copy[\"kd\"] = [row[\"canonical_sequence\"][int(row[\"start\"])-1:int(row[\"end\"])-1] \\\n", + "# for idx, row in df_pkis_copy.iterrows()]" + ] + }, + { + "cell_type": "markdown", + "id": "69fdb998-58b8-487f-89cd-63f9e4bfd93f", + "metadata": {}, + "source": [ + "# Plotting MSAs for representations" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "72508752-0c0a-4bc9-a289-43fb8fa446e5", + "metadata": {}, + "outputs": [], + "source": [ + "list_hgnc = df_pkis_wt[\"Assay Name\"].tolist()\n", + "dict_alphabet = DICT_COLORS[\"ALPHABET_PROJECT\"][\"DICT_COLORS\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "514bcaf1-514d-4ee1-a9d8-8b518853ef9a", + "metadata": {}, + "outputs": [], + "source": [ + "alignment_klifs_min = SequenceAlignment(\n", + " list_sequences=list_klifs_min,\n", + " list_ids=list_hgnc,\n", + " dict_colors=dict_alphabet\n", + 
")\n", + "\n", + "alignment_klifs_min.show_plot()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9371f9c2-aa3c-4b49-82b8-7abf72336b4e", + "metadata": {}, + "outputs": [], + "source": [ + "alignment_klifs_full = SequenceAlignment(\n", + " list_sequences=list_klifs_full,\n", + " list_ids=list_hgnc,\n", + " dict_colors=dict_alphabet\n", + ")\n", + "\n", + "alignment_klifs_full.show_plot()" + ] + }, + { + "cell_type": "markdown", + "id": "7fad0ae3-70ac-4093-9503-67a236ef9bf9", + "metadata": {}, + "source": [ + "# Analyze cBioPortal data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ed5a72db-94be-4490-a90f-3d7af677955e", + "metadata": {}, + "outputs": [], + "source": [ + "def try_except_middle_int(str_in):\n", + " try:\n", + " return int(str_in[1:-1])\n", + " except ValueError:\n", + " return None\n", + "\n", + "def try_except_split(str_in, dict_in):\n", + " try:\n", + " return dict_in[str_in.split(\"_\")[0]].uniprot_id + \" \" + str_in.split(\"_\")[1]\n", + " except:\n", + " return dict_in[str_in.split(\"_\")[0]].uniprot_id" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "15e1a56a-24a2-448b-b2dc-afff79345de9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No API token provided\n" + ] + } + ], + "source": [ + "os.environ[\"CBIOPORTAL_INSTANCE\"] = \"www.cbioportal.org\"\n", + "os.environ[\"OUTPUT_DIR\"] = \".\"\n", + "study = \"msk_impact_2017\"\n", + "\n", + "df_mskimpact_muts = cbioportal.Mutations(study_id=study).get_cbioportal_cohort_mutations()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78c846e6-1ba0-4f55-99a4-aab09cda3108", + "metadata": {}, + "outputs": [], + "source": [ + "dict_kinase_wt_hgnc = {j.hgnc_name: j for j in dict_kinase_wt.values()}\n", + "\n", + "df_mskimpact_muts_pkis = df_mskimpact_muts.loc[df_mskimpact_muts[\"hugoGeneSymbol\"].isin(\\\n", + " dict_kinase_wt_hgnc.keys()), 
].reset_index(drop=True)\n", + "\n", + "df_mskimpact_muts_pkis[\"residue_idx\"] = df_mskimpact_muts_pkis[\"proteinChange\"].apply(try_except_middle_int)\n", + "\n", + "df_mskimpact_muts_missense = df_mskimpact_muts_pkis.loc[((~df_mskimpact_muts_pkis[\"residue_idx\"].isna()) &\n", + " (df_mskimpact_muts_pkis[\"mutationType\"] == \"Missense_Mutation\")), ].reset_index(drop=True)\n", + "\n", + "list_temp = []\n", + "for _, row in df_mskimpact_muts_missense.iterrows():\n", + " hgnc_name = row[\"hugoGeneSymbol\"]\n", + " temp_obj = dict_kinase_wt_hgnc[hgnc_name]\n", + " list_temp.append(row[\"residue_idx\"] in temp_obj.KLIFS2UniProtIdx.values())\n", + "\n", + "df_mskimpact_muts_missense_klifs = df_mskimpact_muts_missense.loc[list_temp, ].reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "206fbb7a-fa14-4c2a-bfe7-c39bfdde7f08", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "FGFR1: I575T, A\n", + "FGFR1: L645V, A\n", + "FGFR1: S628F, N\n", + "FGFR1: K573N, L\n", + "FGFR1: A639D, I\n", + "FGFR1: P514L, K\n", + "FGFR1: S549L, G\n", + "FGFR1: T485N, G\n", + "FGFR1: M566I, K\n", + "FYN: D321N, L\n", + "FYN: G354R, L\n", + "FYN: A312V, L\n", + "FYN: E332K, V\n", + "FYN: R386I, Y\n", + "FYN: D321H, L\n", + "FYN: V324I, D\n", + "NTRK2: V624M, D\n", + "NTRK2: V624M, D\n", + "NTRK2: A571G, V\n", + "NTRK2: I549M, F\n", + "NTRK2: K618R, E\n", + "NTRK2: D692N, I\n", + "NTRK2: R598H, E\n", + "NTRK2: S667L, L\n", + "NTRK2: S667L, L\n", + "NTRK2: Q675H, R\n", + "NTRK2: D600N, I\n", + "NTRK2: V681I, N\n", + "NTRK2: L670M, Q\n", + "NTRK2: L670M, Q\n", + "NTRK2: Q668L, A\n", + "NTRK2: R551Q, K\n", + "NTRK2: D627N, K\n", + "NTRK2: I554V, L\n", + "NTRK2: Q674H, H\n", + "NTRK2: I672V, F\n", + "NTRK2: N596S, Q\n", + "NTRK2: D543Y, E\n" + ] + } + ], + "source": [ + "list_protein_change = []\n", + "list_klifs_mutated = []\n", + "\n", + "for _, row in df_mskimpact_muts_missense_klifs.iterrows():\n", + " 
temp_obj = dict_kinase_wt_hgnc[row[\"hugoGeneSymbol\"]]\n", + " idx_klifs = [idx for idx, i in enumerate(temp_obj.KLIFS2UniProtIdx.values()) if i == row[\"proteinPosStart\"]][0]\n", + " if temp_obj.KLIFS.pocket_seq[idx_klifs] != row[\"proteinChange\"][0]:\n", + " print(f\"{temp_obj.hgnc_name}: {row['proteinChange']}, {temp_obj.KLIFS.pocket_seq[idx_klifs]}\")\n", + " else:\n", + " list_protein_change.append(f\"{temp_obj.hgnc_name}_{row['proteinChange']}\")\n", + " str_mut = temp_obj.KLIFS.pocket_seq[:idx_klifs] + row['proteinChange'][-1] + temp_obj.KLIFS.pocket_seq[idx_klifs + 1:]\n", + " list_klifs_mutated.append(str_mut)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9e4593a-6d29-4c7c-932a-d4c95f182f87", + "metadata": {}, + "outputs": [], + "source": [ + "dict_muts = dict(zip(list_protein_change, list_klifs_mutated))\n", + "\n", + "set_hgnc = set([i.split(\"_\")[0] for i in dict_muts.keys()])\n", + "\n", + "for i in set_hgnc:\n", + " dict_muts[i] = dict_kinase_wt_hgnc[i].KLIFS.pocket_seq\n", + "\n", + "df_klifs_mut_wt = pd.DataFrame({\"hgnc_name\": dict_muts.keys(),\n", + " \"klifs\": dict_muts.values()})\n", + "\n", + "df_klifs_mut_wt[\"uniprot\"] = df_klifs_mut_wt[\"hgnc_name\"].apply(\n", + " lambda x: try_except_split(x, dict_kinase_wt_hgnc))\n", + "\n", + "# uncomment to overwrite\n", + "# df_klifs_mut_wt.to_csv(\"../data/klifs_zehir_muts.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c487f05-a65a-474a-a544-b0e69236abc4", + "metadata": {}, + "outputs": [], + "source": [ + "dict_alphamissense = {i: {\"Score\": None, \"Class\": None} for i in \\\n", + " df_klifs_mut_wt.loc[df_klifs_mut_wt[\"uniprot\"].apply(\n", + " lambda x: len(x.split(\" \")) == 2), \"uniprot\"]}\n", + "\n", + "for key in tqdm(dict_alphamissense.keys(), total = len(dict_alphamissense)):\n", + " uniprot, mutant = key.split(\" \")\n", + " temp_obj = ProtvarScore(database=\"AM\", uniprot_id=uniprot, pos=mutant[1:-1], 
mut=mutant[-1])\n", + " if len(temp_obj._protvar_score) == 1:\n", + " dict_alphamissense[key][\"Score\"] = temp_obj._protvar_score[0][\"amPathogenicity\"]\n", + " dict_alphamissense[key][\"Class\"] = temp_obj._protvar_score[0][\"amClass\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "id": "4223aa00-a653-4e9d-858a-93a45070e007", + "metadata": {}, + "outputs": [], + "source": [ + "df_klifs_mut_wt_alphamissense = df_klifs_mut_wt.merge(pd.DataFrame(dict_alphamissense).T.reset_index(), \n", + " how=\"left\", left_on=\"uniprot\", right_on=\"index\")\n", + "\n", + "# uncomment to overwrite\n", + "# df_klifs_mut_wt_alphamissense.to_csv(\"../data/klifs_zehir_muts_alphamissense.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "7b926101-8d20-46b8-bb6d-c53a1dc32bd3", + "metadata": {}, + "source": [ + "### In use for poster only" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23006e04-2cf5-4188-9c61-e0d846a690f5", + "metadata": {}, + "outputs": [], + "source": [ + "df_alphamissense = pd.read_csv(\"../data/protvar_alphamissense.csv\", on_bad_lines='skip')\n", + "\n", + "df_alphamissense_temp = df_alphamissense[[\"User_input\", \n", + " \"AlphaMissense_pathogenicity(class)\"]].groupby(\"User_input\").agg(set)\n", + "\n", + "df_alphamissense_temp_single = df_alphamissense_temp.loc[df_alphamissense_temp[\"AlphaMissense_pathogenicity(class)\"].apply(len) == 1, ].reset_index()\n", + "df_alphamissense_temp_single[\"alphamissense\"] = df_alphamissense_temp_single[\"AlphaMissense_pathogenicity(class)\"].apply(lambda x: next(iter(x)))\n", + "df_alphamissense_temp_single = df_alphamissense_temp_single.drop(columns = [\"AlphaMissense_pathogenicity(class)\"]).reset_index(drop=True)\n", + "df_alphamissense_temp_single[\"alphamissense_score\"] = df_alphamissense_temp_single[\"alphamissense\"].apply(\n", + " lambda x: x.split(\"(\")[0])\n", + "df_alphamissense_temp_single[\"alphamissense_class\"] = 
df_alphamissense_temp_single[\"alphamissense\"].apply(\n", + " lambda x: x.split(\"(\")[1].replace(\")\", \"\"))\n", + "\n", + "df_alphamissense_temp_single = df_alphamissense_temp_single.drop(columns = [\"alphamissense\"]).reset_index(drop=True)\n", + "df_klifs_mut_wt_alphamissense = df_klifs_mut_wt.merge(df_alphamissense_temp_single, how=\"left\", left_on=\"uniprot\", right_on=\"User_input\")\n", + "df_klifs_mut_wt_alphamissense.to_csv(\"../data/klifs_zehir_muts_alphamissense.csv\", index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "id": "05482d8e-c589-4fd1-8096-8453d08219e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
AlphaMissense_pathogenicity(class)
User_input
P00533 G779F{0.9899(PATHOGENIC), 0.9996(PATHOGENIC), 0.999...
P00533 L747P{0.9998(PATHOGENIC), 0.969(PATHOGENIC), 0.9013...
P04626 L755A{0.9996(PATHOGENIC), 0.992(PATHOGENIC), 0.9979...
P04626 L755P{0.9996(PATHOGENIC), 0.992(PATHOGENIC), 0.9979...
P08922 E2030L{0.238(BENIGN), 0.3067(BENIGN), 0.3581(AMBIGUO...
P09619 G687K{0.9993(PATHOGENIC), nan, 0.9972(PATHOGENIC), ...
P15056 G469S{0.9999(PATHOGENIC), nan, 0.9977(PATHOGENIC), ...
P15056 L597S{0.9996(PATHOGENIC), 0.9859(PATHOGENIC), 0.993...
P23458 L910S{0.9727(PATHOGENIC), 0.9887(PATHOGENIC), 0.990...
P29320 G675K{0.6804(PATHOGENIC), 0.9965(PATHOGENIC), nan, ...
Q02750 P124M{0.9801(PATHOGENIC), 0.9669(PATHOGENIC), 0.860...
Q15303 V840S{0.9793(PATHOGENIC), 0.9996(PATHOGENIC), 0.986...
Q16288 G605L{0.9996(PATHOGENIC), 0.9998(PATHOGENIC), 0.990...
\n", + "
" + ], + "text/plain": [ + " AlphaMissense_pathogenicity(class)\n", + "User_input \n", + "P00533 G779F {0.9899(PATHOGENIC), 0.9996(PATHOGENIC), 0.999...\n", + "P00533 L747P {0.9998(PATHOGENIC), 0.969(PATHOGENIC), 0.9013...\n", + "P04626 L755A {0.9996(PATHOGENIC), 0.992(PATHOGENIC), 0.9979...\n", + "P04626 L755P {0.9996(PATHOGENIC), 0.992(PATHOGENIC), 0.9979...\n", + "P08922 E2030L {0.238(BENIGN), 0.3067(BENIGN), 0.3581(AMBIGUO...\n", + "P09619 G687K {0.9993(PATHOGENIC), nan, 0.9972(PATHOGENIC), ...\n", + "P15056 G469S {0.9999(PATHOGENIC), nan, 0.9977(PATHOGENIC), ...\n", + "P15056 L597S {0.9996(PATHOGENIC), 0.9859(PATHOGENIC), 0.993...\n", + "P23458 L910S {0.9727(PATHOGENIC), 0.9887(PATHOGENIC), 0.990...\n", + "P29320 G675K {0.6804(PATHOGENIC), 0.9965(PATHOGENIC), nan, ...\n", + "Q02750 P124M {0.9801(PATHOGENIC), 0.9669(PATHOGENIC), 0.860...\n", + "Q15303 V840S {0.9793(PATHOGENIC), 0.9996(PATHOGENIC), 0.986...\n", + "Q16288 G605L {0.9996(PATHOGENIC), 0.9998(PATHOGENIC), 0.990..." + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_alphamissense_temp.loc[df_alphamissense_temp[\"AlphaMissense_pathogenicity(class)\"].apply(len) > 1, ]" + ] + }, + { + "cell_type": "markdown", + "id": "0256727a-09e6-43e3-982b-9ae363fe1082", + "metadata": {}, + "source": [ + "# Upset plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a4dcbaa-f556-4392-80eb-a7f2a1c5412f", + "metadata": {}, + "outputs": [], + "source": [ + "list_zehir_missense_kinase = df_mskimpact_muts.loc[(\n", + " (df_mskimpact_muts[\"mutationType\"] == \"Missense_Mutation\") &\n", + " (df_mskimpact_muts[\"hugoGeneSymbol\"].isin(df_kinhub[\"HGNC Name\"]))\n", + "), \"hugoGeneSymbol\"].unique().tolist()\n", + "\n", + "list_check = [\"KinHub\", \"UniProt\", \"Pfam\", \"KLIFS\", \"KinCore\"]\n", + "list_contents = [list(key for key, val in dict_kinase.items() \\\n", + " if val.dict()[entry] is not None) for entry in list_check]\n", + 
"\n", + "dict_contents = dict(zip(list_check, list_contents))\n", + "dict_contents[\"cBioPortal\"] = list_zehir_missense_kinase\n", + "dict_contents[\"PKIS2\"] = df_pkis_wt[\"HGNC Name\"].unique().tolist()\n", + "\n", + "contents = from_contents(dict_contents)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "a44b777a-1b1a-4833-85f0-f5307b1b8148", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig = plt.figure(figsize=(12, 6))\n", + "plot(contents, fig=fig, element_size=None)\n", + "# plt.show()\n", + "plt.savefig(\"upset_plot.pdf\", bbox_inches='tight')" + ] + }, + { + "cell_type": "markdown", + "id": "b861d39a-43f7-4672-a59e-0eccce051d3f", + "metadata": {}, + "source": [ + "## DO NOT RUN" + ] + }, + { + "cell_type": "code", + "execution_count": 207, + "id": "8e82b3a2-cee8-46f6-9858-53bb4fe92af0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Assay Name MST1\n", + "ATP Conc.(uM) 50\n", + "ENZYME Conc.(nM) 0.4\n", + "Inc Time (hr) 3\n", + "SUPPLIER INVITROGEN\n", + "CATALOG# PV3854\n", + "LOT# 38395\n", + "uniprot P26927\n", + "hgnc_temp MST1\n", + "uniprot_hgnc P26927\n", + "uniprot_test Q13043\n", + "Name: 131, dtype: object\n", + "Assay Name PDK1\n", + "ATP Conc.(uM) 10\n", + "ENZYME Conc.(nM) 18.0\n", + "Inc Time (hr) 3\n", + "SUPPLIER INVITROGEN\n", + "CATALOG# PV4033\n", + "LOT# 35371B\n", + "uniprot Q15118\n", + "hgnc_temp PDK1\n", + "uniprot_hgnc Q15118\n", + "uniprot_test O15530\n", + "Name: 157, dtype: object\n", + "Assay Name PKD2\n", + "ATP Conc.(uM) 50\n", + "ENZYME Conc.(nM) 0.3\n", + "Inc Time (hr) 3\n", + "SUPPLIER UPSTATE\n", + "CATALOG# 14-506\n", + "LOT# 25273U\n", + "uniprot Q13563\n", + "hgnc_temp PKD2\n", + "uniprot_hgnc Q13563\n", + "uniprot_test Q9BZL6\n", + "Name: 171, dtype: object\n" + ] + } + ], + "source": [ + "def try_except_hgnc(x):\n", + " try:\n", + " temp = hgnc.HGNC(x).maybe_get_info_from_hgnc_fetch([\"uniprot_ids\"])[\"uniprot_ids\"][0][0]\n", + " return temp\n", + " except TypeError:\n", + " return None\n", + "\n", + "df_pkis_copy[\"hgnc_temp\"] = df_pkis_copy[\"Assay Name\"]\n", + "df_pkis_copy.loc[df_pkis_copy[\"hgnc_temp\"].isin(list_mut), \"hgnc_temp\"] = df_pkis_copy.loc[df_pkis_copy[\"hgnc_temp\"].isin(list_mut), \"hgnc_temp\"].apply(lambda x: x.split(\"-\")[0])\n", + "\n", 
+ "df_pkis_copy[\"uniprot_hgnc\"] = df_pkis_copy[\"hgnc_temp\"].apply(lambda x: try_except_hgnc(x))\n", + "\n", + "df_pkis_copy[\"uniprot_test\"] = [df_uniprot_kinhub[\"uniprot_id\"].iloc[idx] \\\n", + " if idx is not np.nan else np.nan for idx in list_concat]\n", + "\n", + "list_uniprot = []\n", + "for _, row in df_pkis_copy.iterrows():\n", + " if (row[\"uniprot_hgnc\"] is not None) & \\\n", + " (row[\"uniprot_hgnc\"] != row[\"uniprot_test\"]):\n", + " print(row)\n", + " # list_uniprot.append(row[\"uniprot_hgnc\"])\n", + "# else:\n", + "# list_uniprot.append(row[\"uniprot_test\"])\n", + "\n", + "# df_pkis_copy[\"uniprot\"] = list_uniprot\n", + "\n", + "df_pkis_copy = df_pkis_copy.drop(columns = [\"hgnc_temp\", \"uniprot_hgnc\", \"uniprot_test\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "id": "593f01b9-aca5-474a-b166-c06ab16f31f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Assay NameATP Conc.(uM)ENZYME Conc.(nM)Inc Time (hr)SUPPLIERCATALOG#LOT#uniprot
109LYNB350.93INVITROGENP290723337NaN
\n", + "
" + ], + "text/plain": [ + " Assay Name ATP Conc.(uM) ENZYME Conc.(nM) Inc Time (hr) SUPPLIER \\\n", + "109 LYNB 35 0.9 3 INVITROGEN \n", + "\n", + " CATALOG# LOT# uniprot \n", + "109 P2907 23337 NaN " + ] + }, + "execution_count": 218, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_pkis_copy.loc[~df_pkis_copy[\"uniprot\"].isin(df_uniprot_kinhub[\"uniprot_id\"].tolist()), ]" + ] + }, + { + "cell_type": "code", + "execution_count": 220, + "id": "21a07f69-46eb-4e9b-a703-dc42e0919efa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Assay NameATP Conc.(uM)ENZYME Conc.(nM)Inc Time (hr)SUPPLIERCATALOG#LOT#uniprot
30CDK2500.203UPSTATE14-44823984P24941
31CDK2-CYCLINE1000.153UPSTATE14-47522393UP24941
165PKC-BETA12000.463INVITROGENP2291299686P05771
170PKCB2000.503INVITROGENP225129433AP05771
171PKD2500.303UPSTATE14-50625273UQ9BZL6
175PRKD2200.203INVITROGENPV375834015Q9BZL6
189RSK1200.753INVITROGENPV3680386267Q15349
191RSK3500.403UPSTATE14-462D7AN006BUQ15349
208TRKB202.503INVITROGENPV361635706Q16620
209TRKB-L5005.006UPSTATE14-5071647376Q16620
\n", + "
" + ], + "text/plain": [ + " Assay Name ATP Conc.(uM) ENZYME Conc.(nM) Inc Time (hr) SUPPLIER \\\n", + "30 CDK2 50 0.20 3 UPSTATE \n", + "31 CDK2-CYCLINE 100 0.15 3 UPSTATE \n", + "165 PKC-BETA1 200 0.46 3 INVITROGEN \n", + "170 PKCB 200 0.50 3 INVITROGEN \n", + "171 PKD2 50 0.30 3 UPSTATE \n", + "175 PRKD2 20 0.20 3 INVITROGEN \n", + "189 RSK1 20 0.75 3 INVITROGEN \n", + "191 RSK3 50 0.40 3 UPSTATE \n", + "208 TRKB 20 2.50 3 INVITROGEN \n", + "209 TRKB-L 500 5.00 6 UPSTATE \n", + "\n", + " CATALOG# LOT# uniprot \n", + "30 14-448 23984 P24941 \n", + "31 14-475 22393U P24941 \n", + "165 P2291 299686 P05771 \n", + "170 P2251 29433A P05771 \n", + "171 14-506 25273U Q9BZL6 \n", + "175 PV3758 34015 Q9BZL6 \n", + "189 PV3680 386267 Q15349 \n", + "191 14-462 D7AN006BU Q15349 \n", + "208 PV3616 35706 Q16620 \n", + "209 14-507 1647376 Q16620 " + ] + }, + "execution_count": 220, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_pkis_copy_no_mut = df_pkis_copy.loc[~df_pkis_copy[\"Assay Name\"].isin(list_mut), ]\n", + "uniprot_dup = df_pkis_copy_no_mut.loc[df_pkis_copy_no_mut[\"uniprot\"].duplicated(), \"uniprot\"].to_list()\n", + "df_pkis_copy_no_mut.loc[ df_pkis_copy[\"uniprot\"].isin(uniprot_dup), ]" + ] + }, + { + "cell_type": "markdown", + "id": "06573995-b450-458d-9e63-612df3d4aa00", + "metadata": {}, + "source": [ + "### Old mutant code" + ] + }, + { + "cell_type": "code", + "execution_count": 235, + "id": "74a924b1-1d52-4911-be9a-58f2a6f14eb4", + "metadata": {}, + "outputs": [], + "source": [ + "# find mutants\n", + "list_str_split = create_strsplit_list(df_pkis_copy[\"Assay Name\"], idx_start=1, idx_end=1)\n", + "list_str_split = [i[0] for i in list_str_split]\n", + "# drop if list_str_split entry is strictly numeric\n", + "list_str_split = [np.nan if type(try_except_convert_str2int(i)) == int \\\n", + " else i for i in list_str_split]\n", + "list_middle = [np.nan if i is np.nan else i[1:-1] for i in list_str_split]\n", + "\n", + 
"df_pkis_copy[\"Mutant\"] = False\n", + "df_pkis_copy.loc[[True if type(try_except_convert_str2int(i)) == int \\\n", + " else False for i in list_middle], \"Mutant\"] = True\n", + "\n", + "list_mut1 = create_strsplit_list(df_pkis_copy.loc[df_pkis_copy[\"Mutant\"] == True, \"Assay Name\"], 0, 0)\n", + "list_mut2 = create_strsplit_list(df_pkis_copy.loc[df_pkis_copy[\"Mutant\"] == True, \"Assay Name\"], 1, 1)\n", + "list_mut3 = create_strsplit_list(df_pkis_copy.loc[df_pkis_copy[\"Mutant\"] == True, \"Assay Name\"], 2, 2)\n", + "list_muts = [y + z if x != z else y for x, y, z in zip(list_mut1, list_mut2, list_mut3)]\n", + "\n", + "# have old notation of v599E instead of V600E\n", + "list_muts = [[\"V600E\" if i == \"V599E\" else i for i in muts] for muts in list_muts]\n", + "\n", + "dict_muts = dict(zip(df_pkis_copy.loc[df_pkis_copy[\"Mutant\"] == True, \"Assay Name\"], list_muts))\n", + "\n", + "for key, val in dict_muts.items():\n", + " temp = df_pkis_copy.loc[df_pkis_copy[\"Assay Name\"] == key, \"canonical_sequence\"].values[0]\n", + " for mut in val:\n", + " aa_wt, aa_mut, codon = mut[0], mut[-1], int(mut[1:-1])-1\n", + " try:\n", + " if temp[codon] == aa_wt:\n", + " list_temp = list(temp)\n", + " list_temp[codon] = aa_mut\n", + " temp = \"\".join(list_temp)\n", + " # print(f\"{aa_wt} to {temp[codon]} at {codon} ({mut})\")\n", + " else:\n", + " print(f\"Wild-type AA at position {codon} is {temp[codon]} but expected {aa_wt} ({mut})...\")\n", + " except KeyError:\n", + " print(f\"{codon} not found in canonical {key} sequence of {len(temp)}...\")\n", + " df_pkis_copy.loc[df_pkis_copy[\"Assay Name\"] == key, \"canonical_sequence\"] = temp" + ] + }, + { + "cell_type": "code", + "execution_count": 236, + "id": "19dd91c4-0462-4fea-bdce-1b4682d56868", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ABL1-H396P\n", + "KD: 242:492\n", + "True\n", + "\n", + "ABL1-M351T\n", + "KD: 242:492\n", + "True\n", + "\n", + 
"ABL1-Q252H\n", + "KD: 242:492\n", + "True\n", + "\n", + "ABL1-T315I\n", + "KD: 242:492\n", + "True\n", + "\n", + "ABL1-Y253F\n", + "KD: 242:492\n", + "True\n", + "\n", + "BRAF-V600E\n", + "KD: 457:712\n", + "True\n", + "\n", + "EGFR-L858R\n", + "KD: 714:966\n", + "True\n", + "\n", + "EGFR-L861Q\n", + "KD: 714:966\n", + "True\n", + "\n", + "EGFR-T790M\n", + "KD: 714:966\n", + "True\n", + "\n", + "EGFR-T790M\n", + "KD: 714:966\n", + "True\n", + "\n", + "EGFR-L858R\n", + "KD: 714:966\n", + "True\n", + "\n", + "KIT-D816V\n", + "KD: 590:922\n", + "True\n", + "\n", + "KIT-T6701\n", + "KD: 590:922\n", + "True\n", + "\n", + "KIT-V560G\n", + "KD: 590:922\n", + "False\n", + "\n", + "LRRK2-G2019S\n", + "KD: 1885:2129\n", + "True\n", + "\n", + "RET-V804L\n", + "KD: 725:1005\n", + "True\n", + "\n", + "RET-Y791F\n", + "KD: 725:1005\n", + "True\n", + "\n" + ] + } + ], + "source": [ + "# [i.split(\"-\")[0] for i in dict_muts.keys()]\n", + "# {i.split(\"-\")[0]: j for i, j in dict_muts.items()}\n", + "for i, j in dict_muts.items():\n", + " for mut in j:\n", + " print(f\"{i.split('-')[0]}-{mut}\")\n", + " start = int(df_pkis_copy.loc[df_pkis_copy[\"Assay Name\"] == i, \"start\"])\n", + " end = int(df_pkis_copy.loc[df_pkis_copy[\"Assay Name\"] == i, \"end\"])\n", + " print(f\"KD: {start}:{end}\")\n", + " print(int(mut[1:-1]) >= start and int(mut[1:-1]) <= end)\n", + " print(\"\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}