From d494592cae3f42d98fbdd181176c085337d1abdb Mon Sep 17 00:00:00 2001 From: a-r-j Date: Wed, 27 Jul 2022 14:04:20 +0200 Subject: [PATCH 01/12] correct deprotonation selection #198 --- graphein/protein/graphs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphein/protein/graphs.py b/graphein/protein/graphs.py index da278559..3ece7d36 100644 --- a/graphein/protein/graphs.py +++ b/graphein/protein/graphs.py @@ -149,7 +149,7 @@ def deprotonate_structure(df: pd.DataFrame) -> pd.DataFrame: "Deprotonating protein. This removes H atoms from the pdb_df dataframe" ) return filter_dataframe( - df, by_column="atom_name", list_of_values=["H"], boolean=False + df, by_column="element_symbol", list_of_values=["H"], boolean=False ) From 95735aab84e36323d2d959a942980140c52de570 Mon Sep 17 00:00:00 2001 From: a-r-j Date: Wed, 27 Jul 2022 14:06:07 +0200 Subject: [PATCH 02/12] Add MSE to constants #200 --- graphein/protein/resi_atoms.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/graphein/protein/resi_atoms.py b/graphein/protein/resi_atoms.py index 560acd49..1e210f61 100644 --- a/graphein/protein/resi_atoms.py +++ b/graphein/protein/resi_atoms.py @@ -338,6 +338,7 @@ "LYS", "MET", "MLE", + "MSE", "MVA", "NH2", "NLE", @@ -434,6 +435,7 @@ "LYS": "K", "MET": "M", "MLE": "L", + "MSE": "M" "MVA": "V", "NH2": "X", "NLE": "L", From 409e515697b386bcb291866722400189934bae02 Mon Sep 17 00:00:00 2001 From: a-r-j Date: Wed, 27 Jul 2022 14:07:37 +0200 Subject: [PATCH 03/12] fix chain id check in atomic edges #199 --- graphein/protein/edges/atomic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphein/protein/edges/atomic.py b/graphein/protein/edges/atomic.py index d5b76f8f..6bd41b94 100644 --- a/graphein/protein/edges/atomic.py +++ b/graphein/protein/edges/atomic.py @@ -134,7 +134,7 @@ def add_atomic_edges(G: nx.Graph, tolerance: float = 0.56) -> nx.Graph: continue # Check atoms are in the same chain - if not (chain_1 and chain_2): + if not (chain_1 == chain_2): continue if G.has_edge(node_1, node_2): From 542b3c0a8ca196a31e63faf6f05d5e2fb35dad6c Mon Sep 17 00:00:00 2001 From: a-r-j Date: Wed, 27 Jul 2022 14:09:47 +0200 Subject: [PATCH 04/12] missing comma --- graphein/protein/resi_atoms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphein/protein/resi_atoms.py b/graphein/protein/resi_atoms.py index 1e210f61..b2244035 100644 --- a/graphein/protein/resi_atoms.py +++ b/graphein/protein/resi_atoms.py @@ -435,7 +435,7 @@ "LYS": "K", "MET": "M", "MLE": "L", - "MSE": "M" + "MSE": "M", "MVA": "V", "NH2": "X", "NLE": "L", From 3ee5d33c4fefded1486c4d48d1b4e0e0353d2c62 Mon Sep 17 00:00:00 2001 From: a-r-j Date: Wed, 27 Jul 2022 14:20:24 +0200 Subject: [PATCH 05/12] fix code smell --- graphein/protein/edges/atomic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphein/protein/edges/atomic.py b/graphein/protein/edges/atomic.py index 6bd41b94..a98ae694 100644 --- a/graphein/protein/edges/atomic.py +++ b/graphein/protein/edges/atomic.py @@ -134,7 +134,7 @@ def add_atomic_edges(G: nx.Graph, tolerance: float = 0.56) -> nx.Graph: continue # Check atoms are in the same chain - if not (chain_1 == chain_2): + if chain_1 != chain_2: continue if G.has_edge(node_1, node_2): From 9ab56e7be1aa41cb2117efcb2a14b4ef119b5173 Mon Sep 17 00:00:00 2001 From: a-r-j Date: Mon, 1 Aug 2022 16:23:33 +0200 Subject: [PATCH 06/12] black --- graphein/ml/datasets/torch_geometric_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphein/ml/datasets/torch_geometric_dataset.py b/graphein/ml/datasets/torch_geometric_dataset.py index 9f464fd7..5d23ac56 100644 --- a/graphein/ml/datasets/torch_geometric_dataset.py +++ b/graphein/ml/datasets/torch_geometric_dataset.py @@ -416,7 +416,7 @@ def __init__( else: self.graph_label_map = None self.validate_input() - self.bad_pdbs: List[str] = [] + self.bad_pdbs: List[str] = [] # Configs self.config = graphein_config From d76838d8a973b0206a7e1f326d87d1f807bd4505 Mon Sep 17 00:00:00 2001 From: a-r-j Date: Mon, 1 Aug 2022 16:48:49 +0200 Subject: [PATCH 07/12] fix input validation --- .../ml/datasets/torch_geometric_dataset.py | 37 ++++++++++--------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/graphein/ml/datasets/torch_geometric_dataset.py b/graphein/ml/datasets/torch_geometric_dataset.py index 5d23ac56..790e14e1 100644 --- a/graphein/ml/datasets/torch_geometric_dataset.py +++ b/graphein/ml/datasets/torch_geometric_dataset.py @@ -451,23 +451,26 @@ def processed_file_names(self) -> List[str]: return [f"{pdb}.pt" for pdb in self.structures] def validate_input(self): - assert len(self.structures) == len( - self.graph_label_map - ), "Number of proteins and graph labels must match" - assert len(self.structures) == len( - self.node_label_map - ), "Number of proteins and node labels must match" - assert len(self.structures) == len( - self.chain_selection_map - ), "Number of proteins and chain selections must match" - assert len( - { - f"{pdb}_{chain}" - for pdb, chain in zip( - self.structures, self.chain_selection_map - ) - } - ) == len(self.structures), "Duplicate protein/chain combinations" + if self.graph_label_map is not None: + assert len(self.structures) == len( + self.graph_label_map + ), "Number of proteins and graph labels must match" + if self.node_label_map is not None: + assert len(self.structures) == len( + self.node_label_map + ), "Number of proteins and node labels must match" + if self.chain_selection_map is not None: + assert len(self.structures) == len( + self.chain_selection_map + ), "Number of proteins and chain selections must match" + assert len( + { + f"{pdb}_{chain}" + for pdb, chain in zip( + self.structures, self.chain_selection_map + ) + } + ) == len(self.structures), "Duplicate protein/chain combinations" def download(self): """Download the PDB files from RCSB or Alphafold.""" From 323ec260d4cb00350d0eac289f09a65e86450e5e Mon Sep 17 00:00:00 2001 From: a-r-j Date: Mon, 1 Aug 2022 18:48:41 +0200 Subject: [PATCH 08/12] fix input validation --- graphein/ml/datasets/torch_geometric_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/graphein/ml/datasets/torch_geometric_dataset.py b/graphein/ml/datasets/torch_geometric_dataset.py index 790e14e1..be2258cf 100644 --- a/graphein/ml/datasets/torch_geometric_dataset.py +++ b/graphein/ml/datasets/torch_geometric_dataset.py @@ -451,15 +451,15 @@ def processed_file_names(self) -> List[str]: return [f"{pdb}.pt" for pdb in self.structures] def validate_input(self): - if self.graph_label_map is not None: + if hasattr(self, "graph_label_map"): assert len(self.structures) == len( self.graph_label_map ), "Number of proteins and graph labels must match" - if self.node_label_map is not None: + if hasattr(self, "node_label_map"): assert len(self.structures) == len( self.node_label_map ), "Number of proteins and node labels must match" - if self.chain_selection_map is not None: + if hasattr(self, "chain_selection_map") assert len(self.structures) == len( self.chain_selection_map ), "Number of proteins and chain selections must match" From 5ea593d7a61d4c2fcd98fe6f01a04f6f1ad0c850 Mon Sep 17 00:00:00 2001 From: a-r-j Date: Mon, 1 Aug 2022 18:52:23 +0200 Subject: [PATCH 09/12] fix input validation --- graphein/ml/datasets/torch_geometric_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphein/ml/datasets/torch_geometric_dataset.py b/graphein/ml/datasets/torch_geometric_dataset.py index be2258cf..6b047e27 100644 --- a/graphein/ml/datasets/torch_geometric_dataset.py +++ b/graphein/ml/datasets/torch_geometric_dataset.py @@ -459,7 +459,7 @@ def validate_input(self): assert len(self.structures) == len( self.node_label_map ), "Number of proteins and node labels must match" - if hasattr(self, "chain_selection_map") + if hasattr(self, "chain_selection_map"): assert len(self.structures) == len( self.chain_selection_map ), "Number of proteins and chain selections must match" From 7addacc8bbacb455b5f7cd803f06c687e4507762 Mon Sep 17 00:00:00 2001 From: a-r-j Date: Mon, 1 Aug 2022 20:33:07 +0200 Subject: [PATCH 10/12] fix input validation --- graphein/ml/datasets/torch_geometric_dataset.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/graphein/ml/datasets/torch_geometric_dataset.py b/graphein/ml/datasets/torch_geometric_dataset.py index 6b047e27..d4555513 100644 --- a/graphein/ml/datasets/torch_geometric_dataset.py +++ b/graphein/ml/datasets/torch_geometric_dataset.py @@ -414,7 +414,7 @@ def __init__( if chain_selections is not None: self.chain_selection_map = dict(enumerate(chain_selections)) else: - self.graph_label_map = None + self.chain_selection_map = None self.validate_input() self.bad_pdbs: List[str] = [] @@ -451,15 +451,15 @@ def processed_file_names(self) -> List[str]: return [f"{pdb}.pt" for pdb in self.structures] def validate_input(self): - if hasattr(self, "graph_label_map"): + if self.graph_label_map is not None: assert len(self.structures) == len( self.graph_label_map ), "Number of proteins and graph labels must match" - if hasattr(self, "node_label_map"): + if self.node_label_map is not None: assert len(self.structures) == len( self.node_label_map ), "Number of proteins and node labels must match" - if hasattr(self, "chain_selection_map"): + if self.chain_selection_map is not None: assert len(self.structures) == len( self.chain_selection_map ), "Number of proteins and chain selections must match" From 0ac163acc0bfca28e8a3c6a63a947bd3b62bb878 Mon Sep 17 00:00:00 2001 From: a-r-j Date: Tue, 2 Aug 2022 00:25:11 +0200 Subject: [PATCH 11/12] fix input validation --- .../ml/datasets/torch_geometric_dataset.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/graphein/ml/datasets/torch_geometric_dataset.py b/graphein/ml/datasets/torch_geometric_dataset.py index d4555513..3665918f 100644 --- a/graphein/ml/datasets/torch_geometric_dataset.py +++ b/graphein/ml/datasets/torch_geometric_dataset.py @@ -9,7 +9,7 @@ import logging as log import os from pathlib import Path -from typing import Callable, Dict, List, Optional +from typing import Callable, Dict, Generator, List, Optional import networkx as nx from tqdm import tqdm @@ -533,7 +533,7 @@ def process(self): # Chunk dataset for parallel processing chunk_size = 128 - def divide_chunks(l: List[str], n: int = 2) -> List[List[str]]: + def divide_chunks(l: List[str], n: int = 2) -> Generator: for i in range(0, len(l), n): yield l[i : i + n] @@ -587,12 +587,16 @@ def divide_chunks(l: List[str], n: int = 2) -> List[List[str]]: data_list = [self.pre_transform(data) for data in data_list] for i, (pdb, chain) in enumerate(zip(pdbs, chain_selections)): - - torch.save( - data_list[i], - os.path.join(self.processed_dir, f"{pdb}_{chain}.pt"), - ) - idx += 1 + if self.chain_selection_map is None: + torch.save( + data_list[i], + os.path.join(self.processed_dir, f"{pdb}.pt"), + ) + else: + torch.save( + data_list[i], + os.path.join(self.processed_dir, f"{pdb}_{chain}.pt"), + ) def get(self, idx: int): """ From 4d3684f1d9a361ab653b41e9cbbbdf87b9b86adb Mon Sep 17 00:00:00 2001 From: a-r-j Date: Tue, 2 Aug 2022 01:14:20 +0200 Subject: [PATCH 12/12] update changelog --- CHANGELOG.md | 5 ++++- docs/source/conf.py | 2 +- graphein/__init__.py | 2 +- setup.py | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c2f1f7d..80ccadf3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,12 +12,15 @@ * [Patch] - [#187](https://github.com/a-r-j/graphein/pull/187) updates sequence retrieval due to UniProt API changes. * [Patch] - [#189](https://github.com/a-r-j/graphein/pull/189) fixes bug where chains and PDB identifiers were not properly aligned in `ml.ProteinGraphDataset`. +* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Adds missing `MSE` to `graphein.protein.resi_atoms.RESI_NAMES`, `graphein.protein.resi_atoms.RESI_THREE_TO_1`. [#200](https://github.com/a-r-j/graphein/issues/200) +* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug where check for same-chain always evaluates as False. [#199](https://github.com/a-r-j/graphein/issues/199) +* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug where deprotonation would only remove hydrogens based on `atom_name` rather than `element_symbol`. [#198](https://github.com/a-r-j/graphein/issues/198) +* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug in ProteinGraphDataset input validation. #### Breaking Changes * [#189](https://github.com/a-r-j/graphein/pull/189/) refactors PDB download util. Now returns path to download file, does not accept a config object but instead receives the output directory path directly. - ### 1.5.0 #### Protein diff --git a/docs/source/conf.py b/docs/source/conf.py index 2dd8f823..f3a0a9ff 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -29,7 +29,7 @@ author = "Arian Jamasb" # The full version, including alpha/beta/rc tags -release = "1.5.0" +release = "1.5.1" # -- General configuration --------------------------------------------------- diff --git a/graphein/__init__.py b/graphein/__init__.py index 8b58fc84..97a72298 100644 --- a/graphein/__init__.py +++ b/graphein/__init__.py @@ -12,7 +12,7 @@ from .testing import * __author__ = "Arian Jamasb " -__version__ = "1.5.0" +__version__ = "1.5.1" logger.configure( diff --git a/setup.py b/setup.py index c20c0416..df550f23 100644 --- a/setup.py +++ b/setup.py @@ -135,7 +135,7 @@ def run(self): setup( name="graphein", - version="1.5.0", + version="1.5.1", # versioneer.get_version(), # cmdclass=versioneer.get_cmdclass(), description="Protein & Interactomic Graph Construction for Machine Learning",