Skip to content

Commit

Permalink
Bugfixes #198 #199 #200 (#201)
Browse files Browse the repository at this point in the history
* correct deprotonation selection #198

* Add MSE to constants #200

* fix chain id check in atomic edges #199

* missing comma

* fix code smell

* black

* fix input validation

* fix input validation

* fix input validation

* fix input validation

* fix input validation

* update changelog
  • Loading branch information
a-r-j authored Aug 1, 2022
1 parent bb4ba76 commit c07fc78
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 33 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,15 @@

* [Patch] - [#187](https://github.com/a-r-j/graphein/pull/187) updates sequence retrieval due to UniProt API changes.
* [Patch] - [#189](https://github.com/a-r-j/graphein/pull/189) fixes bug where chains and PDB identifiers were not properly aligned in `ml.ProteinGraphDataset`.
* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Adds the missing residue `MSE` (selenomethionine) to both `graphein.protein.resi_atoms.RESI_NAMES` and `graphein.protein.resi_atoms.RESI_THREE_TO_1` (mapped to `M`). [#200](https://github.com/a-r-j/graphein/issues/200)
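* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug in `graphein.protein.edges.atomic.add_atomic_edges` where the same-chain check always evaluated to `False`, so atom pairs from different chains were never skipped. The check now compares the two chain IDs directly. [#199](https://github.com/a-r-j/graphein/issues/199)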
* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug where deprotonation selected hydrogens by `atom_name` instead of `element_symbol`, so only atoms whose name matched `H` were removed. [#198](https://github.com/a-r-j/graphein/issues/198)
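* [Patch] - [#201](https://github.com/a-r-j/graphein/pull/201) Fixes bug in `ml.ProteinGraphDataset` input validation, which failed when graph labels, node labels or chain selections were not provided (`None`). Each check is now skipped when the corresponding input is `None`.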

#### Breaking Changes

* [#189](https://github.com/a-r-j/graphein/pull/189/) refactors the PDB download util. It now returns the path to the downloaded file, and it no longer accepts a config object; instead, it receives the output directory path directly.


### 1.5.0

#### Protein
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
author = "Arian Jamasb"

# The full version, including alpha/beta/rc tags
release = "1.5.0"
release = "1.5.1"


# -- General configuration ---------------------------------------------------
Expand Down
2 changes: 1 addition & 1 deletion graphein/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from .testing import *

__author__ = "Arian Jamasb <[email protected]>"
__version__ = "1.5.0"
__version__ = "1.5.1"


logger.configure(
Expand Down
61 changes: 34 additions & 27 deletions graphein/ml/datasets/torch_geometric_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import logging as log
import os
from pathlib import Path
from typing import Callable, Dict, List, Optional
from typing import Callable, Dict, Generator, List, Optional

import networkx as nx
from tqdm import tqdm
Expand Down Expand Up @@ -414,9 +414,9 @@ def __init__(
if chain_selections is not None:
self.chain_selection_map = dict(enumerate(chain_selections))
else:
self.graph_label_map = None
self.chain_selection_map = None
self.validate_input()
self.bad_pdbs: List[str] = []
self.bad_pdbs: List[str] = []

# Configs
self.config = graphein_config
Expand Down Expand Up @@ -451,23 +451,26 @@ def processed_file_names(self) -> List[str]:
return [f"{pdb}.pt" for pdb in self.structures]

def validate_input(self):
assert len(self.structures) == len(
self.graph_label_map
), "Number of proteins and graph labels must match"
assert len(self.structures) == len(
self.node_label_map
), "Number of proteins and node labels must match"
assert len(self.structures) == len(
self.chain_selection_map
), "Number of proteins and chain selections must match"
assert len(
{
f"{pdb}_{chain}"
for pdb, chain in zip(
self.structures, self.chain_selection_map
)
}
) == len(self.structures), "Duplicate protein/chain combinations"
if self.graph_label_map is not None:
assert len(self.structures) == len(
self.graph_label_map
), "Number of proteins and graph labels must match"
if self.node_label_map is not None:
assert len(self.structures) == len(
self.node_label_map
), "Number of proteins and node labels must match"
if self.chain_selection_map is not None:
assert len(self.structures) == len(
self.chain_selection_map
), "Number of proteins and chain selections must match"
assert len(
{
f"{pdb}_{chain}"
for pdb, chain in zip(
self.structures, self.chain_selection_map
)
}
) == len(self.structures), "Duplicate protein/chain combinations"

def download(self):
"""Download the PDB files from RCSB or Alphafold."""
Expand Down Expand Up @@ -530,7 +533,7 @@ def process(self):
# Chunk dataset for parallel processing
chunk_size = 128

def divide_chunks(l: List[str], n: int = 2) -> List[List[str]]:
def divide_chunks(l: List[str], n: int = 2) -> Generator:
for i in range(0, len(l), n):
yield l[i : i + n]

Expand Down Expand Up @@ -584,12 +587,16 @@ def divide_chunks(l: List[str], n: int = 2) -> List[List[str]]:
data_list = [self.pre_transform(data) for data in data_list]

for i, (pdb, chain) in enumerate(zip(pdbs, chain_selections)):

torch.save(
data_list[i],
os.path.join(self.processed_dir, f"{pdb}_{chain}.pt"),
)
idx += 1
if self.chain_selection_map is None:
torch.save(
data_list[i],
os.path.join(self.processed_dir, f"{pdb}.pt"),
)
else:
torch.save(
data_list[i],
os.path.join(self.processed_dir, f"{pdb}_{chain}.pt"),
)

def get(self, idx: int):
"""
Expand Down
2 changes: 1 addition & 1 deletion graphein/protein/edges/atomic.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def add_atomic_edges(G: nx.Graph, tolerance: float = 0.56) -> nx.Graph:
continue

# Check atoms are in the same chain
if not (chain_1 and chain_2):
if chain_1 != chain_2:
continue

if G.has_edge(node_1, node_2):
Expand Down
2 changes: 1 addition & 1 deletion graphein/protein/graphs.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def deprotonate_structure(df: pd.DataFrame) -> pd.DataFrame:
"Deprotonating protein. This removes H atoms from the pdb_df dataframe"
)
return filter_dataframe(
df, by_column="atom_name", list_of_values=["H"], boolean=False
df, by_column="element_symbol", list_of_values=["H"], boolean=False
)


Expand Down
2 changes: 2 additions & 0 deletions graphein/protein/resi_atoms.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,7 @@
"LYS",
"MET",
"MLE",
"MSE",
"MVA",
"NH2",
"NLE",
Expand Down Expand Up @@ -434,6 +435,7 @@
"LYS": "K",
"MET": "M",
"MLE": "L",
"MSE": "M",
"MVA": "V",
"NH2": "X",
"NLE": "L",
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def run(self):

setup(
name="graphein",
version="1.5.0",
version="1.5.1",
# versioneer.get_version(),
# cmdclass=versioneer.get_cmdclass(),
description="Protein & Interactomic Graph Construction for Machine Learning",
Expand Down

0 comments on commit c07fc78

Please sign in to comment.