Skip to content

Commit

Permalink
Implementation of parallelization to MDAnalysis.analysis.dssp (#4721)
Browse files Browse the repository at this point in the history
- Fixes #4674 
- Parallelization of the backend support to the class DSSP in dssp.py
- Addition of parallelization tests in test_dssp.py and fixtures in conftest.py
- Updated Changelog

---------

Co-authored-by: Yuxuan Zhuang <[email protected]>
  • Loading branch information
talagayev and yuxuanzhuang authored Oct 7, 2024
1 parent 474be5b commit 740e74e
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 22 deletions.
3 changes: 2 additions & 1 deletion package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ Enhancements
* explicitly mark `analysis.pca.PCA` as not parallelizable (Issue #4680)
* enables parallelization for analysis.bat.BAT (Issue #4663)
* enable parallelization for analysis.dihedrals.{Dihedral,Ramachandran,Janin}
(Issue #4673)
(Issue #4673)
* enables parallelization for analysis.dssp.dssp.DSSP (Issue #4674)
* Enables parallelization for analysis.hydrogenbonds.hbond_analysis.HydrogenBondAnalysis (Issue #4664)
* Improve error message for `AtomGroup.unwrap()` when bonds are not present.(Issue #4436, PR #4642)
* Add `analysis.DSSP` module for protein secondary structure assignment, based on [pydssp](https://github.com/ShintaroMinami/PyDSSP)
Expand Down
29 changes: 24 additions & 5 deletions package/MDAnalysis/analysis/dssp/dssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@
import numpy as np
from MDAnalysis import Universe, AtomGroup

from ..base import AnalysisBase
from ..base import AnalysisBase, ResultsGroup
from ...due import due, Doi

due.cite(
Expand Down Expand Up @@ -196,7 +196,7 @@ class DSSP(AnalysisBase):
.. Warning::
For DSSP to work properly, your atoms must represent a protein. The
hydrogen atom bound to the backbone nitrogen atom is matched by name
as given by the keyword argument `hydrogen_atom`. There may only be
as given by the keyword argument `hydrogen_atom`. There may only be
a single backbone nitrogen hydrogen atom per residue; the one exception
is proline, for which there should not exist any such hydrogens.
The default value of `hydrogen_atom` should handle the common naming
Expand Down Expand Up @@ -229,8 +229,8 @@ class DSSP(AnalysisBase):
(except proline), namely the one bound to the backbone nitrogen.
.. Note::
To work with different hydrogen-naming conventions by default, the
default selection is broad but if hydrogens are incorrectly selected
To work with different hydrogen-naming conventions by default, the
default selection is broad but if hydrogens are incorrectly selected
(e.g., a :exc:`ValueError` is raised) you must customize `hydrogen_name`
for your specific case.
Expand Down Expand Up @@ -263,7 +263,7 @@ class DSSP(AnalysisBase):
The :attr:`results.dssp_ndarray` attribute holds a
``(n_frames, n_residues, 3)`` shape ndarray with a *one-hot encoding*
of *loop* '-' (index 0), *helix* 'H' (index 1), and *sheet* 'E'
(index 2), respectively for each frame of the trajectory. It can be
(index 2), respectively for each frame of the trajectory. It can be
used to compute, for instance, the **average secondary structure**:
>>> from MDAnalysis.analysis.dssp import translate, DSSP
Expand All @@ -276,8 +276,23 @@ class DSSP(AnalysisBase):
.. versionadded:: 2.8.0
Enabled **parallel execution** with the ``multiprocessing`` and ``dask``
backends; use the new method :meth:`get_supported_backends` to see all
supported backends.
"""

_analysis_algorithm_is_parallelizable = True

@classmethod
def get_supported_backends(cls):
return (
"serial",
"multiprocessing",
"dask",
)


def __init__(
self,
atoms: Union[Universe, AtomGroup],
Expand Down Expand Up @@ -382,6 +397,10 @@ def _conclude(self):
self.results.dssp_ndarray = np.array(self.results.dssp_ndarray)
self.results.resids = self._heavy_atoms["CA"].resids

def _get_aggregator(self):
return ResultsGroup(
lookup={"dssp_ndarray": ResultsGroup.flatten_sequence},
)

def translate(onehot: np.ndarray) -> np.ndarray:
"""Translate a one-hot encoding summary into char-based secondary structure
Expand Down
11 changes: 10 additions & 1 deletion testsuite/MDAnalysisTests/analysis/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from MDAnalysis.analysis.dihedrals import Dihedral, Ramachandran, Janin
from MDAnalysis.analysis.bat import BAT
from MDAnalysis.analysis.gnm import GNMAnalysis
from MDAnalysis.analysis.dssp.dssp import DSSP
from MDAnalysis.analysis.hydrogenbonds.hbond_analysis import (
HydrogenBondAnalysis,
)
Expand Down Expand Up @@ -127,8 +128,16 @@ def client_GNMAnalysis(request):
def client_BAT(request):
return request.param


# MDAnalysis.analysis.dssp.dssp

@pytest.fixture(scope="module", params=params_for_cls(DSSP))
def client_DSSP(request):
return request.param


# MDAnalysis.analysis.hydrogenbonds

@pytest.fixture(scope='module', params=params_for_cls(HydrogenBondAnalysis))
def client_HydrogenBondAnalysis(request):
return request.param
return request.param
31 changes: 16 additions & 15 deletions testsuite/MDAnalysisTests/analysis/test_dssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,41 +10,41 @@
# Files that match glob pattern '????.pdb.gz' and matching '????.pdb.dssp' files,
# containing the secondary structure assignment string, will be tested automatically.
@pytest.mark.parametrize("pdb_filename", glob.glob(f"{DSSP_FOLDER}/?????.pdb.gz"))
def test_file_guess_hydrogens(pdb_filename):
def test_file_guess_hydrogens(pdb_filename, client_DSSP):
u = mda.Universe(pdb_filename)
with open(f"{pdb_filename.rstrip('.gz')}.dssp", "r") as fin:
correct_answ = fin.read().strip().split()[0]

run = DSSP(u, guess_hydrogens=True).run()
run = DSSP(u, guess_hydrogens=True).run(**client_DSSP)
answ = "".join(run.results.dssp[0])
assert answ == correct_answ


def test_trajectory():
def test_trajectory(client_DSSP):
u = mda.Universe(TPR, XTC).select_atoms("protein").universe
run = DSSP(u).run(stop=10)
run = DSSP(u).run(**client_DSSP, stop=10)
first_frame = "".join(run.results.dssp[0])
last_frame = "".join(run.results.dssp[-1])
avg_frame = "".join(translate(run.results.dssp_ndarray.mean(axis=0)))

assert first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---"
protein = mda.Universe(TPR, XTC).select_atoms("protein")
run = DSSP(protein).run(stop=10)
run = DSSP(protein).run(**client_DSSP, stop=10)


def test_atomgroup():
def test_atomgroup(client_DSSP):
protein = mda.Universe(TPR, XTC).select_atoms("protein")
run = DSSP(protein).run(stop=10)
run = DSSP(protein).run(**client_DSSP, stop=10)
first_frame = "".join(run.results.dssp[0])
last_frame = "".join(run.results.dssp[-1])
avg_frame = "".join(translate(run.results.dssp_ndarray.mean(axis=0)))

assert first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---"


def test_trajectory_with_hydrogens():
def test_trajectory_with_hydrogens(client_DSSP):
u = mda.Universe(TPR, XTC).select_atoms("protein").universe
run = DSSP(u, guess_hydrogens=False).run(stop=10)
run = DSSP(u, guess_hydrogens=False).run(**client_DSSP, stop=10)
first_frame = "".join(run.results.dssp[0])
last_frame = "".join(run.results.dssp[-1])
avg_frame = "".join(translate(run.results.dssp_ndarray.mean(axis=0)))
Expand All @@ -53,28 +53,29 @@ def test_trajectory_with_hydrogens():


@pytest.mark.parametrize("pdb_filename", glob.glob(f"{DSSP_FOLDER}/2xdgA.pdb.gz"))
def test_trajectory_without_hydrogen_fails(pdb_filename):
def test_trajectory_without_hydrogen_fails(pdb_filename, client_DSSP):
u = mda.Universe(pdb_filename)
with pytest.raises(ValueError):
DSSP(u, guess_hydrogens=False).run()
DSSP(u, guess_hydrogens=False).run(**client_DSSP)


@pytest.mark.parametrize(
"pdb_filename", glob.glob(f"{DSSP_FOLDER}/1mr1D_failing.pdb.gz")
)
def test_trajectory_with_uneven_number_of_atoms_fails(pdb_filename):
def test_trajectory_with_uneven_number_of_atoms_fails(pdb_filename,
client_DSSP):
u = mda.Universe(pdb_filename)
with pytest.raises(ValueError):
DSSP(u, guess_hydrogens=True).run()
DSSP(u, guess_hydrogens=True).run(**client_DSSP)


@pytest.mark.parametrize(
"pdb_filename", glob.glob(f"{DSSP_FOLDER}/wrong_hydrogens.pdb.gz")
)
def test_exception_raises_with_atom_index(pdb_filename):
def test_exception_raises_with_atom_index(pdb_filename, client_DSSP):
u = mda.Universe(pdb_filename)
with pytest.raises(
ValueError,
match="Residue <Residue SER, 298> contains*",
):
DSSP(u, guess_hydrogens=False).run()
DSSP(u, guess_hydrogens=False).run(**client_DSSP)

0 comments on commit 740e74e

Please sign in to comment.