Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementation of parallelization to MDAnalysis.analysis.dssp #4721

Merged
merged 12 commits into from
Oct 7, 2024
3 changes: 2 additions & 1 deletion package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ Enhancements
* explicitly mark `analysis.pca.PCA` as not parallelizable (Issue #4680)
* enables parallelization for analysis.bat.BAT (Issue #4663)
* enable parallelization for analysis.dihedrals.{Dihedral,Ramachandran,Janin}
(Issue #4673)
(Issue #4673)
* enables parallelization for analysis.dssp.dssp.DSSP (Issue #4674)
* Enables parallelization for analysis.hydrogenbonds.hbond_analysis.HydrogenBondAnalysis (Issue #4664)
* Improve error message for `AtomGroup.unwrap()` when bonds are not present.(Issue #4436, PR #4642)
* Add `analysis.DSSP` module for protein secondary structure assignment, based on [pydssp](https://github.com/ShintaroMinami/PyDSSP)
Expand Down
29 changes: 24 additions & 5 deletions package/MDAnalysis/analysis/dssp/dssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@
import numpy as np
from MDAnalysis import Universe, AtomGroup

from ..base import AnalysisBase
from ..base import AnalysisBase, ResultsGroup
from ...due import due, Doi

due.cite(
Expand Down Expand Up @@ -196,7 +196,7 @@ class DSSP(AnalysisBase):
.. Warning::
For DSSP to work properly, your atoms must represent a protein. The
hydrogen atom bound to the backbone nitrogen atom is matched by name
as given by the keyword argument `hydrogen_atom`. There may only be
as given by the keyword argument `hydrogen_atom`. There may only be
a single backbone nitrogen hydrogen atom per residue; the one exception
is proline, for which there should not exist any such hydrogens.
The default value of `hydrogen_atom` should handle the common naming
Expand Down Expand Up @@ -229,8 +229,8 @@ class DSSP(AnalysisBase):
(except proline), namely the one bound to the backbone nitrogen.

.. Note::
To work with different hydrogen-naming conventions by default, the
default selection is broad but if hydrogens are incorrectly selected
To work with different hydrogen-naming conventions by default, the
default selection is broad but if hydrogens are incorrectly selected
(e.g., a :exc:`ValueError` is raised) you must customize `hydrogen_name`
for your specific case.

Expand Down Expand Up @@ -263,7 +263,7 @@ class DSSP(AnalysisBase):
The :attr:`results.dssp_ndarray` attribute holds a
``(n_frames, n_residues, 3)`` shape ndarray with a *one-hot encoding*
of *loop* '-' (index 0), *helix* 'H' (index 1), and *sheet* 'E'
(index 2), respectively for each frame of the trajectory. It can be
(index 2), respectively for each frame of the trajectory. It can be
used to compute, for instance, the **average secondary structure**:

>>> from MDAnalysis.analysis.dssp import translate, DSSP
Expand All @@ -276,8 +276,23 @@ class DSSP(AnalysisBase):


.. versionadded:: 2.8.0

Enabled **parallel execution** with the ``multiprocessing`` and ``dask``
backends; use the new method :meth:`get_supported_backends` to see all
supported backends.
"""

_analysis_algorithm_is_parallelizable = True

@classmethod
def get_supported_backends(cls):
return (
"serial",
"multiprocessing",
"dask",
)


def __init__(
self,
atoms: Union[Universe, AtomGroup],
Expand Down Expand Up @@ -382,6 +397,10 @@ def _conclude(self):
self.results.dssp_ndarray = np.array(self.results.dssp_ndarray)
self.results.resids = self._heavy_atoms["CA"].resids

def _get_aggregator(self):
return ResultsGroup(
lookup={"dssp_ndarray": ResultsGroup.flatten_sequence},
)

def translate(onehot: np.ndarray) -> np.ndarray:
"""Translate a one-hot encoding summary into char-based secondary structure
Expand Down
11 changes: 10 additions & 1 deletion testsuite/MDAnalysisTests/analysis/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from MDAnalysis.analysis.dihedrals import Dihedral, Ramachandran, Janin
from MDAnalysis.analysis.bat import BAT
from MDAnalysis.analysis.gnm import GNMAnalysis
from MDAnalysis.analysis.dssp.dssp import DSSP
from MDAnalysis.analysis.hydrogenbonds.hbond_analysis import (
HydrogenBondAnalysis,
)
Expand Down Expand Up @@ -127,8 +128,16 @@ def client_GNMAnalysis(request):
def client_BAT(request):
return request.param


# MDAnalysis.analysis.dssp.dssp

@pytest.fixture(scope="module", params=params_for_cls(DSSP))
def client_DSSP(request):
return request.param


# MDAnalysis.analysis.hydrogenbonds

@pytest.fixture(scope='module', params=params_for_cls(HydrogenBondAnalysis))
def client_HydrogenBondAnalysis(request):
return request.param
return request.param
31 changes: 16 additions & 15 deletions testsuite/MDAnalysisTests/analysis/test_dssp.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,41 +10,41 @@
# Files that match glob pattern '????.pdb.gz' and matching '????.pdb.dssp' files,
# containing the secondary structure assignment string, will be tested automatically.
@pytest.mark.parametrize("pdb_filename", glob.glob(f"{DSSP_FOLDER}/?????.pdb.gz"))
def test_file_guess_hydrogens(pdb_filename):
def test_file_guess_hydrogens(pdb_filename, client_DSSP):
u = mda.Universe(pdb_filename)
with open(f"{pdb_filename.rstrip('.gz')}.dssp", "r") as fin:
correct_answ = fin.read().strip().split()[0]

run = DSSP(u, guess_hydrogens=True).run()
run = DSSP(u, guess_hydrogens=True).run(**client_DSSP)
answ = "".join(run.results.dssp[0])
assert answ == correct_answ


def test_trajectory():
def test_trajectory(client_DSSP):
u = mda.Universe(TPR, XTC).select_atoms("protein").universe
run = DSSP(u).run(stop=10)
run = DSSP(u).run(**client_DSSP, stop=10)
first_frame = "".join(run.results.dssp[0])
last_frame = "".join(run.results.dssp[-1])
avg_frame = "".join(translate(run.results.dssp_ndarray.mean(axis=0)))

assert first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---"
protein = mda.Universe(TPR, XTC).select_atoms("protein")
run = DSSP(protein).run(stop=10)
run = DSSP(protein).run(**client_DSSP, stop=10)


def test_atomgroup():
def test_atomgroup(client_DSSP):
protein = mda.Universe(TPR, XTC).select_atoms("protein")
run = DSSP(protein).run(stop=10)
run = DSSP(protein).run(**client_DSSP, stop=10)
first_frame = "".join(run.results.dssp[0])
last_frame = "".join(run.results.dssp[-1])
avg_frame = "".join(translate(run.results.dssp_ndarray.mean(axis=0)))

assert first_frame[:10] != last_frame[:10] == avg_frame[:10] == "-EEEEEE---"


def test_trajectory_with_hydrogens():
def test_trajectory_with_hydrogens(client_DSSP):
u = mda.Universe(TPR, XTC).select_atoms("protein").universe
run = DSSP(u, guess_hydrogens=False).run(stop=10)
run = DSSP(u, guess_hydrogens=False).run(**client_DSSP, stop=10)
first_frame = "".join(run.results.dssp[0])
last_frame = "".join(run.results.dssp[-1])
avg_frame = "".join(translate(run.results.dssp_ndarray.mean(axis=0)))
Expand All @@ -53,28 +53,29 @@ def test_trajectory_with_hydrogens():


@pytest.mark.parametrize("pdb_filename", glob.glob(f"{DSSP_FOLDER}/2xdgA.pdb.gz"))
def test_trajectory_without_hydrogen_fails(pdb_filename):
def test_trajectory_without_hydrogen_fails(pdb_filename, client_DSSP):
u = mda.Universe(pdb_filename)
with pytest.raises(ValueError):
DSSP(u, guess_hydrogens=False).run()
DSSP(u, guess_hydrogens=False).run(**client_DSSP)


@pytest.mark.parametrize(
"pdb_filename", glob.glob(f"{DSSP_FOLDER}/1mr1D_failing.pdb.gz")
)
def test_trajectory_with_uneven_number_of_atoms_fails(pdb_filename):
def test_trajectory_with_uneven_number_of_atoms_fails(pdb_filename,
client_DSSP):
u = mda.Universe(pdb_filename)
with pytest.raises(ValueError):
DSSP(u, guess_hydrogens=True).run()
DSSP(u, guess_hydrogens=True).run(**client_DSSP)


@pytest.mark.parametrize(
"pdb_filename", glob.glob(f"{DSSP_FOLDER}/wrong_hydrogens.pdb.gz")
)
def test_exception_raises_with_atom_index(pdb_filename):
def test_exception_raises_with_atom_index(pdb_filename, client_DSSP):
u = mda.Universe(pdb_filename)
with pytest.raises(
ValueError,
match="Residue <Residue SER, 298> contains*",
):
DSSP(u, guess_hydrogens=False).run()
DSSP(u, guess_hydrogens=False).run(**client_DSSP)
Loading