Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add formalcharge attribute and read/write them from PDB #3755

Merged
merged 9 commits into from
Jul 23, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ Fixes
* add a 0.5 for correct midpoints in hole analysis (Issue #3715)

Enhancements
* Add a new `formalcharge` attribute for storing formal charges (PR #3755)
* Add the ability to read & write formal charges from PDB files (PR #3755)
* Add `norm` parameter to InterRDF, InterRDF_s to normalize as rdf,
number density or do not normalize at all. (Issue #3687)
* Additional logger.info output when per-frame analysis starts (PR #3710)
Expand Down
47 changes: 45 additions & 2 deletions package/MDAnalysis/coordinates/PDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -552,12 +552,12 @@ class PDBWriter(base.WriterBase):
"ATOM {serial:5d} {name:<4s}{altLoc:<1s}{resName:<4s}"
"{chainID:1s}{resSeq:4d}{iCode:1s}"
" {pos[0]:8.3f}{pos[1]:8.3f}{pos[2]:8.3f}{occupancy:6.2f}"
"{tempFactor:6.2f} {segID:<4s}{element:>2s}\n"),
"{tempFactor:6.2f} {segID:<4s}{element:>2s}{charge:2s}\n"),
'HETATM': (
"HETATM{serial:5d} {name:<4s}{altLoc:<1s}{resName:<4s}"
"{chainID:1s}{resSeq:4d}{iCode:1s}"
" {pos[0]:8.3f}{pos[1]:8.3f}{pos[2]:8.3f}{occupancy:6.2f}"
"{tempFactor:6.2f} {segID:<4s}{element:>2s}\n"),
"{tempFactor:6.2f} {segID:<4s}{element:>2s}{charge:2s}\n"),
'REMARK': "REMARK {0}\n",
'COMPND': "COMPND {0}\n",
'HEADER': "HEADER {0}\n",
Expand Down Expand Up @@ -1023,6 +1023,47 @@ def _deduce_PDB_atom_name(self, atomname, resname):
return '{:<4}'.format(atomname)
return ' {:<3}'.format(atomname)

@staticmethod
def _format_PDB_charges(charges: np.ndarray) -> np.ndarray:
"""Format formal charges to match PDB requirements.

Formal charge entry is set to empty if charge is 0, otherwise the
charge is set to a two character ```<charge value><charge sign>``
entry, e.g. ``1+`` or ``2-``.

This method also verifies that formal charges can adhere to the PDB
format (i.e. charge cannot be > 9 or < -9).

Parameters
----------
charges: np.ndarray
NumPy array of integers representing the formal charges of
the atoms being written.

Returns
-------
np.ndarray
NumPy array of dtype object with strings representing the
formal charges of the atoms being written.
"""
if charges.dtype != int:
IAlibay marked this conversation as resolved.
Show resolved Hide resolved
raise ValueError("formal charges array should be of `int` type")

outcharges = charges.astype(object)
outcharges[outcharges == 0] = '' # empty strings for no charge case
# using np.where is more efficient than looping in sparse cases
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're looking at ~ 4x speedup for sparse arrays (which this will always be)

for i in np.where(charges < 0)[0]:
if charges[i] < -9:
errmsg = "formal charge < -9 is not supported by PDB standard"
raise ValueError(errmsg)
outcharges[i] = f"{abs(charges[i])}-"
for i in np.where(charges > 0)[0]:
if charges[i] > 9:
errmsg = "formal charge > 9 is not supported by PDB standard"
raise ValueError(errmsg)
outcharges[i] = f"{charges[i]}+"
return outcharges

def _write_timestep(self, ts, multiframe=False):
"""Write a new timestep *ts* to file

Expand Down Expand Up @@ -1093,6 +1134,7 @@ def get_attr(attrname, default):
atomnames = get_attr('names', 'X')
elements = get_attr('elements', ' ')
record_types = get_attr('record_types', 'ATOM')
formal_charges = self._format_PDB_charges(get_attr('formalcharges', 0))

def validate_chainids(chainids, default):
"""Validate each atom's chainID
Expand Down Expand Up @@ -1158,6 +1200,7 @@ def validate_chainids(chainids, default):
vals['segID'] = segids[i][:4]
vals['chainID'] = chainids[i]
vals['element'] = elements[i][:2].upper()
vals['charge'] = formal_charges[i]

# record_type attribute, if exists, can be ATOM or HETATM
try:
Expand Down
5 changes: 5 additions & 0 deletions package/MDAnalysis/core/groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -3041,6 +3041,11 @@ def select_atoms(self, sel, *othersel, periodic=True, rtol=1e-05,
``S`` will be possible options but other values will not raise
an error.

formalcharge *formal-charge*
select atoms based on their formal charge, e.g.
``name O and formalcharge -1`` to select all oxygens with a
negative 1 formal charge.

**Boolean**

not
Expand Down
12 changes: 12 additions & 0 deletions package/MDAnalysis/core/topologyattrs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2033,6 +2033,18 @@ def total_charge(group, compound='group'):
('total_charge', total_charge))


class FormalCharges(AtomAttr):
"""Formal charge on each atom"""
attrname = 'formalcharges'
singular = 'formalcharge'
per_object = 'atom'
dtype = int

@staticmethod
def _gen_initial_values(na, nr, ns):
return np.zeros(na)


# TODO: update docs to property doc
class Occupancies(AtomAttr):
attrname = 'occupancies'
Expand Down
6 changes: 6 additions & 0 deletions package/MDAnalysis/topology/ExtendedPDBParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ class ExtendedPDBParser(PDBParser.PDBParser):
- resids
- resnames
- segids
- elements
- bonds
- formalcharges

Guesses the following Attributes:
- masses

See Also
--------
Expand Down
36 changes: 33 additions & 3 deletions package/MDAnalysis/topology/PDBParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
Resnums,
Segids,
Tempfactors,
FormalCharges,
)


Expand Down Expand Up @@ -155,7 +156,7 @@ def hy36decode(width, s):
class PDBParser(TopologyReaderBase):
"""Parser that obtains a list of atoms from a standard PDB file.

Creates the following Attributes:
Creates the following Attributes (if present):
- names
- chainids
- tempfactors
Expand All @@ -166,6 +167,7 @@ class PDBParser(TopologyReaderBase):
- segids
- elements
- bonds
- formalcharges

Guesses the following Attributes:
- masses
Expand All @@ -186,6 +188,10 @@ class PDBParser(TopologyReaderBase):
are now assigned (Issue #2422).
Aliased ``bfactors`` topologyattribute to ``tempfactors``.
``bfactors`` is deprecated and will be removed in 3.0 (Issue #1901)
.. versionadded:: 2.3.0
IAlibay marked this conversation as resolved.
Show resolved Hide resolved
Formal charges are now read from PDB files if present. No formalcharge
attribute is create if no formal charges are present in the PDB file.
IAlibay marked this conversation as resolved.
Show resolved Hide resolved
Any formal charges not set are assumed to have a value of 0.
"""
format = ['PDB', 'ENT']

Expand Down Expand Up @@ -222,12 +228,11 @@ def _parseatoms(self):
icodes = []
tempfactors = []
occupancies = []

resids = []
resnames = []

segids = []
elements = []
formalcharges = []

self._wrapped_serials = False # did serials go over 100k?
last_wrapped_serial = 100000 # if serials wrap, start from here
Expand Down Expand Up @@ -260,6 +265,7 @@ def _parseatoms(self):
resnames.append(line[17:21].strip())
chainids.append(line[21:22].strip())
elements.append(line[76:78].strip())
formalcharges.append(line[78:80].strip())

# Resids are optional
try:
Expand Down Expand Up @@ -335,6 +341,30 @@ def _parseatoms(self):
validated_elements.append('')
attrs.append(Elements(np.array(validated_elements, dtype=object)))

if any(formalcharges):
for i, entry in enumerate(formalcharges):
if not entry == '':
if entry == '0':
# Technically a lack of charge shouldn't be in the PDB
# but MDA has a few files that specifically have 0
# entries, indicating that some folks interpret 0 as
# an allowed entry
formalcharges[i] = 0
elif ('+' in entry) or ('-' in entry):
try:
formalcharges[i] = int(entry[::-1])
except ValueError:
errmsg = (f"Unknown formal charge {entry} "
"encountered")
raise ValueError(errmsg)
else:
errmsg = (f"Formal charge {entry} is unrecognized")
raise ValueError(errmsg)
else:
formalcharges[i] = 0
attrs.append(
FormalCharges(np.array(formalcharges, dtype=np.int32)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See above question about int vs int32

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Switching to int here since the topologyattr code will just re-type to this anyways unless dtype is set to None.

That being said, charges really shouldn't exceed the bounds of int8, so one wonders how wasteful this is... (can't remember if int32/int64 still has the weird behaviour of ending up faster than int8/int16 in some weird cases)


masses = guess_masses(atomtypes)
attrs.append(Masses(masses, guessed=True))

Expand Down
5 changes: 5 additions & 0 deletions package/doc/sphinx/source/documentation_pages/selections.rst
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,11 @@ chiral *R | S*
to select only S-chiral carbon atoms. Only ``R`` and ``S`` will be
possible options but other values will not raise an error.

formalcharge *formal-charge*
select atoms based on their formal charge, e.g.
``name O and formalcharge -1`` to select all oxygens with a
negative 1 formal charge.

Pattern matching
----------------

Expand Down
34 changes: 32 additions & 2 deletions testsuite/MDAnalysisTests/coordinates/test_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@
PDB_cm, PDB_cm_gz, PDB_cm_bz2,
PDB_mc, PDB_mc_gz, PDB_mc_bz2,
PDB_CRYOEM_BOX, MMTF_NOCRYST,
PDB_HOLE, mol2_molecule)
PDB_HOLE, mol2_molecule, PDB_charges,)
from numpy.testing import (assert_equal,
assert_array_almost_equal,
assert_almost_equal)
Expand Down Expand Up @@ -1071,7 +1071,7 @@ def test_atomname_alignment(self, writtenstuff):

def test_atomtype_alignment(self, writtenstuff):
result_line = ("ATOM 1 H5T GUA X 1 7.974 6.430 9.561"
" 1.00 0.00 RNAA \n")
" 1.00 0.00 RNAA \n")
assert_equal(writtenstuff[9], result_line)


Expand Down Expand Up @@ -1273,3 +1273,33 @@ def test_cryst_meaningless_select():
u = mda.Universe(PDB_CRYOEM_BOX)
cur_sele = u.select_atoms('around 0.1 (resid 4 and name CA and segid A)')
assert cur_sele.n_atoms == 0


def test_charges_roundtrip(tmpdir):
"""
Roundtrip test for PDB formal charges reading/writing.
"""
u = mda.Universe(PDB_charges)

outfile = os.path.join(str(tmpdir), 'newcharges.pdb')
with mda.coordinates.PDB.PDBWriter(outfile) as writer:
writer.write(u.atoms)

u_written = mda.Universe(outfile)

assert_equal(u.atoms.formalcharges, u_written.atoms.formalcharges)


def test_charges_not_int():
# np.zeros yields a float64 dtype
arr = np.zeros(10)
with pytest.raises(ValueError, match="array should be of `int` type"):
mda.coordinates.PDB.PDBWriter._format_PDB_charges(arr)


@pytest.mark.parametrize('value', [99, -100])
def test_charges_limit(value):
# test for raising error when writing charges > 9
arr = np.array([0, 0, 0, value, 1, -1, 0], dtype=int)
with pytest.raises(ValueError, match="9 is not supported by PDB standard"):
mda.coordinates.PDB.PDBWriter._format_PDB_charges(arr)
15 changes: 15 additions & 0 deletions testsuite/MDAnalysisTests/core/test_atomselections.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
PDB_HOLE,
PDB_helix,
PDB_elements,
PDB_charges,
)
from MDAnalysisTests import make_Universe

Expand Down Expand Up @@ -1021,6 +1022,7 @@ def universe():
'mass :3.0',
'mass 1-',
'chirality ',
'formalcharge 0.2',
])
def test_selection_fail(self, selstr, universe):
with pytest.raises(SelectionError):
Expand Down Expand Up @@ -1457,3 +1459,16 @@ def test_chirality_selection(sel, size):
ag = u.select_atoms('chirality {}'.format(sel))

assert len(ag) == size


@pytest.mark.parametrize('sel,size,name', [
('1', 1, 'NH2'), ('-1', 1, 'OD2'), ('0', 34, 'N'), ('-1 1', 2, 'OD2'),
])
def test_formal_charge_selection(sel, size, name):
# 2 charge points, one positive one negative
u = mda.Universe(PDB_charges)

ag = u.select_atoms(f'formalcharge {sel}')

assert len(ag) == size
assert ag.atoms[0].name == name
39 changes: 39 additions & 0 deletions testsuite/MDAnalysisTests/data/charges.pdb
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
REMARK For testing reading of formal charges
REMARK This model represents an ideal PDB with valid elements and formal charges
ATOM 577 N ASP H 49 16.704 14.108 12.210 1.00 21.33 N
ATOM 578 CA ASP H 49 16.584 13.977 10.747 1.00 24.21 C
ATOM 579 C ASP H 49 17.636 13.051 10.097 1.00 24.56 C
ATOM 580 O ASP H 49 17.363 12.527 9.019 1.00 24.83 O
ATOM 581 CB ASP H 49 16.488 15.330 9.988 1.00 28.91 C
ATOM 582 CG ASP H 49 17.600 16.382 10.180 1.00 32.38 C
ATOM 583 OD1 ASP H 49 18.584 16.125 10.909 1.00 33.55 O
ATOM 584 OD2 ASP H 49 17.442 17.461 9.571 1.00 42.36 O1-
ATOM 585 H ASP H 49 16.933 15.029 12.562 1.00 21.33 H
ATOM 586 HA ASP H 49 15.631 13.477 10.577 1.00 24.21 H
ATOM 587 HB3 ASP H 49 15.557 15.811 10.288 1.00 28.91 H
ATOM 588 HB2 ASP H 49 16.375 15.140 8.919 1.00 28.91 H
ATOM 589 N ARG H 50 18.794 12.852 10.751 1.00 21.10 N
ATOM 590 CA ARG H 50 19.908 12.062 10.225 1.00 23.13 C
ATOM 591 C ARG H 50 20.342 10.898 11.121 1.00 20.79 C
ATOM 592 O ARG H 50 21.159 10.112 10.651 1.00 20.85 O
ATOM 593 CB ARG H 50 21.113 12.980 9.947 1.00 31.78 C
ATOM 594 CG ARG H 50 20.844 13.983 8.813 1.00 43.76 C
ATOM 595 CD ARG H 50 22.110 14.635 8.244 1.00 55.64 C
ATOM 596 NE ARG H 50 22.911 13.672 7.465 1.00 64.62 N
ATOM 597 CZ ARG H 50 23.874 13.946 6.570 1.00 68.53 C
ATOM 598 NH1 ARG H 50 24.232 15.199 6.258 1.00 81.88 N
ATOM 599 NH2 ARG H 50 24.498 12.931 5.973 1.00 78.51 N1+
ATOM 600 H ARG H 50 18.947 13.322 11.633 1.00 21.10 H
ATOM 601 HA ARG H 50 19.624 11.591 9.283 1.00 23.13 H
ATOM 602 HB3 ARG H 50 21.973 12.367 9.676 1.00 31.78 H
ATOM 603 HB2 ARG H 50 21.401 13.512 10.855 1.00 31.78 H
ATOM 604 HG3 ARG H 50 20.104 14.736 9.076 1.00 43.76 H
ATOM 605 HG2 ARG H 50 20.391 13.405 8.008 1.00 43.76 H
ATOM 606 HD3 ARG H 50 22.756 14.920 9.075 1.00 55.64 H
ATOM 607 HD2 ARG H 50 21.871 15.552 7.704 1.00 55.64 H
ATOM 608 HE ARG H 50 22.696 12.699 7.632 1.00 64.62 H
ATOM 609 HH12 ARG H 50 24.953 15.372 5.572 1.00 81.88 H
ATOM 610 HH11 ARG H 50 23.769 15.979 6.701 1.00 81.88 H
ATOM 611 HH22 ARG H 50 25.238 13.086 5.302 1.00 78.51 H
ATOM 612 HH21 ARG H 50 24.240 11.975 6.186 1.00 78.51 H
END
4 changes: 2 additions & 2 deletions testsuite/MDAnalysisTests/data/elements.pdb
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@ HETATM 20 Mg Mg A 4 03.000 03.000 03.000 1.00 00.00 Mg
HETATM 21 Ca Ca A 5 00.000 00.000 03.000 1.00 00.00 CA
TER 22
HETATM 1609 S DMS A 101 19.762 39.489 18.350 1.00 25.99 S
HETATM 1610 O DMS A 101 19.279 37.904 18.777 1.00 23.69 Ox
HETATM 1610 O DMS A 101 19.279 37.904 18.777 1.00 23.69 O
HETATM 1611 C1 DMS A 101 21.344 39.260 17.532 1.00 24.07 C
HETATM 1612 C2 DMS A 101 18.750 40.066 17.029 1.00 20.50 C
HETATM 1613 S DMS A 102 22.157 39.211 12.217 1.00 27.26 S1
HETATM 1613 S DMS A 102 22.157 39.211 12.217 1.00 27.26 S
HETATM 1614 O DMS A 102 20.622 38.811 11.702 1.00 25.51 O
HETATM 1615 C1 DMS A 102 22.715 40.764 11.522 1.00 26.00 C
HETATM 1616 C2 DMS A 102 22.343 39.515 13.971 1.00 25.46 C
Expand Down
4 changes: 2 additions & 2 deletions testsuite/MDAnalysisTests/datafiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@
"SDF_molecule", # MDL SDFile for rdkit
"PDBX", # PDBxfile
"PDB_elements", # PDB file with elements
"PDB_elements", # PDB file with elements
"PDB_charges", # PDB file with formal charges
"SURFACE_PDB", # 111 FCC lattice topology for NSGrid bug #2345
"SURFACE_TRR", # full precision coordinates for NSGrid bug #2345
]
Expand Down Expand Up @@ -582,7 +582,7 @@
SDF_molecule = resource_filename(__name__, 'data/molecule.sdf')

PDB_elements = resource_filename(__name__, 'data/elements.pdb')

PDB_charges = resource_filename(__name__, 'data/charges.pdb')

PDBX = resource_filename(__name__, "data/4x8u.pdbx")

Expand Down
Loading