Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Optional brute force approach for RDF calculation #315

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 46 additions & 23 deletions src/biotite/structure/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@


def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None,
periodic=False):
periodic=False, use_cell_list=None):
r"""
Compute the radial distribution function *g(r)* (RDF) for one or
multiple given central positions based on a given system of
Expand Down Expand Up @@ -72,6 +72,14 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None,
*(m,3,3)* if atoms is an :class:`AtomArrayStack`, respectively.
periodic : bool, optional
Defines if periodic boundary conditions are taken into account.
use_cell_list : bool, optional
Whether to use a :class:`CellList` for RDF calculation or not.
A :class:`CellList` requires constant time to obtain proximate
atoms for each center, while otherwise linear time is required.
However, the creation of a :class:`CellList` has a significant
overhead if there are only a few considered centers.
By default a :class:`CellList` is not used if `center` is
smaller than 100 elements.

Returns
-------
Expand Down Expand Up @@ -189,28 +197,43 @@ def rdf(center, atoms, selection=None, interval=(0, 10), bins=100, box=None,
# Make histogram of quared distances to save computation time
# of sqrt calculation
sq_edges = edges**2
threshold_dist = edges[-1]
cell_size = threshold_dist
disp = []
for i in range(atoms.stack_depth()):
# Use cell list to efficiently preselect atoms that are in range
# of the desired bin range
cell_list = CellList(atom_coord[i], cell_size, periodic, box[i])
# 'cell_radius=1' is used in 'get_atoms_in_cells()'
# This is enough to find all atoms that are in the given
# interval (and more), since the size of each cell is as large
# as the last edge of the bins
near_atom_mask = cell_list.get_atoms_in_cells(center[i], as_mask=True)
# Calculate distances of each center to preselected atoms
# for each center
for j in range(center.shape[1]):
dist_box = box[i] if periodic else None
# Calculate squared distances
disp.append(displacement(
center[i,j], atom_coord[i, near_atom_mask[j]], box=dist_box
))
# Make one array from multiple arrays with different length
disp = np.concatenate(disp)

if use_cell_list is None:
if len(center) < 100:
use_cell_list = True
else:
use_cell_list = False

if use_cell_list:
threshold_dist = edges[-1]
cell_size = threshold_dist
disp = []
for i in range(atoms.stack_depth()):
# Use cell list to efficiently preselect atoms that are in range
# of the desired bin range
cell_list = CellList(atom_coord[i], cell_size, periodic, box[i])
# 'cell_radius=1' is used in 'get_atoms_in_cells()'
# This is enough to find all atoms that are in the given
# interval (and more), since the size of each cell is as large
# as the last edge of the bins
near_atom_mask = cell_list.get_atoms_in_cells(center[i], as_mask=True)
# Calculate distances of each center to preselected atoms
# for each center
for j in range(center.shape[1]):
dist_box = box[i] if periodic else None
# Calculate squared distances
disp.append(displacement(
center[i,j], atom_coord[i, near_atom_mask[j]], box=dist_box
))
# Make one array from multiple arrays with different length
disp = np.concatenate(disp)
else:
dist_box = box if periodic else None
disp = np.concatenate([
displacement(center[:, i ,np.newaxis], atom_coord, box=dist_box)
for i in range(center.shape[1])
], axis=1)

sq_distances = vector_dot(disp, disp)
hist, _ = np.histogram(sq_distances, bins=sq_edges)

Expand Down
106 changes: 69 additions & 37 deletions tests/structure/test_rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
cannot_import("mdtraj"),
reason="MDTraj is not installed"
)
def test_rdf():
@pytest.mark.parametrize("use_cell_list", [None, False, True])
def test_rdf(use_cell_list):
""" General test to reproduce oxygen RDF for a box of water"""
test_file = TEST_FILE
stack = load_structure(test_file)
Expand All @@ -25,7 +26,7 @@ def test_rdf():
interval = np.array([0, 10])
n_bins = 100
bins, g_r = rdf(oxygen[:, 0].coord, oxygen, interval=interval,
bins=n_bins, periodic=False)
bins=n_bins, periodic=False, use_cell_list=use_cell_list)

# Compare with MDTraj
import mdtraj
Expand All @@ -36,23 +37,28 @@ def test_rdf():
r_range=interval/10, n_bins=n_bins,
periodic=False)

assert np.allclose(bins, mdt_bins*10)
assert np.allclose(g_r, mdt_g_r, rtol=0.0001)
assert bins.tolist() == pytest.approx((mdt_bins*10).tolist())
assert g_r.tolist() == pytest.approx(mdt_g_r.tolist(), rel=1e-4)


def test_rdf_bins():
@pytest.mark.parametrize("use_cell_list", [None, False, True])
def test_rdf_bins(use_cell_list):
""" Test if RDF produce correct bin ranges """
stack = load_structure(TEST_FILE)
center = stack[:, 0]
num_bins = 44
bin_range = (0, 11.7)
bins, g_r = rdf(center, stack, bins=num_bins, interval=bin_range)
bins, g_r = rdf(
center, stack, bins=num_bins, interval=bin_range,
use_cell_list=use_cell_list
)
assert(len(bins) == num_bins)
assert(bins[0] > bin_range[0])
assert(bins[1] < bin_range[1])


def test_rdf_with_selection():
@pytest.mark.parametrize("use_cell_list", [None, False, True])
def test_rdf_with_selection(use_cell_list):
""" Test if the selection argument of rdf function works as expected """
stack = load_structure(TEST_FILE)

Expand All @@ -61,17 +67,24 @@ def test_rdf_with_selection():
interval = np.array([0, 10])
n_bins = 100
sele = (stack.atom_name == 'OW') & (stack.res_id >= 3)
bins, g_r = rdf(oxygen[:, 0].coord, stack, selection=sele,
interval=interval, bins=n_bins, periodic=False)

nosel_bins, nosel_g_r = rdf(oxygen[:, 0].coord, oxygen[:, 1:],
interval=interval, bins=n_bins, periodic=False)
bins, g_r = rdf(
oxygen[:, 0].coord, stack, selection=sele,
interval=interval, bins=n_bins, periodic=False,
use_cell_list=use_cell_list
)

nosel_bins, nosel_g_r = rdf(
oxygen[:, 0].coord, oxygen[:, 1:],
interval=interval, bins=n_bins, periodic=False,
use_cell_list=use_cell_list
)

assert np.allclose(bins, nosel_bins)
assert np.allclose(g_r, nosel_g_r)


def test_rdf_atom_argument():
@pytest.mark.parametrize("use_cell_list", [None, False, True])
def test_rdf_atom_argument(use_cell_list):
""" Test if the first argument allows to use AtomArrayStack """
stack = load_structure(TEST_FILE)

Expand All @@ -80,16 +93,22 @@ def test_rdf_atom_argument():
interval = np.array([0, 10])
n_bins = 100

bins, g_r = rdf(oxygen[:, 0], stack, interval=interval,
bins=n_bins, periodic=False)
bins, g_r = rdf(
oxygen[:, 0], stack, interval=interval,
bins=n_bins, periodic=False, use_cell_list=use_cell_list
)

atom_bins, atoms_g_r = rdf(oxygen[:, 0].coord, stack, interval=interval,
bins=n_bins, periodic=False)
atom_bins, atoms_g_r = rdf(
oxygen[:, 0].coord, stack, interval=interval,
bins=n_bins, periodic=False,
use_cell_list=use_cell_list
)

assert np.allclose(g_r, atoms_g_r)


def test_rdf_multiple_center():
@pytest.mark.parametrize("use_cell_list", [None, False, True])
def test_rdf_multiple_center(use_cell_list):
""" Test if the first argument allows to use multiple centers"""
stack = load_structure(TEST_FILE)

Expand All @@ -99,15 +118,21 @@ def test_rdf_multiple_center():
n_bins = 100

# averaging individual calculations
bins1, g_r1 = rdf(oxygen[:, 1].coord, oxygen[:, 2:], interval=interval,
bins=n_bins, periodic=False)
bins2, g_r2 = rdf(oxygen[:, 0].coord, oxygen[:, 2:], interval=interval,
bins=n_bins, periodic=False)
bins1, g_r1 = rdf(
oxygen[:, 1].coord, oxygen[:, 2:], interval=interval,
bins=n_bins, periodic=False, use_cell_list=use_cell_list
)
bins2, g_r2 = rdf(
oxygen[:, 0].coord, oxygen[:, 2:], interval=interval,
bins=n_bins, periodic=False, use_cell_list=use_cell_list
)
mean = np.mean([g_r1, g_r2], axis=0)

# this should give the same result as averaging for oxygen 0 and 1
bins, g_r = rdf(oxygen[:, 0:2].coord, oxygen[:, 2:], interval=interval,
bins=n_bins, periodic=False)
bins, g_r = rdf(
oxygen[:, 0:2].coord, oxygen[:, 2:], interval=interval,
bins=n_bins, periodic=False, use_cell_list=use_cell_list
)

assert np.allclose(g_r, mean, rtol=0.0001)

Expand All @@ -116,7 +141,8 @@ def test_rdf_multiple_center():
cannot_import("mdtraj"),
reason="MDTraj is not installed"
)
def test_rdf_periodic():
@pytest.mark.parametrize("use_cell_list", [None, False, True])
def test_rdf_periodic(use_cell_list):
""" Test if the periodic argument gives the correct results"""
test_file = TEST_FILE
stack = load_structure(test_file)
Expand All @@ -125,8 +151,10 @@ def test_rdf_periodic():
oxygen = stack[:, stack.atom_name == 'OW']
interval = np.array([0, 10])
n_bins = 100
bins, g_r = rdf(oxygen[:, 0].coord, oxygen[:, 1:], interval=interval,
bins=n_bins, periodic=True)
bins, g_r = rdf(
oxygen[:, 0].coord, oxygen[:, 1:], interval=interval,
bins=n_bins, periodic=True, use_cell_list=use_cell_list
)

# Compare with MDTraj
import mdtraj
Expand All @@ -137,33 +165,35 @@ def test_rdf_periodic():
r_range=interval/10, n_bins=n_bins,
periodic=True)

assert np.allclose(bins, mdt_bins*10)
assert np.allclose(g_r, mdt_g_r, rtol=0.0001)
assert bins.tolist() == pytest.approx((mdt_bins*10).tolist())
assert g_r.tolist() == pytest.approx(mdt_g_r.tolist(), rel=1e-4)


def test_rdf_box():
@pytest.mark.parametrize("use_cell_list", [None, False, True])
def test_rdf_box(use_cell_list):
""" Test correct use of simulation boxes """
stack = load_structure(TEST_FILE)
box = vectors_from_unitcell(1, 1, 1, 90, 90, 90)
box = np.repeat(box[np.newaxis, :, :], len(stack), axis=0)

# Use box attribute of stack
rdf(stack[:, 0], stack)
rdf(stack[:, 0], stack, use_cell_list=use_cell_list)

# Use box attribute and dont fail because stack has no box
stack.box = None
rdf(stack[:, 0], stack, box=box)
rdf(stack[:, 0], stack, box=box, use_cell_list=use_cell_list)

# Fail if no box is present
with pytest.raises(ValueError):
rdf(stack[:, 0], stack)
rdf(stack[:, 0], stack, use_cell_list=use_cell_list)

# Fail if box is of wrong size
with pytest.raises(ValueError):
rdf(stack[:, 0], stack, box=box[0])
rdf(stack[:, 0], stack, box=box[0], use_cell_list=use_cell_list)


def test_rdf_normalized():
@pytest.mark.parametrize("use_cell_list", [None, False, True])
def test_rdf_normalized(use_cell_list):
""" Assert that the RDF tail is normalized to 1"""
test_file = TEST_FILE
stack = load_structure(test_file)
Expand All @@ -173,7 +203,9 @@ def test_rdf_normalized():
interval = np.array([0, 5])
n_bins = 100

bins, g_r = rdf(oxygen.coord, oxygen, interval=interval,
bins=n_bins, periodic=True)
bins, g_r = rdf(
oxygen.coord, oxygen, interval=interval,
bins=n_bins, periodic=True, use_cell_list=use_cell_list
)
assert np.allclose(g_r[-10:], np.ones(10), atol=0.1)