Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce Capped distances to evaluate the indices of pairs within a fixed radius #1941

Merged
merged 21 commits into from
Jul 12, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,14 @@ The rules for this file:

------------------------------------------------------------------------------
??/??/18 tylerjereddy, richardjgowers, palnabarun, orbeckst, kain88-de, zemanj,
VOD555, davidercruz, jbarnoud
VOD555, davidercruz, jbarnoud, ayushsuhane

* 0.18.1

Enhancements

* Added lib.distances.capped_distance function to quickly calculate all distances
up to a given maximum distance (PR #1941)
* Added a rotation coordinate transformation (PR #1937)
* Added a box centering trajectory transformation (PR #1946)
* Added a on-the-fly trajectory transformations API and a coordinate translation
Expand Down
210 changes: 210 additions & 0 deletions package/MDAnalysis/lib/distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
from .mdamath import triclinic_vectors, triclinic_box



# hack to select backend with backend=<backend> kwarg. Note that
# the cython parallel code (prange) in parallel.distances is
# independent from the OpenMP code
Expand Down Expand Up @@ -396,6 +397,215 @@ def self_distance_array(reference, box=None, result=None, backend="serial"):
return distances


def capped_distance(reference, configuration, max_cutoff, min_cutoff=None, box=None, method=None):
"""Calculates the pairs and distance within a specified distance

If a *box* is supplied, then a minimum image convention is used
to evaluate the distances.

An automatic guessing of optimized method to calculate the distances is
included in the function. An optional keyword for the method is also
provided. Users can override the method with this functionality.
Currently pkdtree and bruteforce are implemented.


Parameters
-----------
reference : array
reference coordinates array with shape ``reference.shape = (3,)``
or ``reference.shape = (len(reference), 3)``
configuration : array
Configuration coordinate array with shape ``reference.shape = (3,)``
or ``reference.shape = (len(reference), 3)``
max_cutoff : float
Maximum cutoff distance between the reference and configuration
min_cutoff : (optional) float
Minimum cutoff distance between reference and configuration [None]
box : (optional) array or None
The dimensions, if provided, must be provided in the same
The unitcell dimesions for this system format as returned
by :attr:`MDAnalysis.coordinates.base.Timestep.dimensions`:
``[lx,ly, lz, alpha, beta, gamma]``. Minimum image convention
is applied if the box is provided [None]
method : (optional) 'bruteforce' or 'pkdtree' or 'None'
Keyword to override the automatic guessing of method built-in
in the function [None]

Returns
-------
pairs : array
Pair of indices, one from each reference and configuration such that
distance between them is within the ``max_cutoff`` and ``min_cutoff``
pairs[i,j] contains the indices i from reference coordinates, and
j from configuration
distances : array
Distances corresponding to each pair of indices.
d[k] corresponding to the pairs[i,j] gives the distance between
i-th and j-th coordinate in reference and configuration respectively

.. code-block:: python

pairs, distances = capped_distances(reference, coordinates, max_cutoff)
for indx, [a,b] in enumerate(pairs):
coord1 = reference[a]
coord2 = configuration[b]
distance = distances[indx]

Note
-----
Currently only supports brute force and Periodic KDtree

.. SeeAlso:: :func:'MDAnalysis.lib.distances.distance_array'
.. SeeAlso:: :func:'MDAnalysis.lib.pkdtree.PeriodicKDTree'

"""

if box is not None:
boxtype = _box_check(box)
# Convert [A,B,C,alpha,beta,gamma] to [[A],[B],[C]]
if (boxtype == 'tri_box'):
box = triclinic_vectors(box)
if (boxtype == 'tri_vecs_bad'):
box = triclinic_vectors(triclinic_box(box[0], box[1], box[2]))
method = _determine_method(reference, configuration,
max_cutoff, min_cutoff=min_cutoff,
box=box, method=method)
pairs, dist = method(reference, configuration, max_cutoff,
min_cutoff=min_cutoff, box=box)

return np.asarray(pairs), np.asarray(dist)


def _determine_method(reference, configuration, max_cutoff, min_cutoff=None, box=None, method=None):
"""
Switch between different methods based on the the optimized time.
All the rules to select the method based on the input can be
incorporated here.

Returns
-------
Function object based on the rules and specified method
Currently implemented methods are
bruteforce : returns ``_bruteforce_capped``
PKDtree : return ``_pkdtree_capped`

"""
methods = {'bruteforce': _bruteforce_capped,
'pkdtree': _pkdtree_capped}

if method is not None:
return methods[method]

if len(reference) > 5000 and len(configuration) > 5000:
if box is None and reference.shape[0] != 3 and configuration.shape[0] != 3:
min_dim = np.array([reference.min(axis=0),
configuration.min(axis=0)])
max_dim = np.array([reference.max(axis=0),
configuration.max(axis=0)])
size = max_dim.max(axis=0) - min_dim.min(axis=0)
elif box is not None:
if box.shape[0] == 6:
size = box[:3]
else:
size = box.max(axis=0) - box.min(axis=0)

if (np.any(size < 10.0*max_cutoff) and
len(reference) > 100000 and
len(configuration) > 100000):
return methods['bruteforce']
else:
return methods['pkdtree']
return methods['bruteforce']


def _bruteforce_capped(reference, configuration, max_cutoff, min_cutoff=None, box=None):
"""
Using naive distance calulations, returns a list
containing the indices with one from each
reference and configuration arrays, such that the distance between
them is less than the specified cutoff distance
"""
pairs, distance = [], []

reference = np.asarray(reference, dtype=np.float32)
configuration = np.asarray(configuration, dtype=np.float32)

if reference.shape == (3, ):
reference = reference[None, :]
if configuration.shape == (3, ):
configuration = configuration[None, :]

_check_array(reference, 'reference')
_check_array(configuration, 'configuration')

for i, coords in enumerate(reference):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this way of iterating over one coordinate set is probably slower than distance_array(ref, conf). I think guess_bonds does it this way to avoid memory problems

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes exactly. The distance_array cannot go beyond 5k particles. So unless we are explicitly limiting the brute force algorithm to 5k, shouldn't the more robust way be a part of the calculation. I agree once we put in the rules, the iteration over each particle would be about half as fast as the distance_array and should be replaced.

dist = distance_array(coords[None, :], configuration, box=box)[0]
if min_cutoff is not None:
idx = np.where((dist <= max_cutoff) & (dist > min_cutoff))[0]
else:
idx = np.where((dist <= max_cutoff))[0]
for j in idx:
pairs.append((i, j))
distance.append(dist[j])
return pairs, distance


def _pkdtree_capped(reference, configuration, max_cutoff, min_cutoff=None, box=None):
""" Capped Distance evaluations using KDtree.

Uses minimum image convention if *box* is specified

Returns:
--------
pairs : list
List of atom indices which are within the specified cutoff distance.
pairs `(i, j)` corresponds to i-th particle in reference and
j-th particle in configuration
distance : list
Distance between two atoms corresponding to the (i, j) indices
in pairs.

"""
from .pkdtree import PeriodicKDTree
from Bio.KDTree import KDTree

pairs, distances = [], []

reference = np.asarray(reference, dtype=np.float32)
configuration = np.asarray(configuration, dtype=np.float32)

if reference.shape == (3, ):
reference = reference[None, :]
if configuration.shape == (3, ):
configuration = configuration[None, :]

_check_array(reference, 'reference')
_check_array(configuration, 'configuration')

# Build The KDTree
if box is not None:
kdtree = PeriodicKDTree(box, bucket_size=10)
else:
kdtree = KDTree(dim=3, bucket_size=10)

kdtree.set_coords(configuration)
# Search for every query point
for idx, centers in enumerate(reference):
kdtree.search(centers, max_cutoff)
indices = kdtree.get_indices()
dist = distance_array(centers.reshape((1, 3)),
configuration[indices], box=box)[0]
if min_cutoff is not None:
mask = np.where(dist > min_cutoff)[0]
dist = dist[mask]
indices = [indices[mask[i]] for i in range(len(mask))]
if len(indices) != 0:
for num, j in enumerate(indices):
pairs.append((idx, j))
distances.append(dist[num])
return pairs, distances


def transform_RtoS(inputcoords, box, backend="serial"):
"""Transform an array of coordinates from real space to S space (aka lambda space)

Expand Down
97 changes: 97 additions & 0 deletions testsuite/MDAnalysisTests/lib/test_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,12 @@
import numpy as np
from numpy.testing import assert_equal, assert_almost_equal

import itertools

import MDAnalysis as mda

from MDAnalysis.lib.mdamath import triclinic_vectors, triclinic_box

@pytest.mark.parametrize('coord_dtype', (np.float32, np.float64))
def test_transform_StoR_pass(coord_dtype):
box = np.array([10, 7, 3, 45, 60, 90], dtype=np.float32)
Expand All @@ -39,6 +43,99 @@ def test_transform_StoR_pass(coord_dtype):
assert_equal(original_r, test_r)


def test_capped_distance_noresults():
point1 = np.array([0.1, 0.1, 0.1], dtype=np.float32)
point2 = np.array([0.95, 0.1, 0.1], dtype=np.float32)

pairs, distances = mda.lib.distances.capped_distance(point1,
point2,
max_cutoff=0.2)

assert_equal(len(pairs), 0)


npoints_1 = (1, 100)

boxes_1 = (np.array([1, 2, 3, 90, 90, 90], dtype=np.float32), # ortho
np.array([1, 2, 3, 30, 45, 60], dtype=np.float32), # tri_box
triclinic_vectors( # tri_vecs
np.array([1, 2, 3, 90, 90, 45], dtype=np.float32)),
np.array([[0.5, 0.9, 1.9], # tri_vecs_bad
[2.0, 0.4, 0.1],
[0.0, 0.6, 0.5]], dtype=np.float32),
None, # Non Periodic
)


query_1 = (np.array([0.1, 0.1, 0.1], dtype=np.float32),
np.array([[0.1, 0.1, 0.1],
[0.2, 0.1, 0.1]], dtype=np.float32))

method_1 = ('bruteforce', 'pkdtree')

min_cutoff_1 = (None, 0.1)


@pytest.mark.parametrize('npoints', npoints_1)
@pytest.mark.parametrize('box', boxes_1)
@pytest.mark.parametrize('query', query_1)
@pytest.mark.parametrize('method', method_1)
@pytest.mark.parametrize('min_cutoff', min_cutoff_1)
def test_capped_distance_checkbrute(npoints, box, query, method, min_cutoff):
np.random.seed(90003)
points = (np.random.uniform(low=0, high=1.0,
size=(npoints, 3))*(boxes_1[0][:3])).astype(np.float32)
max_cutoff = 0.3
# capped distance should be able to handle array of vectors
# as well as single vectors.
pairs, dist = mda.lib.distances.capped_distance(query,
points,
max_cutoff,
min_cutoff=min_cutoff,
box=box,
method=method)
if pairs.shape != (0, ):
found_pairs = pairs[:, 1]
else:
found_pairs = list()

if(query.shape[0] == 3):
query = query.reshape((1, 3))

distances = mda.lib.distances.distance_array(query,
points, box=box)

if min_cutoff is None:
min_cutoff = 0.
indices = np.where((distances < max_cutoff) & (distances > min_cutoff))

assert_equal(np.sort(found_pairs, axis=0), np.sort(indices[1], axis=0))


@pytest.mark.parametrize('npoints,cutoff,meth',
[(1, 0.02, '_bruteforce_capped'),
(1, 0.2, '_bruteforce_capped'),
(6000, 0.02, '_pkdtree_capped'),
(6000, 0.2, '_pkdtree_capped'),
(200000, 0.02, '_pkdtree_capped'),
(200000, 0.2, '_bruteforce_capped')])
def test_method_selection(npoints, cutoff, meth):
np.random.seed(90003)
box = np.array([1, 1, 1, 90, 90, 90], dtype=np.float32)
points = (np.random.uniform(low=0, high=1.0,
size=(npoints, 3)) * (box[:3])).astype(np.float32)
if box is not None:
boxtype = mda.lib.distances._box_check(box)
# Convert [A,B,C,alpha,beta,gamma] to [[A],[B],[C]]
if (boxtype == 'tri_box'):
box = triclinic_vectors(box)
if (boxtype == 'tri_vecs_bad'):
box = triclinic_vectors(triclinic_box(box[0], box[1], box[2]))
method = mda.lib.distances._determine_method(points, points,
cutoff, box=box)
assert_equal(method.__name__, meth)


# different boxlengths to shift a coordinate
shifts = [
((0, 0, 0), (0, 0, 0), (0, 0, 0), (0, 0, 0)), # no shifting
Expand Down