Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Find pdist with known shape #71

Merged
merged 4 commits into from
Oct 7, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions dask_distance/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,9 @@ def pdist(X, metric="euclidean", **kwargs):

result = cdist(X, X, metric, **kwargs)

result = dask.array.triu(result, 1)

indices = _compat._indices(result.shape, chunks=result.chunks)
mask = (indices[1] > indices[0])

result = _compat._ravel(result)[_compat._ravel(mask)]
result = dask.array.concatenate([
result[i, i + 1:] for i in range(0, len(result) - 1)
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I missed using `irange` here; this is being fixed in PR #80.

])
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am a little concerned about the performance of this for large numbers of points, because it makes the graph balloon with `getitem` entries. It would be good to cut this down somehow, but it is not obvious to me how to do so without reusing the old masking strategy.

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That said, it seems to do ok given reasonable chunk sizes when playing around with it locally. So perhaps this is not worth worrying about until use cases that have issues present themselves.


return result

Expand Down
76 changes: 0 additions & 76 deletions dask_distance/_compat.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# -*- coding: utf-8 -*-


import itertools

import numpy

import dask
Expand All @@ -28,77 +26,3 @@ def _asarray(a):
a = dask.array.from_array(a, a.shape)

return a


def _indices(dimensions, dtype=int, chunks=None):
"""
Implements NumPy's ``indices`` for Dask Arrays.
Generates a grid of indices covering the dimensions provided.
The final array has the shape ``(len(dimensions), *dimensions)``. The
chunks are used to specify the chunking for axis 1 up to
``len(dimensions)``. The 0th axis always has chunks of length 1.

Parameters
----------
dimensions : sequence of ints
The shape of the index grid.
dtype : dtype, optional
Type to use for the array. Default is ``int``.
chunks : sequence of ints
The number of samples on each block. Note that the last block will
have fewer samples if ``len(array) % chunks != 0``.

Returns
-------
grid : dask array

Notes
-----
Borrowed from my Dask Array contribution.
"""
if chunks is None:
raise ValueError("Must supply a chunks= keyword argument")

dimensions = tuple(dimensions)
dtype = numpy.dtype(dtype)
chunks = tuple(chunks)

if len(dimensions) != len(chunks):
raise ValueError("Need one more chunk than dimensions.")

grid = []
if numpy.prod(dimensions):
for i in range(len(dimensions)):
s = len(dimensions) * [None]
s[i] = slice(None)
s = tuple(s)

r = dask.array.arange(dimensions[i], dtype=dtype, chunks=chunks[i])
r = r[s]

for j in itertools.chain(range(i), range(i + 1, len(dimensions))):
r = r.repeat(dimensions[j], axis=j)

grid.append(r)

if grid:
grid = dask.array.stack(grid)
else:
grid = dask.array.empty(
(len(dimensions),) + dimensions, dtype=dtype, chunks=(1,) + chunks
)

return grid


def _ravel(a):
    """
    Coerce ``a`` with ``_asarray`` and return the result of its ``ravel()``.

    If ``ravel()`` raises ``ValueError``, the array is first rechunked so
    that only the first axis keeps its chunking (every other axis becomes a
    single chunk spanning its full length) and ``ravel()`` is retried.
    """
    arr = _asarray(a)

    try:
        return arr.ravel()
    except ValueError:
        # Fallback for Dask pre-0.14.1.
        first_axis_chunked = arr.rechunk(arr.chunks[:1] + arr.shape[1:])
        return first_axis_chunked.ravel()
82 changes: 0 additions & 82 deletions tests/test__compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,85 +32,3 @@ def test_asarray(x):
x = np.asarray(x)

dau.assert_eq(d, x)


def test_indices_no_chunks():
    """``_indices`` raises ``ValueError`` when ``chunks`` is not given."""
    make_indices = dask_distance._compat._indices

    with pytest.raises(ValueError):
        make_indices((1,))


def test_indices_wrong_chunks():
    """``_indices`` raises ``ValueError`` when ``chunks`` has the wrong length."""
    make_indices = dask_distance._compat._indices
    no_chunks = tuple()

    with pytest.raises(ValueError):
        make_indices((1,), chunks=no_chunks)


@pytest.mark.parametrize(
    "dimensions, dtype, chunks",
    [
        (tuple(), int, tuple()),
        (tuple(), float, tuple()),
        ((0,), float, (1,)),
        ((0, 1, 2), float, (1, 1, 2)),
    ]
)
def test_empty_indicies(dimensions, dtype, chunks):
    """Compare ``_indices`` with ``np.indices`` for the parametrized cases.

    Shape and dtype are always checked. If comparing values raises
    ``IndexError`` for a non-empty ``dimensions`` while ``old_dask`` is
    set, the test is skipped instead of failing.
    """
    expected = np.indices(dimensions, dtype)
    actual = dask_distance._compat._indices(dimensions, dtype, chunks=chunks)

    assert actual.shape == expected.shape
    assert actual.dtype == expected.dtype

    try:
        dau.assert_eq(actual, expected)
    except IndexError:
        if not (len(dimensions) and old_dask):
            raise
        pytest.skip(
            "Dask pre-0.14.0 is unable to compute this empty array."
        )


def test_indicies():
    """Check ``_indices`` against ``np.indices`` on small non-empty grids."""
    # Each case: (dimensions, extra positional args such as dtype, chunks).
    cases = [
        ((1,), (), (1,)),
        ((1,), (float,), (1,)),
        ((2, 1), (), (2, 1)),
        ((2, 3), (), (1, 2)),
    ]

    for dims, extra, chunking in cases:
        actual = dask_distance._compat._indices(dims, *extra, chunks=chunking)
        expected = np.indices(dims, *extra)
        dau.assert_eq(actual, expected)


@pytest.mark.parametrize(
    "shape, dtype, chunks",
    [
        ((10, 11, 12), int, (3, 5, 5)),
        ((10, 11, 12), float, (3, 5, 5)),
        ((10, 11, 12), float, (3, 2, 2)),
        ((20, 17, 31), float, (6, 5, 10)),
    ]
)
@pytest.mark.parametrize(
    "seed",
    [
        153,
    ]
)
def test_ravel(shape, dtype, chunks, seed):
    """Check ``_ravel`` on a chunked Dask Array against ``np.ravel``."""
    # Seed the global RNG so the generated input is reproducible.
    # (``np.random.random(seed)`` would only draw and discard ``seed``
    # samples without seeding anything.)
    np.random.seed(seed)

    a = np.random.randint(0, 10, shape).astype(dtype)
    d = da.from_array(a, chunks=chunks)

    r_a = np.ravel(a)
    r_d = dask_distance._compat._ravel(d)

    dau.assert_eq(r_d, r_a)
2 changes: 2 additions & 0 deletions tests/test_dask_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ def test_2d_pdist(metric, kw, seed, u_shape, u_chunks):
a_r = spdist.pdist(a_u, metric, **kw)
d_r = dask_distance.pdist(d_u, metric, **kw)

assert d_r.shape == a_r.shape
assert np.allclose(np.array(d_r)[()], a_r, equal_nan=True)


Expand Down Expand Up @@ -362,4 +363,5 @@ def test_2d_bool_pdist(metric, seed, u_shape, u_chunks):
a_r = spdist.pdist(a_u, metric)
d_r = dask_distance.pdist(d_u, metric)

assert d_r.shape == a_r.shape
assert np.allclose(np.array(d_r)[()], a_r, equal_nan=True)