
Commit

Small improvements.
pp-mo committed Jul 25, 2019
1 parent 019f8f4 commit ec0bc55
Showing 2 changed files with 35 additions and 31 deletions.
37 changes: 14 additions & 23 deletions lib/iris/_lazy_data.py
@@ -60,19 +60,6 @@ def is_lazy_data(data):
     return result
 
 
-# A magic value, chosen to minimise chunk creation time and chunk processing
-# time within dask.
-_MAX_CHUNK_SIZE = 8 * 1024 * 1024 * 2
-
-# _MAX_CHUNK_SIZE = dask.config.get('array', {}).get('chunk-size', None)
-# if _MAX_CHUNK_SIZE is not None:
-#     # Convert to bytes
-#     _MAX_CHUNK_SIZE = da.core.parse_bytes(_MAX_CHUNK_SIZE)
-# else:
-#     # Fall back on our own "magic" value.
-#     _MAX_CHUNK_SIZE = 8 * 1024 * 1024 * 2
-
-
 def _optimise_chunksize(chunks, shape,
                         limit=None,
                         dtype=np.dtype('f4')):
@@ -88,7 +75,7 @@ def _optimise_chunksize(chunks, shape,
     * shape (tuple of int):
         The full array shape of the target data.
     * limit (int):
-        The 'ideal' target chunk size, in bytes.
+        The 'ideal' target chunk size, in bytes.  Default from dask.config.
     * dtype (np.dtype):
         Numpy dtype of target data.
@@ -97,18 +84,18 @@
         The proposed shape of one full chunk.
 
     .. note::
-        The use of this is very similar to `dask.array.core.normalize_chunks`,
-        when called as
+        The purpose of this is very similar to
+        `dask.array.core.normalize_chunks`, when called as
             `(chunks='auto', shape, dtype=dtype, previous_chunks=chunks, ...)`.
-        Except : that the logic here is optimised for a specific to a 'c-like'
-        dimension order, i.e. outer dimensions first, as in netcdf variables.
+        Except, the operation here is optimised specifically for a 'c-like'
+        dimension order, i.e. outer dimensions first, as for netcdf variables.
         So if, in future, this policy can be implemented in dask, then we
         would prefer to replace this function with a call to that one.
 
         Accordingly, the arguments roughly match 'normalize_chunks', except
         that we don't support the alternative argument forms of that routine.
-        This routine also returns a single 'full chunk', rather
-        than a complete chunking scheme : so equivalent code usage would be
-        "chunks = [c[0] for c in normalise_chunks(chunks, ...)]".
+        The return value, however, is a single 'full chunk', rather than a
+        complete chunking scheme : so an equivalent code usage could be
+        "chunks = [c[0] for c in normalise_chunks('auto', ...)]".
 
     """
         # Return chunks unchanged, for types of invocation we don't comprehend.
@@ -119,9 +106,13 @@
         # or if shape contains 0 or -1 (like raw landsea-mask data proxies).
         return chunks
 
-    # Calculate default chunksize limit.
+    # Set the chunksize limit.
     if limit is None:
-        limit = _MAX_CHUNK_SIZE * 4
+        # Fetch the default 'optimal' chunksize from the dask config.
+        limit = dask.config.get('array.chunk-size')
+        # Convert to bytes
+        limit = da.core.parse_bytes(limit)
 
     point_size_limit = limit / dtype.itemsize
 
     # Create result chunks, starting with a copy of the input.
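The heart of the change is in the hunk above: the hard-coded _MAX_CHUNK_SIZE default gives way to dask's own configured chunk size. Below is a minimal standalone sketch of that default-limit logic, assuming a dask version that provides the standard 'array.chunk-size' config key (by default '128MiB') and dask.utils.parse_bytes; the helper name default_point_limit is hypothetical, for illustration only.

import dask.config
import numpy as np
from dask.utils import parse_bytes


def default_point_limit(dtype=np.dtype('f4')):
    # Fetch the 'ideal' chunk size from the dask config, e.g. '128MiB'.
    limit = dask.config.get('array.chunk-size')
    # Convert the human-readable size string into a plain byte count.
    limit_bytes = parse_bytes(limit)
    # Express the limit as a number of array points of the given dtype,
    # mirroring the 'point_size_limit' calculation in _optimise_chunksize.
    return limit_bytes / dtype.itemsize


print(default_point_limit())  # 33554432.0 for the default '128MiB' and 'f4'

As the updated docstring notes, a dask-only near-equivalent would be chunks = tuple(c[0] for c in da.core.normalize_chunks('auto', shape, dtype=dtype, previous_chunks=chunks)), except that dask applies its own, non-'c-like', dimension-expansion policy.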
29 changes: 21 additions & 8 deletions lib/iris/tests/unit/lazy_data/test_as_lazy_data.py
@@ -24,17 +24,17 @@
 import iris.tests as tests
 
 import dask.array as da
+import dask.config
 import numpy as np
 import numpy.ma as ma
 
-from iris._lazy_data import as_lazy_data, _MAX_CHUNK_SIZE, _optimise_chunksize
+from iris._lazy_data import as_lazy_data, _optimise_chunksize
 from iris.tests import mock
 
 
 class Test_as_lazy_data(tests.IrisTest):
     def test_lazy(self):
-        data = da.from_array(np.arange(24).reshape((2, 3, 4)),
-                             chunks=_MAX_CHUNK_SIZE)
+        data = da.from_array(np.arange(24).reshape((2, 3, 4)), chunks='auto')
         result = as_lazy_data(data)
         self.assertIsInstance(result, da.core.Array)

@@ -63,14 +63,17 @@ def test_with_masked_constant(self):


 class Test__optimised_chunks(tests.IrisTest):
+    # Stable, known chunksize for testing.
+    FIXED_CHUNKSIZE_LIMIT = 1024 * 1024 * 64
+
     @staticmethod
     def _dummydata(shape):
         return mock.Mock(spec=da.core.Array,
                          dtype=np.dtype('f4'),
                          shape=shape)
 
     def test_chunk_size_limiting(self):
-        # Check the default chunksizes for large data.
+        # Check default chunksizes for large data (with a known size limit).
         given_shapes_and_resulting_chunks = [
             ((16, 1024, 1024), (16, 1024, 1024)),  # largest unmodified
             ((17, 1011, 1022), (8, 1011, 1022)),
@@ -81,25 +84,35 @@ def test_chunk_size_limiting(self):
         ]
         err_fmt = 'Result of optimising chunks {} was {}, expected {}'
         for (shape, expected) in given_shapes_and_resulting_chunks:
-            chunks = _optimise_chunksize(shape, shape)
+            chunks = _optimise_chunksize(shape, shape,
+                                         limit=self.FIXED_CHUNKSIZE_LIMIT)
             msg = err_fmt.format(shape, chunks, expected)
             self.assertEqual(chunks, expected, msg)
 
     def test_chunk_size_expanding(self):
-        # Check the default chunksizes for small data.
+        # Check the expansion of small chunks, (with a known size limit).
         given_shapes_and_resulting_chunks = [
-            ((1, 100, 100), (16, 100, 100), (16, 100, 100)),  # large case
+            ((1, 100, 100), (16, 100, 100), (16, 100, 100)),
             ((1, 100, 100), (5000, 100, 100), (1677, 100, 100)),
             ((3, 300, 200), (10000, 3000, 2000), (3, 2700, 2000)),
             ((3, 300, 200), (10000, 300, 2000), (27, 300, 2000)),
             ((3, 300, 200), (8, 300, 2000), (8, 300, 2000)),
         ]
         err_fmt = 'Result of optimising shape={};chunks={} was {}, expected {}'
         for (shape, fullshape, expected) in given_shapes_and_resulting_chunks:
-            chunks = _optimise_chunksize(chunks=shape, shape=fullshape)
+            chunks = _optimise_chunksize(chunks=shape, shape=fullshape,
+                                         limit=self.FIXED_CHUNKSIZE_LIMIT)
             msg = err_fmt.format(fullshape, shape, chunks, expected)
             self.assertEqual(chunks, expected, msg)
 
+    def test_default_chunksize(self):
+        # Check that the "ideal" chunksize is taken from the dask config.
+        with dask.config.set({'array.chunk-size': '20b'}):
+            chunks = _optimise_chunksize((1, 8),
+                                         shape=(400,20),
+                                         dtype=np.dtype('f4'))
+            self.assertEqual(chunks, (1, 4))
+
     def test_default_chunks_limiting(self):
         # Check that chunking is still controlled when no specific 'chunks'
         # is passed.
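The new test_default_chunksize above depends on dask.config.set acting as a context manager, so the '20b' override holds only inside the with block. The expected result follows directly: a 20-byte limit divided by 4-byte 'f4' points allows at most 5 points per chunk, so the proposed (1, 8) chunk (8 points) is trimmed along its final dimension to (1, 4). A small sketch of the config-scoping behaviour, using only public dask.config calls:

import dask.config

print(dask.config.get('array.chunk-size'))      # e.g. '128MiB' (the default)
with dask.config.set({'array.chunk-size': '20b'}):
    print(dask.config.get('array.chunk-size'))  # '20b' inside the block only
print(dask.config.get('array.chunk-size'))      # restored once the block exits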
