From ec0bc557326c492e2997c550783819054225c2c5 Mon Sep 17 00:00:00 2001
From: Patrick Peglar
Date: Thu, 25 Jul 2019 18:31:39 +0100
Subject: [PATCH] Small improvements.

---
 lib/iris/_lazy_data.py                         | 37 +++++++------------
 .../tests/unit/lazy_data/test_as_lazy_data.py  | 29 +++++++++++----
 2 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py
index f35baceb607..e366f0ef844 100644
--- a/lib/iris/_lazy_data.py
+++ b/lib/iris/_lazy_data.py
@@ -60,19 +60,6 @@ def is_lazy_data(data):
     return result
 
 
-# A magic value, chosen to minimise chunk creation time and chunk processing
-# time within dask.
-_MAX_CHUNK_SIZE = 8 * 1024 * 1024 * 2
-
-# _MAX_CHUNK_SIZE = dask.config.get('array', {}).get('chunk-size', None)
-# if _MAX_CHUNK_SIZE is not None:
-#     # Convert to bytes
-#     _MAX_CHUNK_SIZE = da.core.parse_bytes(_MAX_CHUNK_SIZE)
-# else:
-#     # Fall back on our own "magic" value.
-#     _MAX_CHUNK_SIZE = 8 * 1024 * 1024 * 2
-
-
 def _optimise_chunksize(chunks, shape,
                         limit=None,
                         dtype=np.dtype('f4')):
@@ -88,27 +75,27 @@ def _optimise_chunksize(chunks, shape,
     * shape (tuple of int):
         The full array shape of the target data.
     * limit (int):
-        The 'ideal' target chunk size, in bytes.
+        The 'ideal' target chunk size, in bytes.  Default from dask.config.
     * dtype (np.dtype):
         Numpy dtype of target data.
 
     Returns:
     * chunks (tuple of int):
         The proposed shape of one full chunk.
 
     .. note::
-        The use of this is very similar to `dask.array.core.normalize_chunks`,
-        when called as
+        The purpose of this is very similar to
+        `dask.array.core.normalize_chunks`, when called as
         `(chunks='auto', shape, dtype=dtype, previous_chunks=chunks, ...)`.
-        Except : that the logic here is optimised for a specific to a 'c-like'
-        dimension order, i.e. outer dimensions first, as in netcdf variables.
+        Except, the operation here is optimised specifically for a 'c-like'
+        dimension order, i.e. outer dimensions first, as for netcdf variables.
        So if, in future, this policy can be implemented in dask, then we
         would prefer to replace this function with a call to that one.
         Accordingly, the arguments roughly match 'normalize_chunks', except
         that we don't support the alternative argument forms of that routine.
-        This routine also returns a single 'full chunk', rather
-        than a complete chunking scheme : so equivalent code usage would be
-        "chunks = [c[0] for c in normalise_chunks(chunks, ...)]".
+        The return value, however, is a single 'full chunk', rather than a
+        complete chunking scheme : so an equivalent code usage could be
+        "chunks = [c[0] for c in normalise_chunks('auto', ...)]".
 
     """
     # Return chunks unchanged, for types of invocation we don't comprehend.
@@ -119,9 +106,13 @@
         # or if shape contains 0 or -1 (like raw landsea-mask data proxies).
         return chunks
 
-    # Calculate default chunksize limit.
+    # Set the chunksize limit.
     if limit is None:
-        limit = _MAX_CHUNK_SIZE * 4
+        # Fetch the default 'optimal' chunksize from the dask config.
+        limit = dask.config.get('array.chunk-size')
+        # Convert to bytes
+        limit = da.core.parse_bytes(limit)
+
     point_size_limit = limit / dtype.itemsize
 
     # Create result chunks, starting with a copy of the input.
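
Reviewer note on the hunk above: the new default limit comes straight from the
dask config, so it can be tried in isolation.  A minimal sketch, not part of
the patch; it assumes only the dask calls the patch itself uses
(dask.config.get, da.core.parse_bytes) plus dask's stock '128MiB' default, and
the variable names are illustrative:

    import dask.array as da
    import dask.config
    import numpy as np

    # The configured 'ideal' chunksize: '128MiB' with stock dask settings.
    limit = dask.config.get('array.chunk-size')
    # Convert the config string to a byte count, e.g. 134217728.
    limit_bytes = da.core.parse_bytes(limit)
    # As in _optimise_chunksize: a byte limit becomes a point-count limit.
    point_size_limit = limit_bytes / np.dtype('f4').itemsize
    print(limit, limit_bytes, point_size_limit)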
diff --git a/lib/iris/tests/unit/lazy_data/test_as_lazy_data.py b/lib/iris/tests/unit/lazy_data/test_as_lazy_data.py
index 94d22f34e85..c18c34e7ede 100644
--- a/lib/iris/tests/unit/lazy_data/test_as_lazy_data.py
+++ b/lib/iris/tests/unit/lazy_data/test_as_lazy_data.py
@@ -24,17 +24,17 @@
 import iris.tests as tests
 
 import dask.array as da
+import dask.config
 import numpy as np
 import numpy.ma as ma
 
-from iris._lazy_data import as_lazy_data, _MAX_CHUNK_SIZE, _optimise_chunksize
+from iris._lazy_data import as_lazy_data, _optimise_chunksize
 from iris.tests import mock
 
 
 class Test_as_lazy_data(tests.IrisTest):
     def test_lazy(self):
-        data = da.from_array(np.arange(24).reshape((2, 3, 4)),
-                             chunks=_MAX_CHUNK_SIZE)
+        data = da.from_array(np.arange(24).reshape((2, 3, 4)), chunks='auto')
         result = as_lazy_data(data)
         self.assertIsInstance(result, da.core.Array)
 
@@ -63,6 +63,9 @@ def test_with_masked_constant(self):
 
 
 class Test__optimised_chunks(tests.IrisTest):
+    # Stable, known chunksize for testing.
+    FIXED_CHUNKSIZE_LIMIT = 1024 * 1024 * 64
+
     @staticmethod
     def _dummydata(shape):
         return mock.Mock(spec=da.core.Array,
@@ -70,7 +73,7 @@ def _dummydata(shape):
                          shape=shape)
 
     def test_chunk_size_limiting(self):
-        # Check the default chunksizes for large data.
+        # Check default chunksizes for large data (with a known size limit).
         given_shapes_and_resulting_chunks = [
             ((16, 1024, 1024), (16, 1024, 1024)),  # largest unmodified
             ((17, 1011, 1022), (8, 1011, 1022)),
@@ -81,14 +84,15 @@
         ]
         err_fmt = 'Result of optimising chunks {} was {}, expected {}'
         for (shape, expected) in given_shapes_and_resulting_chunks:
-            chunks = _optimise_chunksize(shape, shape)
+            chunks = _optimise_chunksize(shape, shape,
+                                         limit=self.FIXED_CHUNKSIZE_LIMIT)
             msg = err_fmt.format(shape, chunks, expected)
             self.assertEqual(chunks, expected, msg)
 
     def test_chunk_size_expanding(self):
-        # Check the default chunksizes for small data.
+        # Check the expansion of small chunks (with a known size limit).
         given_shapes_and_resulting_chunks = [
-            ((1, 100, 100), (16, 100, 100), (16, 100, 100)),  # large case
+            ((1, 100, 100), (16, 100, 100), (16, 100, 100)),
             ((1, 100, 100), (5000, 100, 100), (1677, 100, 100)),
             ((3, 300, 200), (10000, 3000, 2000), (3, 2700, 2000)),
             ((3, 300, 200), (10000, 300, 2000), (27, 300, 2000)),
@@ -96,10 +100,19 @@
         ]
         err_fmt = 'Result of optimising shape={};chunks={} was {}, expected {}'
         for (shape, fullshape, expected) in given_shapes_and_resulting_chunks:
-            chunks = _optimise_chunksize(chunks=shape, shape=fullshape)
+            chunks = _optimise_chunksize(chunks=shape, shape=fullshape,
+                                         limit=self.FIXED_CHUNKSIZE_LIMIT)
             msg = err_fmt.format(fullshape, shape, chunks, expected)
             self.assertEqual(chunks, expected, msg)
 
+    def test_default_chunksize(self):
+        # Check that the "ideal" chunksize is taken from the dask config.
+        with dask.config.set({'array.chunk-size': '20b'}):
+            chunks = _optimise_chunksize((1, 8),
+                                         shape=(400, 20),
+                                         dtype=np.dtype('f4'))
+        self.assertEqual(chunks, (1, 4))
+
     def test_default_chunks_limiting(self):
         # Check that chunking is still controlled when no specific 'chunks'
         # is passed.
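
Reviewer note on test_default_chunksize: the expected (1, 4) result follows
directly from the configured limit, and can be reproduced outside the test
class.  A sketch assuming only what the patch shows (the private
iris._lazy_data._optimise_chunksize and the dask 'array.chunk-size' config
key); the arithmetic comments are explanatory, not from the patch:

    import dask.config
    import numpy as np

    from iris._lazy_data import _optimise_chunksize

    # A 20-byte limit allows 20 / 4 = 5 points of 'f4' data per chunk.
    with dask.config.set({'array.chunk-size': '20b'}):
        # The proposed (1, 8) chunk is 8 points (32 bytes), over the limit,
        # so it is cut down; (1, 4) is 16 bytes, which fits.
        chunks = _optimise_chunksize((1, 8), shape=(400, 20),
                                     dtype=np.dtype('f4'))
    print(chunks)  # (1, 4)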