
Commit

Small improvements.
pp-mo committed Jul 25, 2019
1 parent 019f8f4 commit ec0bc55
Showing 2 changed files with 35 additions and 31 deletions.
37 changes: 14 additions & 23 deletions lib/iris/_lazy_data.py
@@ -60,19 +60,6 @@ def is_lazy_data(data):
     return result
 
 
-# A magic value, chosen to minimise chunk creation time and chunk processing
-# time within dask.
-_MAX_CHUNK_SIZE = 8 * 1024 * 1024 * 2
-
-# _MAX_CHUNK_SIZE = dask.config.get('array', {}).get('chunk-size', None)
-# if _MAX_CHUNK_SIZE is not None:
-#     # Convert to bytes
-#     _MAX_CHUNK_SIZE = da.core.parse_bytes(_MAX_CHUNK_SIZE)
-# else:
-#     # Fall back on our own "magic" value.
-#     _MAX_CHUNK_SIZE = 8 * 1024 * 1024 * 2
-
-
 def _optimise_chunksize(chunks, shape,
                         limit=None,
                         dtype=np.dtype('f4')):
@@ -88,7 +75,7 @@ def _optimise_chunksize(chunks, shape,
     * shape (tuple of int):
         The full array shape of the target data.
     * limit (int):
-        The 'ideal' target chunk size, in bytes.
+        The 'ideal' target chunk size, in bytes.  Default from dask.config.
     * dtype (np.dtype):
         Numpy dtype of target data.
@@ -97,18 +84,18 @@
         The proposed shape of one full chunk.
 
     .. note::
-        The use of this is very similar to `dask.array.core.normalize_chunks`,
-        when called as
+        The purpose of this is very similar to
+        `dask.array.core.normalize_chunks`, when called as
             `(chunks='auto', shape, dtype=dtype, previous_chunks=chunks, ...)`.
-        Except : that the logic here is optimised for a specific to a 'c-like'
-        dimension order, i.e. outer dimensions first, as in netcdf variables.
+        Except, the operation here is optimised specifically for a 'c-like'
+        dimension order, i.e. outer dimensions first, as for netcdf variables.
         So if, in future, this policy can be implemented in dask, then we
         would prefer to replace this function with a call to that one.
 
         Accordingly, the arguments roughly match 'normalize_chunks', except
         that we don't support the alternative argument forms of that routine.
-        This routine also returns a single 'full chunk', rather
-        than a complete chunking scheme : so equivalent code usage would be
-        "chunks = [c[0] for c in normalise_chunks(chunks, ...)]".
+        The return value, however, is a single 'full chunk', rather than a
+        complete chunking scheme : so an equivalent code usage could be
+        "chunks = [c[0] for c in normalise_chunks('auto', ...)]".
 
     """
         # Return chunks unchanged, for types of invocation we don't comprehend.
@@ -119,9 +106,13 @@
         # or if shape contains 0 or -1 (like raw landsea-mask data proxies).
         return chunks
 
-    # Calculate default chunksize limit.
+    # Set the chunksize limit.
     if limit is None:
-        limit = _MAX_CHUNK_SIZE * 4
+        # Fetch the default 'optimal' chunksize from the dask config.
+        limit = dask.config.get('array.chunk-size')
+        # Convert to bytes
+        limit = da.core.parse_bytes(limit)
 
     point_size_limit = limit / dtype.itemsize
 
     # Create result chunks, starting with a copy of the input.
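The heart of the change is in the hunk above: the hard-coded _MAX_CHUNK_SIZE default gives way to dask's own configured chunk size. Below is a minimal standalone sketch of that default-limit logic, assuming a dask version that provides the standard 'array.chunk-size' config key (by default '128MiB') and dask.utils.parse_bytes; the helper name default_point_limit is hypothetical, for illustration only.

import dask.config
import numpy as np
from dask.utils import parse_bytes


def default_point_limit(dtype=np.dtype('f4')):
    # Fetch the 'ideal' chunk size from the dask config, e.g. '128MiB'.
    limit = dask.config.get('array.chunk-size')
    # Convert the human-readable size string into a plain byte count.
    limit_bytes = parse_bytes(limit)
    # Express the limit as a number of array points of the given dtype,
    # mirroring the 'point_size_limit' calculation in _optimise_chunksize.
    return limit_bytes / dtype.itemsize


print(default_point_limit())  # 33554432.0 for the default '128MiB' and 'f4'

As the updated docstring notes, a dask-only near-equivalent would be chunks = tuple(c[0] for c in da.core.normalize_chunks('auto', shape, dtype=dtype, previous_chunks=chunks)), except that dask applies its own, non-'c-like', dimension-expansion policy.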
29 changes: 21 additions & 8 deletions lib/iris/tests/unit/lazy_data/test_as_lazy_data.py
@@ -24,17 +24,17 @@
 import iris.tests as tests
 
 import dask.array as da
+import dask.config
 import numpy as np
 import numpy.ma as ma
 
-from iris._lazy_data import as_lazy_data, _MAX_CHUNK_SIZE, _optimise_chunksize
+from iris._lazy_data import as_lazy_data, _optimise_chunksize
 from iris.tests import mock
 
 
 class Test_as_lazy_data(tests.IrisTest):
     def test_lazy(self):
-        data = da.from_array(np.arange(24).reshape((2, 3, 4)),
-                             chunks=_MAX_CHUNK_SIZE)
+        data = da.from_array(np.arange(24).reshape((2, 3, 4)), chunks='auto')
         result = as_lazy_data(data)
         self.assertIsInstance(result, da.core.Array)

@@ -63,14 +63,17 @@ def test_with_masked_constant(self):


 class Test__optimised_chunks(tests.IrisTest):
+    # Stable, known chunksize for testing.
+    FIXED_CHUNKSIZE_LIMIT = 1024 * 1024 * 64
+
     @staticmethod
     def _dummydata(shape):
         return mock.Mock(spec=da.core.Array,
                          dtype=np.dtype('f4'),
                          shape=shape)
 
     def test_chunk_size_limiting(self):
-        # Check the default chunksizes for large data.
+        # Check default chunksizes for large data (with a known size limit).
         given_shapes_and_resulting_chunks = [
             ((16, 1024, 1024), (16, 1024, 1024)),  # largest unmodified
             ((17, 1011, 1022), (8, 1011, 1022)),
@@ -81,25 +84,35 @@ def test_chunk_size_limiting(self):
         ]
         err_fmt = 'Result of optimising chunks {} was {}, expected {}'
         for (shape, expected) in given_shapes_and_resulting_chunks:
-            chunks = _optimise_chunksize(shape, shape)
+            chunks = _optimise_chunksize(shape, shape,
+                                         limit=self.FIXED_CHUNKSIZE_LIMIT)
             msg = err_fmt.format(shape, chunks, expected)
             self.assertEqual(chunks, expected, msg)
 
     def test_chunk_size_expanding(self):
-        # Check the default chunksizes for small data.
+        # Check the expansion of small chunks, (with a known size limit).
         given_shapes_and_resulting_chunks = [
-            ((1, 100, 100), (16, 100, 100), (16, 100, 100)),  # large case
+            ((1, 100, 100), (16, 100, 100), (16, 100, 100)),
             ((1, 100, 100), (5000, 100, 100), (1677, 100, 100)),
             ((3, 300, 200), (10000, 3000, 2000), (3, 2700, 2000)),
             ((3, 300, 200), (10000, 300, 2000), (27, 300, 2000)),
             ((3, 300, 200), (8, 300, 2000), (8, 300, 2000)),
         ]
         err_fmt = 'Result of optimising shape={};chunks={} was {}, expected {}'
         for (shape, fullshape, expected) in given_shapes_and_resulting_chunks:
-            chunks = _optimise_chunksize(chunks=shape, shape=fullshape)
+            chunks = _optimise_chunksize(chunks=shape, shape=fullshape,
+                                         limit=self.FIXED_CHUNKSIZE_LIMIT)
             msg = err_fmt.format(fullshape, shape, chunks, expected)
             self.assertEqual(chunks, expected, msg)
 
+    def test_default_chunksize(self):
+        # Check that the "ideal" chunksize is taken from the dask config.
+        with dask.config.set({'array.chunk-size': '20b'}):
+            chunks = _optimise_chunksize((1, 8),
+                                         shape=(400,20),
+                                         dtype=np.dtype('f4'))
+            self.assertEqual(chunks, (1, 4))
+
     def test_default_chunks_limiting(self):
         # Check that chunking is still controlled when no specific 'chunks'
         # is passed.
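The new test_default_chunksize above depends on dask.config.set acting as a context manager, so the '20b' override holds only inside the with block. The expected result follows directly: a 20-byte limit divided by 4-byte 'f4' points allows at most 5 points per chunk, so the proposed (1, 8) chunk (8 points) is trimmed along its final dimension to (1, 4). A small sketch of the config-scoping behaviour, using only public dask.config calls:

import dask.config

print(dask.config.get('array.chunk-size'))      # e.g. '128MiB' (the default)
with dask.config.set({'array.chunk-size': '20b'}):
    print(dask.config.get('array.chunk-size'))  # '20b' inside the block only
print(dask.config.get('array.chunk-size'))      # restored once the block exits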
