From d726fc2dfd6536f61279621fdfbf39ca162bffc1 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 14 May 2021 12:07:41 +0100 Subject: [PATCH 1/5] Use 'meta' in da.from_array to stop it sampling netcdf variables, which is quite slow. --- docs/iris/src/whatsnew/3.0.2.rst | 4 ++++ lib/iris/_lazy_data.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/iris/src/whatsnew/3.0.2.rst b/docs/iris/src/whatsnew/3.0.2.rst index 769b673a4c..7bacfa4ce6 100644 --- a/docs/iris/src/whatsnew/3.0.2.rst +++ b/docs/iris/src/whatsnew/3.0.2.rst @@ -42,6 +42,10 @@ This document explains the changes made to Iris for this release developers to easily disable `cirrus-ci`_ tasks. See :ref:`skipping Cirrus-CI tasks`. (:pull:`4019`) [``pre-v3.1.0``] + #. `@pp-mo`_ adjusted the use of :func:`dask.array.from_array` in :func:`iris._lazy_data.as_lazy_data`, to avoid + the dask 'test access'. This makes loading of netcdf files with large numbers variables significantly faster. + (:pull:`4134`) [``pre-v3.1.0``] + Note that, the contributions labelled ``pre-v3.1.0`` are part of the forthcoming Iris v3.1.0 release, but require to be included in this patch release. diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index dcc0674fe6..c549cb4679 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -192,7 +192,9 @@ def as_lazy_data(data, chunks=None, asarray=False): if isinstance(data, ma.core.MaskedConstant): data = ma.masked_array(data.data, mask=data.mask) if not is_lazy_data(data): - data = da.from_array(data, chunks=chunks, asarray=asarray) + data = da.from_array( + data, chunks=chunks, asarray=asarray, meta=np.ndarray + ) return data From 89a2194ffd825f415afcb6c96ed4136d80ea0c96 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 14 May 2021 12:19:44 +0100 Subject: [PATCH 2/5] Fix PR number. --- docs/iris/src/whatsnew/3.0.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/iris/src/whatsnew/3.0.2.rst b/docs/iris/src/whatsnew/3.0.2.rst index 7bacfa4ce6..bee5d65bc8 100644 --- a/docs/iris/src/whatsnew/3.0.2.rst +++ b/docs/iris/src/whatsnew/3.0.2.rst @@ -44,7 +44,7 @@ This document explains the changes made to Iris for this release #. `@pp-mo`_ adjusted the use of :func:`dask.array.from_array` in :func:`iris._lazy_data.as_lazy_data`, to avoid the dask 'test access'. This makes loading of netcdf files with large numbers variables significantly faster. - (:pull:`4134`) [``pre-v3.1.0``] + (:pull:`4135`) [``pre-v3.1.0``] Note that, the contributions labelled ``pre-v3.1.0`` are part of the forthcoming Iris v3.1.0 release, but require to be included in this patch release. From f43f49493c0b2121b548225618a54696866726c4 Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 14 May 2021 12:27:35 +0100 Subject: [PATCH 3/5] Fix test. --- .../tests/unit/lazy_data/test_co_realise_cubes.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py b/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py index b7365a4316..212f415a7e 100644 --- a/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py +++ b/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py @@ -71,12 +71,12 @@ def test_combined_access(self): cube_e = Cube(derived_e) co_realise_cubes(cube_a, cube_b, cube_c, cube_d, cube_e) # Though used more than once, the source data should only get fetched - # twice by dask. Once when dask performs an initial data access with - # no data payload to ascertain the metadata associated with the - # dask.array (this access is specific to dask 2+, see - # dask.array.utils.meta_from_array), and again when the whole data is - # accessed. - self.assertEqual(wrapped_array.access_count, 2) + # once by dask, when the whole data is accessed. + # This also ensures that dask does *not* performs an initial data + # access with no data payload to ascertain the metadata associated with + # the dask.array (this access is specific to dask 2+, + # see dask.array.utils.meta_from_array). + self.assertEqual(wrapped_array.access_count, 1) if __name__ == "__main__": From f57868c4bd016ede936acc060c3f8c942f8da7ef Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Mon, 17 May 2021 14:44:49 +0100 Subject: [PATCH 4/5] Review changes. --- docs/iris/src/whatsnew/3.0.2.rst | 2 +- lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/iris/src/whatsnew/3.0.2.rst b/docs/iris/src/whatsnew/3.0.2.rst index bee5d65bc8..3d94cefa17 100644 --- a/docs/iris/src/whatsnew/3.0.2.rst +++ b/docs/iris/src/whatsnew/3.0.2.rst @@ -43,7 +43,7 @@ This document explains the changes made to Iris for this release :ref:`skipping Cirrus-CI tasks`. (:pull:`4019`) [``pre-v3.1.0``] #. `@pp-mo`_ adjusted the use of :func:`dask.array.from_array` in :func:`iris._lazy_data.as_lazy_data`, to avoid - the dask 'test access'. This makes loading of netcdf files with large numbers variables significantly faster. + the dask 'test access'. This makes loading of netcdf files with a large number of variables significantly faster. (:pull:`4135`) [``pre-v3.1.0``] Note that, the contributions labelled ``pre-v3.1.0`` are part of the forthcoming diff --git a/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py b/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py index 212f415a7e..2e3be105ed 100644 --- a/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py +++ b/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py @@ -72,7 +72,7 @@ def test_combined_access(self): co_realise_cubes(cube_a, cube_b, cube_c, cube_d, cube_e) # Though used more than once, the source data should only get fetched # once by dask, when the whole data is accessed. - # This also ensures that dask does *not* performs an initial data + # This also ensures that dask does *not* perform an initial data # access with no data payload to ascertain the metadata associated with # the dask.array (this access is specific to dask 2+, # see dask.array.utils.meta_from_array). From 7b3d8511919915ef247aee248fb5c7caa8732a18 Mon Sep 17 00:00:00 2001 From: lbdreyer Date: Mon, 24 May 2021 22:07:29 +0100 Subject: [PATCH 5/5] Update docs/iris/src/whatsnew/3.0.2.rst --- docs/iris/src/whatsnew/3.0.2.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/iris/src/whatsnew/3.0.2.rst b/docs/iris/src/whatsnew/3.0.2.rst index 3d94cefa17..44bce83643 100644 --- a/docs/iris/src/whatsnew/3.0.2.rst +++ b/docs/iris/src/whatsnew/3.0.2.rst @@ -44,7 +44,7 @@ This document explains the changes made to Iris for this release #. `@pp-mo`_ adjusted the use of :func:`dask.array.from_array` in :func:`iris._lazy_data.as_lazy_data`, to avoid the dask 'test access'. This makes loading of netcdf files with a large number of variables significantly faster. - (:pull:`4135`) [``pre-v3.1.0``] + (:pull:`4135`) Note that, the contributions labelled ``pre-v3.1.0`` are part of the forthcoming Iris v3.1.0 release, but require to be included in this patch release.