diff --git a/esmvalcore/_recipe_checks.py b/esmvalcore/_recipe_checks.py index 21cff6a4a9..1133a36639 100644 --- a/esmvalcore/_recipe_checks.py +++ b/esmvalcore/_recipe_checks.py @@ -9,7 +9,8 @@ import yamale from ._data_finder import get_start_end_year -from .preprocessor import PreprocessingTask, TIME_PREPROCESSORS +from .preprocessor import TIME_PREPROCESSORS, PreprocessingTask +from .preprocessor._multimodel import STATISTIC_MAPPING logger = logging.getLogger(__name__) @@ -191,7 +192,7 @@ def extract_shape(settings): def valid_multimodel_statistic(statistic): """Check that `statistic` is a valid argument for multimodel stats.""" - valid_names = ["mean", "median", "std", "min", "max"] + valid_names = ['std'] + list(STATISTIC_MAPPING.keys()) valid_patterns = [r"^(p\d{1,2})(\.\d*)?$"] if not (statistic in valid_names or re.match(r'|'.join(valid_patterns), statistic)): diff --git a/esmvalcore/preprocessor/_multimodel.py b/esmvalcore/preprocessor/_multimodel.py index ad010ae097..8469f9177b 100644 --- a/esmvalcore/preprocessor/_multimodel.py +++ b/esmvalcore/preprocessor/_multimodel.py @@ -3,184 +3,67 @@ import logging import re from datetime import datetime -from functools import partial, reduce +from functools import reduce import cf_units +import dask.array as da import iris import numpy as np -import scipy +from iris.util import equalise_attributes logger = logging.getLogger(__name__) +STATISTIC_MAPPING = { + 'gmean': iris.analysis.GMEAN, # not lazy in iris + 'hmean': iris.analysis.HMEAN, # not lazy in iris + 'max': iris.analysis.MAX, + 'median': iris.analysis.MEDIAN, # not lazy in iris + 'min': iris.analysis.MIN, + 'rms': iris.analysis.RMS, + 'sum': iris.analysis.SUM, + 'mean': iris.analysis.MEAN, + 'std_dev': iris.analysis.STD_DEV, + 'variance': iris.analysis.VARIANCE, + # The following require extra kwargs, + # atm this is only supported for percentiles via e.g. `pXX` + 'count': iris.analysis.COUNT, + 'peak': iris.analysis.PEAK, + 'percentile': iris.analysis.PERCENTILE, # not lazy in iris + 'proportion': iris.analysis.PROPORTION, # not lazy in iris + 'wpercentile': iris.analysis.WPERCENTILE, # not lazy in iris +} + +CONCAT_DIM = 'multi-model' + + +def _resolve_operator(statistic: str): + """Find the operator corresponding to the statistic.""" + statistic = statistic.lower() + kwargs = {} + + # special cases + if statistic == 'std': + logger.warning( + "Changing statistics from specified `std` to `std_dev`, " + "since multimodel statistics is now using the iris.analysis module" + ", which also uses `std_dev`. Please consider replacing 'std' " + " with 'std_dev' in your recipe or code.") + statistic = 'std_dev' + + elif re.match(r"^(p\d{1,2})(\.\d*)?$", statistic): + # percentiles between p0 and p99.99999... + percentile = float(statistic[1:]) + kwargs['percent'] = percentile + statistic = 'percentile' -def _plev_fix(dataset, pl_idx): - """Extract valid plev data. - - This function takes care of situations in which certain plevs are - completely masked due to unavailable interpolation boundaries. - """ - if np.ma.is_masked(dataset): - # keep only the valid plevs - if not np.all(dataset.mask[pl_idx]): - statj = np.ma.array(dataset[pl_idx], mask=dataset.mask[pl_idx]) - else: - logger.debug('All vals in plev are masked, ignoring.') - statj = None - else: - mask = np.zeros_like(dataset[pl_idx], bool) - statj = np.ma.array(dataset[pl_idx], mask=mask) - - return statj - - -def _quantile(data, axis, quantile): - """Calculate quantile. + try: + operator = STATISTIC_MAPPING[statistic] + except KeyError as err: + raise ValueError( + f'Statistic `{statistic}` not supported by multicube statistics. ' + f'Must be one of {tuple(STATISTIC_MAPPING.keys())}.') from err - Workaround for calling scipy's mquantiles with arrays of >2 - dimensions Similar to iris' _percentiles function, see their - discussion: https://github.com/SciTools/iris/pull/625 - """ - # Ensure that the target axis is the last dimension. - data = np.rollaxis(data, axis, start=data.ndim) - shape = data.shape[:-1] - # Flatten any leading dimensions. - if shape: - data = data.reshape([np.prod(shape), data.shape[-1]]) - # Perform the quantile calculation. - result = scipy.stats.mstats.mquantiles(data, - quantile, - axis=-1, - alphap=1, - betap=1) - # Ensure to unflatten any leading dimensions. - if shape: - result = result.reshape(shape) - # Check whether to reduce to a scalar result - if result.shape == (1, ): - result = result[0] - - return result - - -def _compute_statistic(data, statistic_name): - """Compute multimodel statistic.""" - data = np.ma.array(data) - statistic = data[0] - - if statistic_name == 'median': - statistic_function = np.ma.median - elif statistic_name == 'mean': - statistic_function = np.ma.mean - elif statistic_name == 'std': - statistic_function = np.ma.std - elif statistic_name == 'max': - statistic_function = np.ma.max - elif statistic_name == 'min': - statistic_function = np.ma.min - elif re.match(r"^(p\d{1,2})(\.\d*)?$", statistic_name): - # percentiles between p0 and p99.99999... - quantile = float(statistic_name[1:]) / 100 - statistic_function = partial(_quantile, quantile=quantile) - else: - raise ValueError(f'No such statistic: `{statistic_name}`') - - # no plevs - if len(data[0].shape) < 3: - # get all NOT fully masked data - u_data - # data is per time point - # so we can safely NOT compute stats for single points - if data.ndim == 1: - u_datas = data - else: - u_datas = [d for d in data if not np.all(d.mask)] - if len(u_datas) > 1: - statistic = statistic_function(data, axis=0) - else: - statistic.mask = True - return statistic - - # plevs - for j in range(statistic.shape[0]): - plev_check = [] - for cdata in data: - fixed_data = _plev_fix(cdata, j) - if fixed_data is not None: - plev_check.append(fixed_data) - - # check for nr datasets - if len(plev_check) > 1: - plev_check = np.ma.array(plev_check) - statistic[j] = statistic_function(plev_check, axis=0) - else: - statistic.mask[j] = True - - return statistic - - -def _put_in_cube(template_cube, cube_data, statistic, t_axis): - """Quick cube building and saving.""" - tunits = template_cube.coord('time').units - times = iris.coords.DimCoord(t_axis, - standard_name='time', - units=tunits, - var_name='time', - long_name='time') - times.bounds = None - times.guess_bounds() - - coord_names = [c.long_name for c in template_cube.coords()] - coord_names.extend([c.standard_name for c in template_cube.coords()]) - if 'latitude' in coord_names: - lats = template_cube.coord('latitude') - else: - lats = None - if 'longitude' in coord_names: - lons = template_cube.coord('longitude') - else: - lons = None - - # no plevs - if len(template_cube.shape) == 3: - cspec = [(times, 0), (lats, 1), (lons, 2)] - # plevs - elif len(template_cube.shape) == 4: - if 'air_pressure' in coord_names: - plev = template_cube.coord('air_pressure') - elif 'altitude' in coord_names: - plev = template_cube.coord('altitude') - else: - raise ValueError(f"4D cube with dimensions {coord_names} not " - f"supported at the moment") - cspec = [(times, 0), (plev, 1), (lats, 2), (lons, 3)] - elif len(template_cube.shape) == 1: - cspec = [ - (times, 0), - ] - elif len(template_cube.shape) == 2: - # If you're going to hardwire air_pressure into this, - # might as well have depth here too. - plev = template_cube.coord('depth') - cspec = [ - (times, 0), - (plev, 1), - ] - - # correct dspec if necessary - fixed_dspec = np.ma.fix_invalid(cube_data, copy=False, fill_value=1e+20) - # put in cube - stats_cube = iris.cube.Cube(fixed_dspec, - dim_coords_and_dims=cspec, - long_name=statistic) - coord_names = [coord.name() for coord in template_cube.coords()] - if 'air_pressure' in coord_names: - if len(template_cube.shape) == 3: - stats_cube.add_aux_coord(template_cube.coord('air_pressure')) - - stats_cube.var_name = template_cube.var_name - stats_cube.long_name = template_cube.long_name - stats_cube.standard_name = template_cube.standard_name - stats_cube.units = template_cube.units - return stats_cube + return operator, kwargs def _get_consistent_time_unit(cubes): @@ -218,7 +101,6 @@ def _unify_time_coordinates(cubes): if 0 not in np.diff(years): # yearly data dates = [datetime(year, 7, 1, 0, 0, 0) for year in years] - elif 0 not in np.diff(months): # monthly data dates = [ @@ -248,115 +130,198 @@ def _unify_time_coordinates(cubes): cube.coord('time').guess_bounds() -def _get_time_slice(cubes, time): - """Fill time slice array with cubes' data if time in cube, else mask.""" - time_slice = [] - for cube in cubes: - cube_time = cube.coord('time').points - if time in cube_time: - idx = int(np.argwhere(cube_time == time)) - subset = cube.data[idx] - else: - subset = np.ma.empty(list(cube.shape[1:])) - subset.mask = True - time_slice.append(subset) - return time_slice +def _time_coords_are_aligned(cubes): + """Return `True` if time coords are aligned.""" + first_time_array = cubes[0].coord('time').points + for cube in cubes[1:]: + other_time_array = cube.coord('time').points + if not np.array_equal(first_time_array, other_time_array): + return False -def _assemble_data(cubes, statistic, span='overlap'): - """Get statistical data in iris cubes.""" - # New time array representing the union or intersection of all cubes - time_spans = [cube.coord('time').points for cube in cubes] - if span == 'overlap': - new_times = reduce(np.intersect1d, time_spans) - elif span == 'full': - new_times = reduce(np.union1d, time_spans) - n_times = len(new_times) + return True - # Target array to populate with computed statistics - new_shape = [n_times] + list(cubes[0].shape[1:]) - stats_data = np.ma.zeros(new_shape, dtype=np.dtype('float32')) - # Realize all cubes at once instead of separately for each time slice - _ = [cube.data for cube in cubes] +def _subset(cube, time_points): + """Subset cube to given time range.""" + begin = cube.coord('time').units.num2date(time_points[0]) + end = cube.coord('time').units.num2date(time_points[-1]) + constraint = iris.Constraint(time=lambda cell: begin <= cell.point <= end) + try: + return cube.extract(constraint) + except Exception as excinfo: + raise ValueError( + "Tried to align cubes in multi-model statistics, but failed for" + f" cube {cube} and time points {time_points}. Encountered the " + f"following exception: {excinfo}") - # Make time slices and compute stats - for i, time in enumerate(new_times): - time_data = _get_time_slice(cubes, time) - stats_data[i] = _compute_statistic(time_data, statistic) - template = cubes[0] - stats_cube = _put_in_cube(template, stats_data, statistic, new_times) - return stats_cube +def _extend(cube, time_points): + """Extend cube to a specified time range. + If time points are missing before the start/after the end of the + time range, cubes for each missing time pointwith masked data will + be added to pad the time range to match `time_points`. This method + supports lazy operation. + """ + cube.coord('time').bounds = None + cube_points = cube.coord('time').points -def _multicube_statistics(cubes, statistics, span): - """Compute statistics over multiple cubes. + begin = cube_points[0] + end = cube_points[-1] - Can be used e.g. for ensemble or multi-model statistics. + pad_begin = time_points[time_points < begin] + pad_end = time_points[time_points > end] - This function was designed to work on (max) four-dimensional data: - time, vertical axis, two horizontal axes. + if (len(pad_begin)) == 0 and (len(pad_end) == 0): + return cube - Apart from the time coordinate, cubes must have consistent shapes. There - are two options to combine time coordinates of different lengths, see - the `span` argument. + template_cube = cube[:1].copy() + template_cube.data = np.ma.array(da.zeros_like(template_cube.data), + mask=True, + dtype=template_cube.data.dtype) - Parameters - ---------- - cubes: list - list of cubes over which the statistics will be computed; - statistics: list - statistical metrics to be computed. Available options: mean, median, - max, min, std, or pXX.YY (for percentile XX.YY; decimal part optional). - span: str - overlap or full; if overlap, statitsticss are computed on common time- - span; if full, statistics are computed on full time spans, ignoring - missing data. + cube_list = [] - Returns - ------- - dict - dictionary of statistics cubes with statistics' names as keys. + for time_point in pad_begin: + new_slice = template_cube.copy() + new_slice.coord('time').points = float(time_point) + cube_list.append(new_slice) - Raises - ------ - ValueError - If span is neither overlap nor full. - """ - if len(cubes) < 2: - logger.info('Found only 1 cube; no statistics computed for %r', - list(cubes)[0]) - return {statistic: cubes[0] for statistic in statistics} + cube_list.append(cube) + + for time_point in pad_end: + new_slice = template_cube.copy() + new_slice.coord('time').points = float(time_point) + cube_list.append(new_slice) - logger.debug('Multicube statistics: computing: %s', statistics) + cube_list = iris.cube.CubeList(cube_list) - # Reset time coordinates and make cubes share the same calendar + try: + new_cube = cube_list.concatenate_cube() + except Exception as excinfo: + raise ValueError( + "Tried to align cubes in multi-model statistics, but failed for" + f" cube {cube} and time points {time_points}. Encountered the " + f"following exception: {excinfo}") + + return new_cube + + +def _align(cubes, span): + """Expand or subset cubes so they share a common time span.""" _unify_time_coordinates(cubes) - # Check whether input is valid + if _time_coords_are_aligned(cubes): + return cubes + + all_time_arrays = [cube.coord('time').points for cube in cubes] + if span == 'overlap': - # check if we have any time overlap - times = [cube.coord('time').points for cube in cubes] - overlap = reduce(np.intersect1d, times) - if len(overlap) <= 1: - logger.info("Time overlap between cubes is none or a single point." - "check datasets: will not compute statistics.") - return cubes - logger.debug("Using common time overlap between " - "datasets to compute statistics.") + common_time_points = reduce(np.intersect1d, all_time_arrays) + new_cubes = [_subset(cube, common_time_points) for cube in cubes] elif span == 'full': - logger.debug("Using full time spans to compute statistics.") + all_time_points = reduce(np.union1d, all_time_arrays) + new_cubes = [_extend(cube, all_time_points) for cube in cubes] else: + raise ValueError(f"Invalid argument for span: {span!r}" + "Must be one of 'overlap', 'full'.") + + for cube in new_cubes: + # Make sure bounds exist and are consistent + cube.coord('time').bounds = None + cube.coord('time').guess_bounds() + + return new_cubes + + +def _combine(cubes): + """Merge iris cubes into a single big cube with new dimension. + + This assumes that all input cubes have the same shape. + """ + equalise_attributes(cubes) # in-place + + for i, cube in enumerate(cubes): + concat_dim = iris.coords.AuxCoord(i, var_name=CONCAT_DIM) + + cube.add_aux_coord(concat_dim) + + # Clear some metadata that can cause merge to fail + # https://scitools-iris.readthedocs.io/en/stable/userguide/ + # merge_and_concat.html#common-issues-with-merge-and-concatenate + + cube.cell_methods = None + + for coord in cube.coords(): + coord.long_name = None + coord.attributes = None + + cubes = iris.cube.CubeList(cubes) + + merged_cube = cubes.merge_cube() + + return merged_cube + + +def _compute_eager(cubes: list, *, operator: iris.analysis.Aggregator, + **kwargs): + """Compute statistics one slice at a time.""" + _ = [cube.data for cube in cubes] # make sure the cubes' data are realized + + result_slices = [] + for i in range(cubes[0].shape[0]): + single_model_slices = [cube[i] for cube in cubes] + combined_slice = _combine(single_model_slices) + collapsed_slice = combined_slice.collapsed(CONCAT_DIM, operator, + **kwargs) + + # some iris aggregators modify dtype, see e.g. + # https://numpy.org/doc/stable/reference/generated/numpy.ma.average.html + collapsed_slice.data = collapsed_slice.data.astype(np.float32) + + result_slices.append(collapsed_slice) + + try: + result_cube = iris.cube.CubeList(result_slices).merge_cube() + except Exception as excinfo: raise ValueError( - "Unexpected value for span {}, choose from 'overlap', 'full'". - format(span)) + "Multi-model statistics failed to concatenate results into a" + f" single array. This happened for operator {operator}" + f" with computed statistics {result_slices}." + "This can happen e.g. if the calculation results in inconsistent" + f" dtypes. Encountered the following exception: {excinfo}") + + result_cube.data = np.ma.array(result_cube.data) + result_cube.remove_coord(CONCAT_DIM) + return result_cube + + +def _multicube_statistics(cubes, statistics, span): + """Compute statistics over multiple cubes. + + Can be used e.g. for ensemble or multi-model statistics. + + Cubes are merged and subsequently collapsed along a new auxiliary + coordinate. Inconsistent attributes will be removed. + """ + if len(cubes) == 1: + raise ValueError('Cannot perform multicube statistics ' + 'for a single cube.') + + copied_cubes = [cube.copy() for cube in cubes] # avoid modifying inputs + aligned_cubes = _align(copied_cubes, span=span) - # Compute statistics statistics_cubes = {} for statistic in statistics: - statistic_cube = _assemble_data(cubes, statistic, span) - statistics_cubes[statistic] = statistic_cube + logger.debug('Multicube statistics: computing: %s', statistic) + operator, kwargs = _resolve_operator(statistic) + + result_cube = _compute_eager(aligned_cubes, + operator=operator, + **kwargs) + + statistics_cubes[statistic] = result_cube return statistics_cubes @@ -399,23 +364,36 @@ def multi_model_statistics(products, keep_input_datasets=True): """Compute multi-model statistics. - This function computes multi-model statistics on cubes or products. - Products (or: preprocessorfiles) are used internally by ESMValCore to store + This function computes multi-model statistics on a list of ``products``, + which can be instances of :py:class:`~iris.cube.Cube` or + :py:class:`~esmvalcore.preprocessor.PreprocessorFile`. + The latter is used internally by ESMValCore to store workflow and provenance information, and this option should typically be ignored. - This function was designed to work on (max) four-dimensional data: time, - vertical axis, two horizontal axes. Apart from the time coordinate, cubes - must have consistent shapes. There are two options to combine time - coordinates of different lengths, see the `span` argument. + Apart from the time coordinate, cubes must have consistent shapes. There + are two options to combine time coordinates of different lengths, see the + ``span`` argument. + + Uses the statistical operators in :py:mod:`iris.analysis`, including + ``mean``, ``median``, ``min``, ``max``, and ``std``. Percentiles are also + supported and can be specified like ``pXX.YY`` (for percentile ``XX.YY``; + decimal part optional). + + Notes + ----- + Some of the operators in :py:mod:`iris.analysis` require additional + arguments. Except for percentiles, these operators are currently not + supported. Parameters ---------- products: list Cubes (or products) over which the statistics will be computed. statistics: list - Statistical metrics to be computed. Available options: mean, median, - max, min, std, or pXX.YY (for percentile XX.YY; decimal part optional). + Statistical metrics to be computed, e.g. [``mean``, ``max``]. Choose + from the operators listed in the iris.analysis package. Percentiles can + be specified like ``pXX.YY``. span: str Overlap or full; if overlap, statitstics are computed on common time- span; if full, statistics are computed on full time spans, ignoring diff --git a/tests/sample_data/multimodel_statistics/test_multimodel.py b/tests/sample_data/multimodel_statistics/test_multimodel.py index a62498d672..485328d85b 100644 --- a/tests/sample_data/multimodel_statistics/test_multimodel.py +++ b/tests/sample_data/multimodel_statistics/test_multimodel.py @@ -1,7 +1,7 @@ """Test using sample data for :func:`esmvalcore.preprocessor._multimodel`.""" import pickle -import sys +import platform from itertools import groupby from pathlib import Path @@ -24,7 +24,11 @@ reason='Cannot calculate statistics with single cube in list')), '365_day', 'gregorian', - 'proleptic_gregorian', + pytest.param( + 'proleptic_gregorian', + marks=pytest.mark.xfail( + raises=iris.exceptions.MergeError, + reason='https://github.com/ESMValGroup/ESMValCore/issues/956')), pytest.param( 'julian', marks=pytest.mark.skip( @@ -39,7 +43,30 @@ def assert_array_almost_equal(this, other): if np.ma.isMaskedArray(this) or np.ma.isMaskedArray(other): np.testing.assert_array_equal(this.mask, other.mask) - np.testing.assert_array_almost_equal(this, other) + np.testing.assert_allclose(this, other) + + +def assert_coords_equal(this: list, other: list): + """Assert coords list `this` equals coords list `other`.""" + for this_coord, other_coord in zip(this, other): + np.testing.assert_equal(this_coord.points, other_coord.points) + assert this_coord.var_name == other_coord.var_name + assert this_coord.standard_name == other_coord.standard_name + assert this_coord.units == other_coord.units + + +def assert_metadata_equal(this, other): + """Assert metadata `this` are equal to metadata `other`.""" + assert this.standard_name == other.standard_name + assert this.long_name == other.long_name + assert this.var_name == other.var_name + assert this.units == other.units + + +def fix_metadata(cubes): + """Fix metadata.""" + for cube in cubes: + cube.coord('air_pressure').bounds = None def preprocess_data(cubes, time_slice: dict = None): @@ -67,13 +94,10 @@ def get_cache_key(value): If this doesn't avoid problems with unpickling the cached data, manually clean the pytest cache with the command `pytest --cache-clear`. """ - return ' '.join([ - str(value), - iris.__version__, - np.__version__, - sys.version, - f"rev-{TEST_REVISION}", - ]) + py_version = platform.python_version() + return (f'{value}_iris-{iris.__version__}_' + f'numpy-{np.__version__}_python-{py_version}' + f'rev-{TEST_REVISION}') @pytest.fixture(scope="module") @@ -102,6 +126,8 @@ def timeseries_cubes_month(request): request.config.cache.set(cache_key, pickle.dumps(cubes).decode('latin1')) + fix_metadata(cubes) + return cubes @@ -132,6 +158,8 @@ def timeseries_cubes_day(request): request.config.cache.set(cache_key, pickle.dumps(cubes).decode('latin1')) + fix_metadata(cubes) + def calendar(cube): return cube.coord('time').units.calendar @@ -172,17 +200,10 @@ def multimodel_regression_test(cubes, span, name): filename = Path(__file__).with_name(f'{name}-{span}-{statistic}.nc') if filename.exists(): reference_cube = iris.load_cube(str(filename)) - assert_array_almost_equal(result_cube.data, reference_cube.data) - - # Compare coords - for this_coord, other_coord in zip(result_cube.coords(), - reference_cube.coords()): - assert this_coord == other_coord - # remove Conventions which are added by Iris on save - reference_cube.attributes.pop('Conventions', None) - - assert reference_cube.metadata == result_cube.metadata + assert_array_almost_equal(result_cube.data, reference_cube.data) + assert_metadata_equal(result_cube.metadata, reference_cube.metadata) + assert_coords_equal(result_cube.coords(), reference_cube.coords()) else: # The test will fail if no regression data are available. @@ -190,6 +211,9 @@ def multimodel_regression_test(cubes, span, name): raise RuntimeError(f'Wrote reference data to {filename.absolute()}') +@pytest.mark.xfail( + raises=iris.exceptions.MergeError, + reason='https://github.com/ESMValGroup/ESMValCore/issues/956') @pytest.mark.use_sample_data @pytest.mark.parametrize('span', SPAN_PARAMS) def test_multimodel_regression_month(timeseries_cubes_month, span): @@ -228,8 +252,11 @@ def test_multimodel_no_vertical_dimension(timeseries_cubes_month): @pytest.mark.use_sample_data @pytest.mark.xfail( - 'iris.exceptions.CoordinateNotFoundError', - reason='https://github.com/ESMValGroup/ESMValCore/issues/891') + raises=iris.exceptions.MergeError, + reason='https://github.com/ESMValGroup/ESMValCore/issues/956') +# @pytest.mark.xfail( +# raises=iris.exceptions.CoordinateNotFoundError, +# reason='https://github.com/ESMValGroup/ESMValCore/issues/891') def test_multimodel_no_horizontal_dimension(timeseries_cubes_month): """Test statistic without horizontal dimension using monthly data.""" span = 'full' @@ -252,7 +279,7 @@ def test_multimodel_only_time_dimension(timeseries_cubes_month): @pytest.mark.use_sample_data @pytest.mark.xfail( - 'ValueError', + raises=ValueError, reason='https://github.com/ESMValGroup/ESMValCore/issues/890') def test_multimodel_no_time_dimension(timeseries_cubes_month): """Test statistic without time dimension using monthly data.""" diff --git a/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-full-mean.nc b/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-full-mean.nc index c473d78f49..81e9e60a62 100644 Binary files a/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-full-mean.nc and b/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-full-mean.nc differ diff --git a/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-overlap-mean.nc b/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-overlap-mean.nc index c473d78f49..81e9e60a62 100644 Binary files a/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-overlap-mean.nc and b/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-overlap-mean.nc differ diff --git a/tests/unit/preprocessor/_multimodel/test_multimodel.py b/tests/unit/preprocessor/_multimodel/test_multimodel.py index c63d3bb39d..cca3f52e79 100644 --- a/tests/unit/preprocessor/_multimodel/test_multimodel.py +++ b/tests/unit/preprocessor/_multimodel/test_multimodel.py @@ -2,6 +2,7 @@ from datetime import datetime +import dask.array as da import iris import numpy as np import pytest @@ -56,6 +57,7 @@ def generate_cube_from_dates( fill_val=1, len_data=3, var_name=None, + lazy=False, ): """Generate test cube from list of dates / frequency specification. @@ -80,61 +82,63 @@ def generate_cube_from_dates( if isinstance(dates, str): time = timecoord(dates, calendar, offset=offset, num=len_data) else: + len_data = len(dates) unit = Unit(offset, calendar=calendar) time = iris.coords.DimCoord(unit.date2num(dates), standard_name='time', units=unit) - return Cube((fill_val, ) * len_data, - dim_coords_and_dims=[(time, 0)], - var_name=var_name) + data = np.array((fill_val, ) * len_data, dtype=np.float32) + if lazy: + data = da.from_array(data) -def get_cubes_for_validation_test(frequency): + return Cube(data, dim_coords_and_dims=[(time, 0)], var_name=var_name) + + +def get_cubes_for_validation_test(frequency, lazy=False): """Set up cubes used for testing multimodel statistics.""" # Simple 1d cube with standard time cord - cube1 = generate_cube_from_dates(frequency) + cube1 = generate_cube_from_dates(frequency, lazy=lazy) # Cube with masked data cube2 = cube1.copy() - cube2.data = np.ma.array([5, 5, 5], mask=[True, False, False]) + data2 = np.ma.array([5, 5, 5], mask=[True, False, False], dtype=np.float32) + if lazy: + data2 = da.from_array(data2) + cube2.data = data2 # Cube with deviating time coord cube3 = generate_cube_from_dates(frequency, calendar='360_day', offset='days since 1950-01-01', len_data=2, - fill_val=9) + fill_val=9, + lazy=lazy) return [cube1, cube2, cube3] VALIDATION_DATA_SUCCESS = ( ('full', 'mean', (5, 5, 3)), - pytest.param( - 'full', - 'std', (5.656854249492381, 4, 2.8284271247461903), - marks=pytest.mark.xfail( - raises=AssertionError, - reason='https://github.com/ESMValGroup/ESMValCore/issues/1024')), + ('full', 'std_dev', (5.656854249492381, 4, 2.8284271247461903)), + ('full', 'std', (5.656854249492381, 4, 2.8284271247461903)), ('full', 'min', (1, 1, 1)), ('full', 'max', (9, 9, 5)), ('full', 'median', (5, 5, 3)), ('full', 'p50', (5, 5, 3)), ('full', 'p99.5', (8.96, 8.96, 4.98)), + ('full', 'peak', ([9], [9], [5])), ('overlap', 'mean', (5, 5)), - pytest.param( - 'full', - 'std', (5.656854249492381, 4), - marks=pytest.mark.xfail( - raises=AssertionError, - reason='https://github.com/ESMValGroup/ESMValCore/issues/1024')), + ('overlap', 'std_dev', (5.656854249492381, 4)), + ('overlap', 'std', (5.656854249492381, 4)), ('overlap', 'min', (1, 1)), ('overlap', 'max', (9, 9)), ('overlap', 'median', (5, 5)), ('overlap', 'p50', (5, 5)), ('overlap', 'p99.5', (8.96, 8.96)), + ('overlap', 'peak', ([9], [9])), # test multiple statistics ('overlap', ('min', 'max'), ((1, 1), (9, 9))), ('full', ('min', 'max'), ((1, 1, 1), (9, 9, 5))), @@ -144,14 +148,7 @@ def get_cubes_for_validation_test(frequency): @pytest.mark.parametrize('frequency', FREQUENCY_OPTIONS) @pytest.mark.parametrize('span, statistics, expected', VALIDATION_DATA_SUCCESS) def test_multimodel_statistics(frequency, span, statistics, expected): - """High level test for multicube statistics function. - - - Should work for multiple data frequencies - - Should be able to deal with multiple statistics - - Should work for both span arguments - - Should deal correctly with different mask options - - Return type should be a dict with all requested statistics as keys - """ + """High level test for multicube statistics function.""" cubes = get_cubes_for_validation_test(frequency) if isinstance(statistics, str): @@ -165,10 +162,85 @@ def test_multimodel_statistics(frequency, span, statistics, expected): for i, statistic in enumerate(statistics): result_cube = result[statistic] + # make sure that temporary coord has been removed + with pytest.raises(iris.exceptions.CoordinateNotFoundError): + result_cube.coord('multi-model') + # test that real data in => real data out + assert result_cube.has_lazy_data() is False expected_data = np.ma.array(expected[i], mask=False) assert_array_allclose(result_cube.data, expected_data) +@pytest.mark.xfail(reason='Lazy data not (yet) supported.') +@pytest.mark.parametrize('span', SPAN_OPTIONS) +def test_lazy_data_consistent_times(span): + """Test laziness of multimodel statistics with consistent time axis.""" + cubes = ( + generate_cube_from_dates('monthly', fill_val=1, lazy=True), + generate_cube_from_dates('monthly', fill_val=3, lazy=True), + generate_cube_from_dates('monthly', fill_val=6, lazy=True), + ) + + for cube in cubes: + assert cube.has_lazy_data() + + statistic = 'sum' + statistics = (statistic, ) + + result = mm._multicube_statistics(cubes, span=span, statistics=statistics) + + result_cube = result[statistic] + assert result_cube.has_lazy_data() + + +@pytest.mark.xfail(reason='Lazy data not (yet) supported.') +@pytest.mark.parametrize('span', SPAN_OPTIONS) +def test_lazy_data_inconsistent_times(span): + """Test laziness of multimodel statistics with inconsistent time axis. + + This hits `_align`, which adds additional computations which must be + lazy. + """ + + cubes = ( + generate_cube_from_dates( + [datetime(1850, i, 15, 0, 0, 0) for i in range(1, 10)], lazy=True), + generate_cube_from_dates( + [datetime(1850, i, 15, 0, 0, 0) for i in range(3, 8)], lazy=True), + generate_cube_from_dates( + [datetime(1850, i, 15, 0, 0, 0) for i in range(2, 9)], lazy=True), + ) + + for cube in cubes: + assert cube.has_lazy_data() + + statistic = 'sum' + statistics = (statistic, ) + + result = mm._multicube_statistics(cubes, span=span, statistics=statistics) + + result_cube = result[statistic] + assert result_cube.has_lazy_data() + + +VALIDATION_DATA_FAIL = ( + ('percentile', ValueError), + ('wpercentile', ValueError), + ('count', TypeError), + ('proportion', TypeError), +) + + +@pytest.mark.parametrize('statistic, error', VALIDATION_DATA_FAIL) +def test_unsupported_statistics_fail(statistic, error): + """Check that unsupported statistics raise an exception.""" + cubes = get_cubes_for_validation_test('monthly') + span = 'overlap' + statistics = (statistic, ) + with pytest.raises(error): + _ = multi_model_statistics(cubes, span, statistics) + + @pytest.mark.parametrize('calendar1, calendar2, expected', ( ('360_day', '360_day', '360_day'), ('365_day', '365_day', '365_day'), @@ -193,6 +265,91 @@ def test_get_consistent_time_unit(calendar1, calendar2, expected): assert result.calendar == expected +@pytest.mark.parametrize('span', SPAN_OPTIONS) +def test_align(span): + """Test _align function.""" + + # TODO --> check that if a cube is extended, + # the extended points are masked (not NaN!) + + len_data = 3 + + cubes = [] + + for calendar in CALENDAR_OPTIONS: + cube = generate_cube_from_dates('monthly', + calendar=calendar, + len_data=3) + cubes.append(cube) + + result_cubes = mm._align(cubes, span) + + calendars = set(cube.coord('time').units.calendar for cube in result_cubes) + + assert len(calendars) == 1 + assert list(calendars)[0] == 'gregorian' + + shapes = set(cube.shape for cube in result_cubes) + + assert len(shapes) == 1 + assert tuple(shapes)[0] == (len_data, ) + + +@pytest.mark.parametrize('span', SPAN_OPTIONS) +def test_combine_same_shape(span): + """Test _combine with same shape of cubes.""" + len_data = 3 + num_cubes = 5 + cubes = [] + + for i in range(num_cubes): + cube = generate_cube_from_dates('monthly', + '360_day', + fill_val=i, + len_data=len_data) + cubes.append(cube) + + result_cube = mm._combine(cubes) + + dim_coord = result_cube.coord(mm.CONCAT_DIM) + assert dim_coord.var_name == mm.CONCAT_DIM + assert result_cube.shape == (num_cubes, len_data) + + desired = np.linspace((0, ) * len_data, + num_cubes - 1, + num=num_cubes, + dtype=int) + np.testing.assert_equal(result_cube.data, desired) + + +def test_combine_different_shape_fail(): + """Test _combine with inconsistent data.""" + num_cubes = 5 + cubes = [] + + for num in range(1, num_cubes + 1): + cube = generate_cube_from_dates('monthly', '360_day', len_data=num) + cubes.append(cube) + + with pytest.raises(iris.exceptions.MergeError): + _ = mm._combine(cubes) + + +def test_combine_inconsistent_var_names_fail(): + """Test _combine with inconsistent var names.""" + num_cubes = 5 + cubes = [] + + for num in range(num_cubes): + cube = generate_cube_from_dates('monthly', + '360_day', + var_name=f'test_var_{num}') + cubes.append(cube) + + with pytest.raises(iris.exceptions.MergeError): + _ = mm._combine(cubes) + + @pytest.mark.parametrize('span', SPAN_OPTIONS) def test_edge_case_different_time_offsets(span): cubes = ( @@ -219,10 +376,6 @@ def test_edge_case_different_time_offsets(span): desired = np.array((14., 45., 73.)) np.testing.assert_array_equal(time_coord.points, desired) - # input cubes are updated in-place - for cube in cubes: - np.testing.assert_array_equal(cube.coord('time').points, desired) - def generate_cubes_with_non_overlapping_timecoords(): """Generate sample data where time coords do not overlap.""" @@ -296,10 +449,6 @@ def test_edge_case_time_not_in_middle_of_months(span): desired = np.array((14., 45., 73.)) np.testing.assert_array_equal(time_coord.points, desired) - # input cubes are updated in-place - for cube in cubes: - np.testing.assert_array_equal(cube.coord('time').points, desired) - @pytest.mark.parametrize('span', SPAN_OPTIONS) def test_edge_case_sub_daily_data_fail(span): @@ -314,6 +463,19 @@ def test_edge_case_sub_daily_data_fail(span): _ = multi_model_statistics(cubes, span, statistics) +@pytest.mark.parametrize('span', SPAN_OPTIONS) +def test_edge_case_single_cube_fail(span): + """Test that an error is raised when a single cube is passed.""" + cube = generate_cube_from_dates('monthly') + cubes = (cube, ) + + statistic = 'min' + statistics = (statistic, ) + + with pytest.raises(ValueError): + _ = multi_model_statistics(cubes, span, statistics) + + def test_unify_time_coordinates(): """Test set common calendar.""" cube1 = generate_cube_from_dates('monthly', @@ -360,6 +522,7 @@ def test_return_products(): result1 = mm._multiproduct_statistics(products, keep_input_datasets=True, **kwargs) + result2 = mm._multiproduct_statistics(products, keep_input_datasets=False, **kwargs)