diff --git a/esmvalcore/_recipe_checks.py b/esmvalcore/_recipe_checks.py
index 21cff6a4a9..1133a36639 100644
--- a/esmvalcore/_recipe_checks.py
+++ b/esmvalcore/_recipe_checks.py
@@ -9,7 +9,8 @@
 import yamale
 
 from ._data_finder import get_start_end_year
-from .preprocessor import PreprocessingTask, TIME_PREPROCESSORS
+from .preprocessor import TIME_PREPROCESSORS, PreprocessingTask
+from .preprocessor._multimodel import STATISTIC_MAPPING
 
 logger = logging.getLogger(__name__)
 
@@ -191,7 +192,7 @@ def extract_shape(settings):
 
 def valid_multimodel_statistic(statistic):
     """Check that `statistic` is a valid argument for multimodel stats."""
-    valid_names = ["mean", "median", "std", "min", "max"]
+    valid_names = ['std'] + list(STATISTIC_MAPPING.keys())
     valid_patterns = [r"^(p\d{1,2})(\.\d*)?$"]
     if not (statistic in valid_names
             or re.match(r'|'.join(valid_patterns), statistic)):
diff --git a/esmvalcore/preprocessor/_multimodel.py b/esmvalcore/preprocessor/_multimodel.py
index ad010ae097..8469f9177b 100644
--- a/esmvalcore/preprocessor/_multimodel.py
+++ b/esmvalcore/preprocessor/_multimodel.py
@@ -3,184 +3,67 @@
 import logging
 import re
 from datetime import datetime
-from functools import partial, reduce
+from functools import reduce
 
 import cf_units
+import dask.array as da
 import iris
 import numpy as np
-import scipy
+from iris.util import equalise_attributes
 
 logger = logging.getLogger(__name__)
 
+STATISTIC_MAPPING = {
+    'gmean': iris.analysis.GMEAN,  # not lazy in iris
+    'hmean': iris.analysis.HMEAN,  # not lazy in iris
+    'max': iris.analysis.MAX,
+    'median': iris.analysis.MEDIAN,  # not lazy in iris
+    'min': iris.analysis.MIN,
+    'rms': iris.analysis.RMS,
+    'sum': iris.analysis.SUM,
+    'mean': iris.analysis.MEAN,
+    'std_dev': iris.analysis.STD_DEV,
+    'variance': iris.analysis.VARIANCE,
+    # The following require extra kwargs,
+    # atm this is only supported for percentiles via e.g. `pXX`
+    'count': iris.analysis.COUNT,
+    'peak': iris.analysis.PEAK,
+    'percentile': iris.analysis.PERCENTILE,  # not lazy in iris
+    'proportion': iris.analysis.PROPORTION,  # not lazy in iris
+    'wpercentile': iris.analysis.WPERCENTILE,  # not lazy in iris
+}
+
+CONCAT_DIM = 'multi-model'
+
+
+def _resolve_operator(statistic: str):
+    """Find the operator corresponding to the statistic."""
+    statistic = statistic.lower()
+    kwargs = {}
+
+    # special cases
+    if statistic == 'std':
+        logger.warning(
+            "Changing statistics from specified `std` to `std_dev`, "
+            "since multimodel statistics is now using the iris.analysis module"
+            ", which also uses `std_dev`. Please consider replacing 'std' "
+            " with 'std_dev' in your recipe or code.")
+        statistic = 'std_dev'
+
+    elif re.match(r"^(p\d{1,2})(\.\d*)?$", statistic):
+        # percentiles between p0 and p99.99999...
+        percentile = float(statistic[1:])
+        kwargs['percent'] = percentile
+        statistic = 'percentile'
 
-def _plev_fix(dataset, pl_idx):
-    """Extract valid plev data.
-
-    This function takes care of situations in which certain plevs are
-    completely masked due to unavailable interpolation boundaries.
-    """
-    if np.ma.is_masked(dataset):
-        # keep only the valid plevs
-        if not np.all(dataset.mask[pl_idx]):
-            statj = np.ma.array(dataset[pl_idx], mask=dataset.mask[pl_idx])
-        else:
-            logger.debug('All vals in plev are masked, ignoring.')
-            statj = None
-    else:
-        mask = np.zeros_like(dataset[pl_idx], bool)
-        statj = np.ma.array(dataset[pl_idx], mask=mask)
-
-    return statj
-
-
-def _quantile(data, axis, quantile):
-    """Calculate quantile.
+    try:
+        operator = STATISTIC_MAPPING[statistic]
+    except KeyError as err:
+        raise ValueError(
+            f'Statistic `{statistic}` not supported by multicube statistics. '
+            f'Must be one of {tuple(STATISTIC_MAPPING.keys())}.') from err
 
-    Workaround for calling scipy's mquantiles with arrays of >2
-    dimensions Similar to iris' _percentiles function, see their
-    discussion: https://github.com/SciTools/iris/pull/625
-    """
-    # Ensure that the target axis is the last dimension.
-    data = np.rollaxis(data, axis, start=data.ndim)
-    shape = data.shape[:-1]
-    # Flatten any leading dimensions.
-    if shape:
-        data = data.reshape([np.prod(shape), data.shape[-1]])
-    # Perform the quantile calculation.
-    result = scipy.stats.mstats.mquantiles(data,
-                                           quantile,
-                                           axis=-1,
-                                           alphap=1,
-                                           betap=1)
-    # Ensure to unflatten any leading dimensions.
-    if shape:
-        result = result.reshape(shape)
-    # Check whether to reduce to a scalar result
-    if result.shape == (1, ):
-        result = result[0]
-
-    return result
-
-
-def _compute_statistic(data, statistic_name):
-    """Compute multimodel statistic."""
-    data = np.ma.array(data)
-    statistic = data[0]
-
-    if statistic_name == 'median':
-        statistic_function = np.ma.median
-    elif statistic_name == 'mean':
-        statistic_function = np.ma.mean
-    elif statistic_name == 'std':
-        statistic_function = np.ma.std
-    elif statistic_name == 'max':
-        statistic_function = np.ma.max
-    elif statistic_name == 'min':
-        statistic_function = np.ma.min
-    elif re.match(r"^(p\d{1,2})(\.\d*)?$", statistic_name):
-        # percentiles between p0 and p99.99999...
-        quantile = float(statistic_name[1:]) / 100
-        statistic_function = partial(_quantile, quantile=quantile)
-    else:
-        raise ValueError(f'No such statistic: `{statistic_name}`')
-
-    # no plevs
-    if len(data[0].shape) < 3:
-        # get all NOT fully masked data - u_data
-        # data is per time point
-        # so we can safely NOT compute stats for single points
-        if data.ndim == 1:
-            u_datas = data
-        else:
-            u_datas = [d for d in data if not np.all(d.mask)]
-        if len(u_datas) > 1:
-            statistic = statistic_function(data, axis=0)
-        else:
-            statistic.mask = True
-        return statistic
-
-    # plevs
-    for j in range(statistic.shape[0]):
-        plev_check = []
-        for cdata in data:
-            fixed_data = _plev_fix(cdata, j)
-            if fixed_data is not None:
-                plev_check.append(fixed_data)
-
-        # check for nr datasets
-        if len(plev_check) > 1:
-            plev_check = np.ma.array(plev_check)
-            statistic[j] = statistic_function(plev_check, axis=0)
-        else:
-            statistic.mask[j] = True
-
-    return statistic
-
-
-def _put_in_cube(template_cube, cube_data, statistic, t_axis):
-    """Quick cube building and saving."""
-    tunits = template_cube.coord('time').units
-    times = iris.coords.DimCoord(t_axis,
-                                 standard_name='time',
-                                 units=tunits,
-                                 var_name='time',
-                                 long_name='time')
-    times.bounds = None
-    times.guess_bounds()
-
-    coord_names = [c.long_name for c in template_cube.coords()]
-    coord_names.extend([c.standard_name for c in template_cube.coords()])
-    if 'latitude' in coord_names:
-        lats = template_cube.coord('latitude')
-    else:
-        lats = None
-    if 'longitude' in coord_names:
-        lons = template_cube.coord('longitude')
-    else:
-        lons = None
-
-    # no plevs
-    if len(template_cube.shape) == 3:
-        cspec = [(times, 0), (lats, 1), (lons, 2)]
-    # plevs
-    elif len(template_cube.shape) == 4:
-        if 'air_pressure' in coord_names:
-            plev = template_cube.coord('air_pressure')
-        elif 'altitude' in coord_names:
-            plev = template_cube.coord('altitude')
-        else:
-            raise ValueError(f"4D cube with dimensions {coord_names} not "
-                             f"supported at the moment")
-        cspec = [(times, 0), (plev, 1), (lats, 2), (lons, 3)]
-    elif len(template_cube.shape) == 1:
-        cspec = [
-            (times, 0),
-        ]
-    elif len(template_cube.shape) == 2:
-        # If you're going to hardwire air_pressure into this,
-        # might as well have depth here too.
-        plev = template_cube.coord('depth')
-        cspec = [
-            (times, 0),
-            (plev, 1),
-        ]
-
-    # correct dspec if necessary
-    fixed_dspec = np.ma.fix_invalid(cube_data, copy=False, fill_value=1e+20)
-    # put in cube
-    stats_cube = iris.cube.Cube(fixed_dspec,
-                                dim_coords_and_dims=cspec,
-                                long_name=statistic)
-    coord_names = [coord.name() for coord in template_cube.coords()]
-    if 'air_pressure' in coord_names:
-        if len(template_cube.shape) == 3:
-            stats_cube.add_aux_coord(template_cube.coord('air_pressure'))
-
-    stats_cube.var_name = template_cube.var_name
-    stats_cube.long_name = template_cube.long_name
-    stats_cube.standard_name = template_cube.standard_name
-    stats_cube.units = template_cube.units
-    return stats_cube
+    return operator, kwargs
 
 
 def _get_consistent_time_unit(cubes):
@@ -218,7 +101,6 @@ def _unify_time_coordinates(cubes):
         if 0 not in np.diff(years):
             # yearly data
             dates = [datetime(year, 7, 1, 0, 0, 0) for year in years]
-
         elif 0 not in np.diff(months):
             # monthly data
             dates = [
@@ -248,115 +130,198 @@ def _unify_time_coordinates(cubes):
         cube.coord('time').guess_bounds()
 
 
-def _get_time_slice(cubes, time):
-    """Fill time slice array with cubes' data if time in cube, else mask."""
-    time_slice = []
-    for cube in cubes:
-        cube_time = cube.coord('time').points
-        if time in cube_time:
-            idx = int(np.argwhere(cube_time == time))
-            subset = cube.data[idx]
-        else:
-            subset = np.ma.empty(list(cube.shape[1:]))
-            subset.mask = True
-        time_slice.append(subset)
-    return time_slice
+def _time_coords_are_aligned(cubes):
+    """Return `True` if time coords are aligned."""
+    first_time_array = cubes[0].coord('time').points
 
+    for cube in cubes[1:]:
+        other_time_array = cube.coord('time').points
+        if not np.array_equal(first_time_array, other_time_array):
+            return False
 
-def _assemble_data(cubes, statistic, span='overlap'):
-    """Get statistical data in iris cubes."""
-    # New time array representing the union or intersection of all cubes
-    time_spans = [cube.coord('time').points for cube in cubes]
-    if span == 'overlap':
-        new_times = reduce(np.intersect1d, time_spans)
-    elif span == 'full':
-        new_times = reduce(np.union1d, time_spans)
-    n_times = len(new_times)
+    return True
 
-    # Target array to populate with computed statistics
-    new_shape = [n_times] + list(cubes[0].shape[1:])
-    stats_data = np.ma.zeros(new_shape, dtype=np.dtype('float32'))
 
-    # Realize all cubes at once instead of separately for each time slice
-    _ = [cube.data for cube in cubes]
+def _subset(cube, time_points):
+    """Subset cube to given time range."""
+    begin = cube.coord('time').units.num2date(time_points[0])
+    end = cube.coord('time').units.num2date(time_points[-1])
+    constraint = iris.Constraint(time=lambda cell: begin <= cell.point <= end)
+    try:
+        return cube.extract(constraint)
+    except Exception as excinfo:
+        raise ValueError(
+            "Tried to align cubes in multi-model statistics, but failed for"
+            f" cube {cube} and time points {time_points}. Encountered the "
+            f"following exception: {excinfo}")
 
-    # Make time slices and compute stats
-    for i, time in enumerate(new_times):
-        time_data = _get_time_slice(cubes, time)
-        stats_data[i] = _compute_statistic(time_data, statistic)
 
-    template = cubes[0]
-    stats_cube = _put_in_cube(template, stats_data, statistic, new_times)
-    return stats_cube
+def _extend(cube, time_points):
+    """Extend cube to a specified time range.
 
+    If time points are missing before the start/after the end of the
+    time range, cubes for each missing time pointwith masked data will
+    be added to pad the time range to match `time_points`. This method
+    supports lazy operation.
+    """
+    cube.coord('time').bounds = None
+    cube_points = cube.coord('time').points
 
-def _multicube_statistics(cubes, statistics, span):
-    """Compute statistics over multiple cubes.
+    begin = cube_points[0]
+    end = cube_points[-1]
 
-    Can be used e.g. for ensemble or multi-model statistics.
+    pad_begin = time_points[time_points < begin]
+    pad_end = time_points[time_points > end]
 
-    This function was designed to work on (max) four-dimensional data:
-    time, vertical axis, two horizontal axes.
+    if (len(pad_begin)) == 0 and (len(pad_end) == 0):
+        return cube
 
-    Apart from the time coordinate, cubes must have consistent shapes. There
-    are two options to combine time coordinates of different lengths, see
-    the `span` argument.
+    template_cube = cube[:1].copy()
+    template_cube.data = np.ma.array(da.zeros_like(template_cube.data),
+                                     mask=True,
+                                     dtype=template_cube.data.dtype)
 
-    Parameters
-    ----------
-    cubes: list
-        list of cubes over which the statistics will be computed;
-    statistics: list
-        statistical metrics to be computed. Available options: mean, median,
-        max, min, std, or pXX.YY (for percentile XX.YY; decimal part optional).
-    span: str
-        overlap or full; if overlap, statitsticss are computed on common time-
-        span; if full, statistics are computed on full time spans, ignoring
-        missing data.
+    cube_list = []
 
-    Returns
-    -------
-    dict
-        dictionary of statistics cubes with statistics' names as keys.
+    for time_point in pad_begin:
+        new_slice = template_cube.copy()
+        new_slice.coord('time').points = float(time_point)
+        cube_list.append(new_slice)
 
-    Raises
-    ------
-    ValueError
-        If span is neither overlap nor full.
-    """
-    if len(cubes) < 2:
-        logger.info('Found only 1 cube; no statistics computed for %r',
-                    list(cubes)[0])
-        return {statistic: cubes[0] for statistic in statistics}
+    cube_list.append(cube)
+
+    for time_point in pad_end:
+        new_slice = template_cube.copy()
+        new_slice.coord('time').points = float(time_point)
+        cube_list.append(new_slice)
 
-    logger.debug('Multicube statistics: computing: %s', statistics)
+    cube_list = iris.cube.CubeList(cube_list)
 
-    # Reset time coordinates and make cubes share the same calendar
+    try:
+        new_cube = cube_list.concatenate_cube()
+    except Exception as excinfo:
+        raise ValueError(
+            "Tried to align cubes in multi-model statistics, but failed for"
+            f" cube {cube} and time points {time_points}. Encountered the "
+            f"following exception: {excinfo}")
+
+    return new_cube
+
+
+def _align(cubes, span):
+    """Expand or subset cubes so they share a common time span."""
     _unify_time_coordinates(cubes)
 
-    # Check whether input is valid
+    if _time_coords_are_aligned(cubes):
+        return cubes
+
+    all_time_arrays = [cube.coord('time').points for cube in cubes]
+
     if span == 'overlap':
-        # check if we have any time overlap
-        times = [cube.coord('time').points for cube in cubes]
-        overlap = reduce(np.intersect1d, times)
-        if len(overlap) <= 1:
-            logger.info("Time overlap between cubes is none or a single point."
-                        "check datasets: will not compute statistics.")
-            return cubes
-        logger.debug("Using common time overlap between "
-                     "datasets to compute statistics.")
+        common_time_points = reduce(np.intersect1d, all_time_arrays)
+        new_cubes = [_subset(cube, common_time_points) for cube in cubes]
     elif span == 'full':
-        logger.debug("Using full time spans to compute statistics.")
+        all_time_points = reduce(np.union1d, all_time_arrays)
+        new_cubes = [_extend(cube, all_time_points) for cube in cubes]
     else:
+        raise ValueError(f"Invalid argument for span: {span!r}"
+                         "Must be one of 'overlap', 'full'.")
+
+    for cube in new_cubes:
+        # Make sure bounds exist and are consistent
+        cube.coord('time').bounds = None
+        cube.coord('time').guess_bounds()
+
+    return new_cubes
+
+
+def _combine(cubes):
+    """Merge iris cubes into a single big cube with new dimension.
+
+    This assumes that all input cubes have the same shape.
+    """
+    equalise_attributes(cubes)  # in-place
+
+    for i, cube in enumerate(cubes):
+        concat_dim = iris.coords.AuxCoord(i, var_name=CONCAT_DIM)
+
+        cube.add_aux_coord(concat_dim)
+
+        # Clear some metadata that can cause merge to fail
+        # https://scitools-iris.readthedocs.io/en/stable/userguide/
+        #    merge_and_concat.html#common-issues-with-merge-and-concatenate
+
+        cube.cell_methods = None
+
+        for coord in cube.coords():
+            coord.long_name = None
+            coord.attributes = None
+
+    cubes = iris.cube.CubeList(cubes)
+
+    merged_cube = cubes.merge_cube()
+
+    return merged_cube
+
+
+def _compute_eager(cubes: list, *, operator: iris.analysis.Aggregator,
+                   **kwargs):
+    """Compute statistics one slice at a time."""
+    _ = [cube.data for cube in cubes]  # make sure the cubes' data are realized
+
+    result_slices = []
+    for i in range(cubes[0].shape[0]):
+        single_model_slices = [cube[i] for cube in cubes]
+        combined_slice = _combine(single_model_slices)
+        collapsed_slice = combined_slice.collapsed(CONCAT_DIM, operator,
+                                                   **kwargs)
+
+        # some iris aggregators modify dtype, see e.g.
+        # https://numpy.org/doc/stable/reference/generated/numpy.ma.average.html
+        collapsed_slice.data = collapsed_slice.data.astype(np.float32)
+
+        result_slices.append(collapsed_slice)
+
+    try:
+        result_cube = iris.cube.CubeList(result_slices).merge_cube()
+    except Exception as excinfo:
         raise ValueError(
-            "Unexpected value for span {}, choose from 'overlap', 'full'".
-            format(span))
+            "Multi-model statistics failed to concatenate results into a"
+            f" single array. This happened for operator {operator}"
+            f" with computed statistics {result_slices}."
+            "This can happen e.g. if the calculation results in inconsistent"
+            f" dtypes. Encountered the following exception: {excinfo}")
+
+    result_cube.data = np.ma.array(result_cube.data)
+    result_cube.remove_coord(CONCAT_DIM)
+    return result_cube
+
+
+def _multicube_statistics(cubes, statistics, span):
+    """Compute statistics over multiple cubes.
+
+    Can be used e.g. for ensemble or multi-model statistics.
+
+    Cubes are merged and subsequently collapsed along a new auxiliary
+    coordinate. Inconsistent attributes will be removed.
+    """
+    if len(cubes) == 1:
+        raise ValueError('Cannot perform multicube statistics '
+                         'for a single cube.')
+
+    copied_cubes = [cube.copy() for cube in cubes]  # avoid modifying inputs
+    aligned_cubes = _align(copied_cubes, span=span)
 
-    # Compute statistics
     statistics_cubes = {}
     for statistic in statistics:
-        statistic_cube = _assemble_data(cubes, statistic, span)
-        statistics_cubes[statistic] = statistic_cube
+        logger.debug('Multicube statistics: computing: %s', statistic)
+        operator, kwargs = _resolve_operator(statistic)
+
+        result_cube = _compute_eager(aligned_cubes,
+                                     operator=operator,
+                                     **kwargs)
+
+        statistics_cubes[statistic] = result_cube
 
     return statistics_cubes
 
@@ -399,23 +364,36 @@ def multi_model_statistics(products,
                            keep_input_datasets=True):
     """Compute multi-model statistics.
 
-    This function computes multi-model statistics on cubes or products.
-    Products (or: preprocessorfiles) are used internally by ESMValCore to store
+    This function computes multi-model statistics on a list of ``products``,
+    which can be instances of :py:class:`~iris.cube.Cube` or
+    :py:class:`~esmvalcore.preprocessor.PreprocessorFile`.
+    The latter is used internally by ESMValCore to store
     workflow and provenance information, and this option should typically be
     ignored.
 
-    This function was designed to work on (max) four-dimensional data: time,
-    vertical axis, two horizontal axes. Apart from the time coordinate, cubes
-    must have consistent shapes. There are two options to combine time
-    coordinates of different lengths, see the `span` argument.
+    Apart from the time coordinate, cubes must have consistent shapes. There
+    are two options to combine time coordinates of different lengths, see the
+    ``span`` argument.
+
+    Uses the statistical operators in :py:mod:`iris.analysis`, including
+    ``mean``, ``median``, ``min``, ``max``, and ``std``. Percentiles are also
+    supported and can be specified like ``pXX.YY`` (for percentile ``XX.YY``;
+    decimal part optional).
+
+    Notes
+    -----
+    Some of the operators in :py:mod:`iris.analysis` require additional
+    arguments. Except for percentiles, these operators are currently not
+    supported.
 
     Parameters
     ----------
     products: list
         Cubes (or products) over which the statistics will be computed.
     statistics: list
-        Statistical metrics to be computed. Available options: mean, median,
-        max, min, std, or pXX.YY (for percentile XX.YY; decimal part optional).
+        Statistical metrics to be computed, e.g. [``mean``, ``max``]. Choose
+        from the operators listed in the iris.analysis package. Percentiles can
+        be specified like ``pXX.YY``.
     span: str
         Overlap or full; if overlap, statitstics are computed on common time-
         span; if full, statistics are computed on full time spans, ignoring
diff --git a/tests/sample_data/multimodel_statistics/test_multimodel.py b/tests/sample_data/multimodel_statistics/test_multimodel.py
index a62498d672..485328d85b 100644
--- a/tests/sample_data/multimodel_statistics/test_multimodel.py
+++ b/tests/sample_data/multimodel_statistics/test_multimodel.py
@@ -1,7 +1,7 @@
 """Test using sample data for :func:`esmvalcore.preprocessor._multimodel`."""
 
 import pickle
-import sys
+import platform
 from itertools import groupby
 from pathlib import Path
 
@@ -24,7 +24,11 @@
             reason='Cannot calculate statistics with single cube in list')),
     '365_day',
     'gregorian',
-    'proleptic_gregorian',
+    pytest.param(
+        'proleptic_gregorian',
+        marks=pytest.mark.xfail(
+            raises=iris.exceptions.MergeError,
+            reason='https://github.com/ESMValGroup/ESMValCore/issues/956')),
     pytest.param(
         'julian',
         marks=pytest.mark.skip(
@@ -39,7 +43,30 @@ def assert_array_almost_equal(this, other):
     if np.ma.isMaskedArray(this) or np.ma.isMaskedArray(other):
         np.testing.assert_array_equal(this.mask, other.mask)
 
-    np.testing.assert_array_almost_equal(this, other)
+    np.testing.assert_allclose(this, other)
+
+
+def assert_coords_equal(this: list, other: list):
+    """Assert coords list `this` equals coords list `other`."""
+    for this_coord, other_coord in zip(this, other):
+        np.testing.assert_equal(this_coord.points, other_coord.points)
+        assert this_coord.var_name == other_coord.var_name
+        assert this_coord.standard_name == other_coord.standard_name
+        assert this_coord.units == other_coord.units
+
+
+def assert_metadata_equal(this, other):
+    """Assert metadata `this` are equal to metadata `other`."""
+    assert this.standard_name == other.standard_name
+    assert this.long_name == other.long_name
+    assert this.var_name == other.var_name
+    assert this.units == other.units
+
+
+def fix_metadata(cubes):
+    """Fix metadata."""
+    for cube in cubes:
+        cube.coord('air_pressure').bounds = None
 
 
 def preprocess_data(cubes, time_slice: dict = None):
@@ -67,13 +94,10 @@ def get_cache_key(value):
     If this doesn't avoid problems with unpickling the cached data,
     manually clean the pytest cache with the command `pytest --cache-clear`.
     """
-    return ' '.join([
-        str(value),
-        iris.__version__,
-        np.__version__,
-        sys.version,
-        f"rev-{TEST_REVISION}",
-    ])
+    py_version = platform.python_version()
+    return (f'{value}_iris-{iris.__version__}_'
+            f'numpy-{np.__version__}_python-{py_version}'
+            f'rev-{TEST_REVISION}')
 
 
 @pytest.fixture(scope="module")
@@ -102,6 +126,8 @@ def timeseries_cubes_month(request):
         request.config.cache.set(cache_key,
                                  pickle.dumps(cubes).decode('latin1'))
 
+    fix_metadata(cubes)
+
     return cubes
 
 
@@ -132,6 +158,8 @@ def timeseries_cubes_day(request):
         request.config.cache.set(cache_key,
                                  pickle.dumps(cubes).decode('latin1'))
 
+    fix_metadata(cubes)
+
     def calendar(cube):
         return cube.coord('time').units.calendar
 
@@ -172,17 +200,10 @@ def multimodel_regression_test(cubes, span, name):
     filename = Path(__file__).with_name(f'{name}-{span}-{statistic}.nc')
     if filename.exists():
         reference_cube = iris.load_cube(str(filename))
-        assert_array_almost_equal(result_cube.data, reference_cube.data)
-
-        # Compare coords
-        for this_coord, other_coord in zip(result_cube.coords(),
-                                           reference_cube.coords()):
-            assert this_coord == other_coord
 
-        # remove Conventions which are added by Iris on save
-        reference_cube.attributes.pop('Conventions', None)
-
-        assert reference_cube.metadata == result_cube.metadata
+        assert_array_almost_equal(result_cube.data, reference_cube.data)
+        assert_metadata_equal(result_cube.metadata, reference_cube.metadata)
+        assert_coords_equal(result_cube.coords(), reference_cube.coords())
 
     else:
         # The test will fail if no regression data are available.
@@ -190,6 +211,9 @@ def multimodel_regression_test(cubes, span, name):
         raise RuntimeError(f'Wrote reference data to {filename.absolute()}')
 
 
+@pytest.mark.xfail(
+    raises=iris.exceptions.MergeError,
+    reason='https://github.com/ESMValGroup/ESMValCore/issues/956')
 @pytest.mark.use_sample_data
 @pytest.mark.parametrize('span', SPAN_PARAMS)
 def test_multimodel_regression_month(timeseries_cubes_month, span):
@@ -228,8 +252,11 @@ def test_multimodel_no_vertical_dimension(timeseries_cubes_month):
 
 @pytest.mark.use_sample_data
 @pytest.mark.xfail(
-    'iris.exceptions.CoordinateNotFoundError',
-    reason='https://github.com/ESMValGroup/ESMValCore/issues/891')
+    raises=iris.exceptions.MergeError,
+    reason='https://github.com/ESMValGroup/ESMValCore/issues/956')
+# @pytest.mark.xfail(
+#     raises=iris.exceptions.CoordinateNotFoundError,
+#     reason='https://github.com/ESMValGroup/ESMValCore/issues/891')
 def test_multimodel_no_horizontal_dimension(timeseries_cubes_month):
     """Test statistic without horizontal dimension using monthly data."""
     span = 'full'
@@ -252,7 +279,7 @@ def test_multimodel_only_time_dimension(timeseries_cubes_month):
 
 @pytest.mark.use_sample_data
 @pytest.mark.xfail(
-    'ValueError',
+    raises=ValueError,
     reason='https://github.com/ESMValGroup/ESMValCore/issues/890')
 def test_multimodel_no_time_dimension(timeseries_cubes_month):
     """Test statistic without time dimension using monthly data."""
diff --git a/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-full-mean.nc b/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-full-mean.nc
index c473d78f49..81e9e60a62 100644
Binary files a/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-full-mean.nc and b/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-full-mean.nc differ
diff --git a/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-overlap-mean.nc b/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-overlap-mean.nc
index c473d78f49..81e9e60a62 100644
Binary files a/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-overlap-mean.nc and b/tests/sample_data/multimodel_statistics/timeseries_daily_365_day-overlap-mean.nc differ
diff --git a/tests/unit/preprocessor/_multimodel/test_multimodel.py b/tests/unit/preprocessor/_multimodel/test_multimodel.py
index c63d3bb39d..cca3f52e79 100644
--- a/tests/unit/preprocessor/_multimodel/test_multimodel.py
+++ b/tests/unit/preprocessor/_multimodel/test_multimodel.py
@@ -2,6 +2,7 @@
 
 from datetime import datetime
 
+import dask.array as da
 import iris
 import numpy as np
 import pytest
@@ -56,6 +57,7 @@ def generate_cube_from_dates(
     fill_val=1,
     len_data=3,
     var_name=None,
+    lazy=False,
 ):
     """Generate test cube from list of dates / frequency specification.
 
@@ -80,61 +82,63 @@ def generate_cube_from_dates(
     if isinstance(dates, str):
         time = timecoord(dates, calendar, offset=offset, num=len_data)
     else:
+        len_data = len(dates)
         unit = Unit(offset, calendar=calendar)
         time = iris.coords.DimCoord(unit.date2num(dates),
                                     standard_name='time',
                                     units=unit)
 
-    return Cube((fill_val, ) * len_data,
-                dim_coords_and_dims=[(time, 0)],
-                var_name=var_name)
+    data = np.array((fill_val, ) * len_data, dtype=np.float32)
 
+    if lazy:
+        data = da.from_array(data)
 
-def get_cubes_for_validation_test(frequency):
+    return Cube(data, dim_coords_and_dims=[(time, 0)], var_name=var_name)
+
+
+def get_cubes_for_validation_test(frequency, lazy=False):
     """Set up cubes used for testing multimodel statistics."""
 
     # Simple 1d cube with standard time cord
-    cube1 = generate_cube_from_dates(frequency)
+    cube1 = generate_cube_from_dates(frequency, lazy=lazy)
 
     # Cube with masked data
     cube2 = cube1.copy()
-    cube2.data = np.ma.array([5, 5, 5], mask=[True, False, False])
+    data2 = np.ma.array([5, 5, 5], mask=[True, False, False], dtype=np.float32)
+    if lazy:
+        data2 = da.from_array(data2)
+    cube2.data = data2
 
     # Cube with deviating time coord
     cube3 = generate_cube_from_dates(frequency,
                                      calendar='360_day',
                                      offset='days since 1950-01-01',
                                      len_data=2,
-                                     fill_val=9)
+                                     fill_val=9,
+                                     lazy=lazy)
 
     return [cube1, cube2, cube3]
 
 
 VALIDATION_DATA_SUCCESS = (
     ('full', 'mean', (5, 5, 3)),
-    pytest.param(
-        'full',
-        'std', (5.656854249492381, 4, 2.8284271247461903),
-        marks=pytest.mark.xfail(
-            raises=AssertionError,
-            reason='https://github.com/ESMValGroup/ESMValCore/issues/1024')),
+    ('full', 'std_dev', (5.656854249492381, 4, 2.8284271247461903)),
+    ('full', 'std', (5.656854249492381, 4, 2.8284271247461903)),
     ('full', 'min', (1, 1, 1)),
     ('full', 'max', (9, 9, 5)),
     ('full', 'median', (5, 5, 3)),
     ('full', 'p50', (5, 5, 3)),
     ('full', 'p99.5', (8.96, 8.96, 4.98)),
+    ('full', 'peak', ([9], [9], [5])),
     ('overlap', 'mean', (5, 5)),
-    pytest.param(
-        'full',
-        'std', (5.656854249492381, 4),
-        marks=pytest.mark.xfail(
-            raises=AssertionError,
-            reason='https://github.com/ESMValGroup/ESMValCore/issues/1024')),
+    ('overlap', 'std_dev', (5.656854249492381, 4)),
+    ('overlap', 'std', (5.656854249492381, 4)),
     ('overlap', 'min', (1, 1)),
     ('overlap', 'max', (9, 9)),
     ('overlap', 'median', (5, 5)),
     ('overlap', 'p50', (5, 5)),
     ('overlap', 'p99.5', (8.96, 8.96)),
+    ('overlap', 'peak', ([9], [9])),
     # test multiple statistics
     ('overlap', ('min', 'max'), ((1, 1), (9, 9))),
     ('full', ('min', 'max'), ((1, 1, 1), (9, 9, 5))),
@@ -144,14 +148,7 @@ def get_cubes_for_validation_test(frequency):
 @pytest.mark.parametrize('frequency', FREQUENCY_OPTIONS)
 @pytest.mark.parametrize('span, statistics, expected', VALIDATION_DATA_SUCCESS)
 def test_multimodel_statistics(frequency, span, statistics, expected):
-    """High level test for multicube statistics function.
-
-    - Should work for multiple data frequencies
-    - Should be able to deal with multiple statistics
-    - Should work for both span arguments
-    - Should deal correctly with different mask options
-    - Return type should be a dict with all requested statistics as keys
-    """
+    """High level test for multicube statistics function."""
     cubes = get_cubes_for_validation_test(frequency)
 
     if isinstance(statistics, str):
@@ -165,10 +162,85 @@ def test_multimodel_statistics(frequency, span, statistics, expected):
 
     for i, statistic in enumerate(statistics):
         result_cube = result[statistic]
+        # make sure that temporary coord has been removed
+        with pytest.raises(iris.exceptions.CoordinateNotFoundError):
+            result_cube.coord('multi-model')
+        # test that real data in => real data out
+        assert result_cube.has_lazy_data() is False
         expected_data = np.ma.array(expected[i], mask=False)
         assert_array_allclose(result_cube.data, expected_data)
 
 
+@pytest.mark.xfail(reason='Lazy data not (yet) supported.')
+@pytest.mark.parametrize('span', SPAN_OPTIONS)
+def test_lazy_data_consistent_times(span):
+    """Test laziness of multimodel statistics with consistent time axis."""
+    cubes = (
+        generate_cube_from_dates('monthly', fill_val=1, lazy=True),
+        generate_cube_from_dates('monthly', fill_val=3, lazy=True),
+        generate_cube_from_dates('monthly', fill_val=6, lazy=True),
+    )
+
+    for cube in cubes:
+        assert cube.has_lazy_data()
+
+    statistic = 'sum'
+    statistics = (statistic, )
+
+    result = mm._multicube_statistics(cubes, span=span, statistics=statistics)
+
+    result_cube = result[statistic]
+    assert result_cube.has_lazy_data()
+
+
+@pytest.mark.xfail(reason='Lazy data not (yet) supported.')
+@pytest.mark.parametrize('span', SPAN_OPTIONS)
+def test_lazy_data_inconsistent_times(span):
+    """Test laziness of multimodel statistics with inconsistent time axis.
+
+    This hits `_align`, which adds additional computations which must be
+    lazy.
+    """
+
+    cubes = (
+        generate_cube_from_dates(
+            [datetime(1850, i, 15, 0, 0, 0) for i in range(1, 10)], lazy=True),
+        generate_cube_from_dates(
+            [datetime(1850, i, 15, 0, 0, 0) for i in range(3, 8)], lazy=True),
+        generate_cube_from_dates(
+            [datetime(1850, i, 15, 0, 0, 0) for i in range(2, 9)], lazy=True),
+    )
+
+    for cube in cubes:
+        assert cube.has_lazy_data()
+
+    statistic = 'sum'
+    statistics = (statistic, )
+
+    result = mm._multicube_statistics(cubes, span=span, statistics=statistics)
+
+    result_cube = result[statistic]
+    assert result_cube.has_lazy_data()
+
+
+VALIDATION_DATA_FAIL = (
+    ('percentile', ValueError),
+    ('wpercentile', ValueError),
+    ('count', TypeError),
+    ('proportion', TypeError),
+)
+
+
+@pytest.mark.parametrize('statistic, error', VALIDATION_DATA_FAIL)
+def test_unsupported_statistics_fail(statistic, error):
+    """Check that unsupported statistics raise an exception."""
+    cubes = get_cubes_for_validation_test('monthly')
+    span = 'overlap'
+    statistics = (statistic, )
+    with pytest.raises(error):
+        _ = multi_model_statistics(cubes, span, statistics)
+
+
 @pytest.mark.parametrize('calendar1, calendar2, expected', (
     ('360_day', '360_day', '360_day'),
     ('365_day', '365_day', '365_day'),
@@ -193,6 +265,91 @@ def test_get_consistent_time_unit(calendar1, calendar2, expected):
     assert result.calendar == expected
 
 
+@pytest.mark.parametrize('span', SPAN_OPTIONS)
+def test_align(span):
+    """Test _align function."""
+
+    # TODO --> check that if a cube is extended,
+    #          the extended points are masked (not NaN!)
+
+    len_data = 3
+
+    cubes = []
+
+    for calendar in CALENDAR_OPTIONS:
+        cube = generate_cube_from_dates('monthly',
+                                        calendar=calendar,
+                                        len_data=3)
+        cubes.append(cube)
+
+    result_cubes = mm._align(cubes, span)
+
+    calendars = set(cube.coord('time').units.calendar for cube in result_cubes)
+
+    assert len(calendars) == 1
+    assert list(calendars)[0] == 'gregorian'
+
+    shapes = set(cube.shape for cube in result_cubes)
+
+    assert len(shapes) == 1
+    assert tuple(shapes)[0] == (len_data, )
+
+
+@pytest.mark.parametrize('span', SPAN_OPTIONS)
+def test_combine_same_shape(span):
+    """Test _combine with same shape of cubes."""
+    len_data = 3
+    num_cubes = 5
+    cubes = []
+
+    for i in range(num_cubes):
+        cube = generate_cube_from_dates('monthly',
+                                        '360_day',
+                                        fill_val=i,
+                                        len_data=len_data)
+        cubes.append(cube)
+
+    result_cube = mm._combine(cubes)
+
+    dim_coord = result_cube.coord(mm.CONCAT_DIM)
+    assert dim_coord.var_name == mm.CONCAT_DIM
+    assert result_cube.shape == (num_cubes, len_data)
+
+    desired = np.linspace((0, ) * len_data,
+                          num_cubes - 1,
+                          num=num_cubes,
+                          dtype=int)
+    np.testing.assert_equal(result_cube.data, desired)
+
+
+def test_combine_different_shape_fail():
+    """Test _combine with inconsistent data."""
+    num_cubes = 5
+    cubes = []
+
+    for num in range(1, num_cubes + 1):
+        cube = generate_cube_from_dates('monthly', '360_day', len_data=num)
+        cubes.append(cube)
+
+    with pytest.raises(iris.exceptions.MergeError):
+        _ = mm._combine(cubes)
+
+
+def test_combine_inconsistent_var_names_fail():
+    """Test _combine with inconsistent var names."""
+    num_cubes = 5
+    cubes = []
+
+    for num in range(num_cubes):
+        cube = generate_cube_from_dates('monthly',
+                                        '360_day',
+                                        var_name=f'test_var_{num}')
+        cubes.append(cube)
+
+    with pytest.raises(iris.exceptions.MergeError):
+        _ = mm._combine(cubes)
+
+
 @pytest.mark.parametrize('span', SPAN_OPTIONS)
 def test_edge_case_different_time_offsets(span):
     cubes = (
@@ -219,10 +376,6 @@ def test_edge_case_different_time_offsets(span):
     desired = np.array((14., 45., 73.))
     np.testing.assert_array_equal(time_coord.points, desired)
 
-    # input cubes are updated in-place
-    for cube in cubes:
-        np.testing.assert_array_equal(cube.coord('time').points, desired)
-
 
 def generate_cubes_with_non_overlapping_timecoords():
     """Generate sample data where time coords do not overlap."""
@@ -296,10 +449,6 @@ def test_edge_case_time_not_in_middle_of_months(span):
     desired = np.array((14., 45., 73.))
     np.testing.assert_array_equal(time_coord.points, desired)
 
-    # input cubes are updated in-place
-    for cube in cubes:
-        np.testing.assert_array_equal(cube.coord('time').points, desired)
-
 
 @pytest.mark.parametrize('span', SPAN_OPTIONS)
 def test_edge_case_sub_daily_data_fail(span):
@@ -314,6 +463,19 @@ def test_edge_case_sub_daily_data_fail(span):
         _ = multi_model_statistics(cubes, span, statistics)
 
 
+@pytest.mark.parametrize('span', SPAN_OPTIONS)
+def test_edge_case_single_cube_fail(span):
+    """Test that an error is raised when a single cube is passed."""
+    cube = generate_cube_from_dates('monthly')
+    cubes = (cube, )
+
+    statistic = 'min'
+    statistics = (statistic, )
+
+    with pytest.raises(ValueError):
+        _ = multi_model_statistics(cubes, span, statistics)
+
+
 def test_unify_time_coordinates():
     """Test set common calendar."""
     cube1 = generate_cube_from_dates('monthly',
@@ -360,6 +522,7 @@ def test_return_products():
     result1 = mm._multiproduct_statistics(products,
                                           keep_input_datasets=True,
                                           **kwargs)
+
     result2 = mm._multiproduct_statistics(products,
                                           keep_input_datasets=False,
                                           **kwargs)