From 20393cc4a6569080548bf484df93f9b989beceab Mon Sep 17 00:00:00 2001 From: stephenworsley <49274989+stephenworsley@users.noreply.github.com> Date: Mon, 20 Nov 2023 16:54:42 +0000 Subject: [PATCH] Refactor area weighted regridding, improve performance (#5543) * refactor area weighted regridding * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix test failures * remove old code, fix tests * fix tests, make masking more robust * fix tests, maintain dtype behaviour * fix tests, remove old code * fix tests * fix out of bounds and circularity handling * fix test * add test * add documentation, avoid unnecessary regrid calls * remove unnecessary code, improve coverage * add docstrings * minor fixes * change dimension ordering to match curvilinear regridding * make x-y ordering more consistent with existing implementations * add documentation, tidy code * add documentation, reset test * add documentation * address review comments --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- lib/iris/analysis/_area_weighted.py | 1041 +++++------------ ..._area_weighted_rectilinear_src_and_grid.py | 58 + .../test_AreaWeightedRegridder.py | 2 +- 3 files changed, 364 insertions(+), 737 deletions(-) diff --git a/lib/iris/analysis/_area_weighted.py b/lib/iris/analysis/_area_weighted.py index ffec82fd4e..bd2ad90a3a 100644 --- a/lib/iris/analysis/_area_weighted.py +++ b/lib/iris/analysis/_area_weighted.py @@ -7,6 +7,7 @@ import cf_units import numpy as np import numpy.ma as ma +from scipy.sparse import csr_array from iris._lazy_data import map_complete_blocks from iris.analysis._interpolation import get_xy_dim_coords, snapshot_grid @@ -75,8 +76,7 @@ def __init__(self, src_grid_cube, target_grid_cube, mdtol=1): self.grid_y, self.meshgrid_x, self.meshgrid_y, - self.weights_info, - self.index_info, + self.weights, ) = _regrid_info def __call__(self, cube): @@ -125,8 +125,7 @@ def __call__(self, cube): self.grid_y, self.meshgrid_x, self.meshgrid_y, - self.weights_info, - self.index_info, + self.weights, ) return _regrid_area_weighted_rectilinear_src_and_grid__perform( cube, _regrid_info, mdtol=self._mdtol @@ -224,468 +223,17 @@ def _get_xy_coords(cube): return x_coord, y_coord -def _within_bounds(src_bounds, tgt_bounds, orderswap=False): - """ - Determine which target bounds lie within the extremes of the source bounds. - - Args: - - * src_bounds (ndarray): - An (n, 2) shaped array of monotonic contiguous source bounds. - * tgt_bounds (ndarray): - An (n, 2) shaped array corresponding to the target bounds. - - Kwargs: - - * orderswap (bool): - A Boolean indicating whether the target bounds are in descending order - (True). Defaults to False. - - Returns: - Boolean ndarray, indicating whether each target bound is within the - extremes of the source bounds. - - """ - min_bound = np.min(src_bounds) - 1e-14 - max_bound = np.max(src_bounds) + 1e-14 - - # Swap upper-lower is necessary. - if orderswap is True: - upper, lower = tgt_bounds.T - else: - lower, upper = tgt_bounds.T - - return ((lower <= max_bound) * (lower >= min_bound)) * ( - (upper <= max_bound) * (upper >= min_bound) - ) - - -def _cropped_bounds(bounds, lower, upper): - """ - Return a new bounds array and corresponding slice object (or indices) of - the original data array, resulting from cropping the provided bounds - between the specified lower and upper values. 
The bounds at the - extremities will be truncated so that they start and end with lower and - upper. - - This function will return an empty NumPy array and slice if there is no - overlap between the region covered by bounds and the region from lower to - upper. - - If lower > upper the resulting bounds may not be contiguous and the - indices object will be a tuple of indices rather than a slice object. - - Args: - - * bounds: - An (n, 2) shaped array of monotonic contiguous bounds. - * lower: - Lower bound at which to crop the bounds array. - * upper: - Upper bound at which to crop the bounds array. - - Returns: - A tuple of the new bounds array and the corresponding slice object or - indices from the zeroth axis of the original array. - - """ - reversed_flag = False - # Ensure order is increasing. - if bounds[0, 0] > bounds[-1, 0]: - # Reverse bounds - bounds = bounds[::-1, ::-1] - reversed_flag = True - - # Number of bounds. - n = bounds.shape[0] - - if lower <= upper: - if lower > bounds[-1, 1] or upper < bounds[0, 0]: - new_bounds = bounds[0:0] - indices = slice(0, 0) - else: - # A single region lower->upper. - if lower < bounds[0, 0]: - # Region extends below bounds so use first lower bound. - lindex = 0 - lower = bounds[0, 0] - else: - # Index of last lower bound less than or equal to lower. - lindex = np.nonzero(bounds[:, 0] <= lower)[0][-1] - if upper > bounds[-1, 1]: - # Region extends above bounds so use last upper bound. - uindex = n - 1 - upper = bounds[-1, 1] - else: - # Index of first upper bound greater than or equal to - # upper. - uindex = np.nonzero(bounds[:, 1] >= upper)[0][0] - # Extract the bounds in our region defined by lower->upper. - new_bounds = np.copy(bounds[lindex : (uindex + 1), :]) - # Replace first and last values with specified bounds. - new_bounds[0, 0] = lower - new_bounds[-1, 1] = upper - if reversed_flag: - indices = slice(n - (uindex + 1), n - lindex) - else: - indices = slice(lindex, uindex + 1) - else: - # Two regions [0]->upper, lower->[-1] - # [0]->upper - if upper < bounds[0, 0]: - # Region outside src bounds. - new_bounds_left = bounds[0:0] - indices_left = tuple() - slice_left = slice(0, 0) - else: - if upper > bounds[-1, 1]: - # Whole of bounds. - uindex = n - 1 - upper = bounds[-1, 1] - else: - # Index of first upper bound greater than or equal to upper. - uindex = np.nonzero(bounds[:, 1] >= upper)[0][0] - # Extract the bounds in our region defined by [0]->upper. - new_bounds_left = np.copy(bounds[0 : (uindex + 1), :]) - # Replace last value with specified bound. - new_bounds_left[-1, 1] = upper - if reversed_flag: - indices_left = tuple(range(n - (uindex + 1), n)) - slice_left = slice(n - (uindex + 1), n) - else: - indices_left = tuple(range(0, uindex + 1)) - slice_left = slice(0, uindex + 1) - # lower->[-1] - if lower > bounds[-1, 1]: - # Region is outside src bounds. - new_bounds_right = bounds[0:0] - indices_right = tuple() - slice_right = slice(0, 0) - else: - if lower < bounds[0, 0]: - # Whole of bounds. - lindex = 0 - lower = bounds[0, 0] - else: - # Index of last lower bound less than or equal to lower. - lindex = np.nonzero(bounds[:, 0] <= lower)[0][-1] - # Extract the bounds in our region defined by lower->[-1]. - new_bounds_right = np.copy(bounds[lindex:, :]) - # Replace first value with specified bound. 
- new_bounds_right[0, 0] = lower - if reversed_flag: - indices_right = tuple(range(0, n - lindex)) - slice_right = slice(0, n - lindex) - else: - indices_right = tuple(range(lindex, n)) - slice_right = slice(lindex, None) - - if reversed_flag: - # Flip everything around. - indices_left, indices_right = indices_right, indices_left - slice_left, slice_right = slice_right, slice_left - - # Combine regions. - new_bounds = np.concatenate((new_bounds_left, new_bounds_right)) - # Use slices if possible, but if we have two regions use indices. - if indices_left and indices_right: - indices = indices_left + indices_right - elif indices_left: - indices = slice_left - elif indices_right: - indices = slice_right - else: - indices = slice(0, 0) - - if reversed_flag: - new_bounds = new_bounds[::-1, ::-1] - - return new_bounds, indices - - -def _cartesian_area(y_bounds, x_bounds): - """ - Return an array of the areas of each cell given two arrays - of cartesian bounds. - - Args: - - * y_bounds: - An (n, 2) shaped NumPy array. - * x_bounds: - An (m, 2) shaped NumPy array. - - Returns: - An (n, m) shaped Numpy array of areas. - - """ - heights = y_bounds[:, 1] - y_bounds[:, 0] - widths = x_bounds[:, 1] - x_bounds[:, 0] - return np.abs(np.outer(heights, widths)) - - -def _spherical_area(y_bounds, x_bounds, radius=1.0): +def _get_bounds_in_units(coord, units, dtype): """ - Return an array of the areas of each cell on a sphere - given two arrays of latitude and longitude bounds in radians. - - Args: - - * y_bounds: - An (n, 2) shaped NumPy array of latitude bounds in radians. - * x_bounds: - An (m, 2) shaped NumPy array of longitude bounds in radians. - * radius: - Radius of the sphere. Default is 1.0. - - Returns: - An (n, m) shaped Numpy array of areas. + Return a copy of coord's bounds in the specified units and dtype. + Return as contiguous bounds. """ - return iris.analysis.cartography._quadrant_area(y_bounds, x_bounds, radius) - - -def _get_bounds_in_units(coord, units, dtype): - """Return a copy of coord's bounds in the specified units and dtype.""" # The bounds are cast to dtype before conversion to prevent issues when # mixing float32 and float64 types. - return coord.units.convert(coord.bounds.astype(dtype), units).astype(dtype) - - -def _weighted_mean_with_mdtol(data, weights, axis=None, mdtol=0): - """ - Return the weighted mean of an array over the specified axis - using the provided weights (if any) and a permitted fraction of - masked data. - - Args: - - * data (array-like): - Data to be averaged. - - * weights (array-like): - An array of the same shape as the data that specifies the contribution - of each corresponding data element to the calculated mean. - - Kwargs: - - * axis (int or tuple of ints): - Axis along which the mean is computed. The default is to compute - the mean of the flattened array. - - * mdtol (float): - Tolerance of missing data. The value returned in each element of the - returned array will be masked if the fraction of masked data exceeds - mdtol. This fraction is weighted by the `weights` array if one is - provided. mdtol=0 means no missing data is tolerated - while mdtol=1 will mean the resulting element will be masked if and - only if all the contributing elements of data are masked. - Defaults to 0. - - Returns: - Numpy array (possibly masked) or scalar. 
- - """ - if ma.is_masked(data): - res, unmasked_weights_sum = ma.average( - data, weights=weights, axis=axis, returned=True - ) - if mdtol < 1: - weights_sum = weights.sum(axis=axis) - frac_masked = 1 - np.true_divide(unmasked_weights_sum, weights_sum) - mask_pt = frac_masked > mdtol - if np.any(mask_pt) and not isinstance(res, ma.core.MaskedConstant): - if np.isscalar(res): - res = ma.masked - elif ma.isMaskedArray(res): - res.mask |= mask_pt - else: - res = ma.masked_array(res, mask=mask_pt) - else: - res = np.average(data, weights=weights, axis=axis) - return res - - -def _regrid_area_weighted_array( - src_data, x_dim, y_dim, weights_info, index_info, mdtol=0 -): - """ - Regrid the given data from its source grid to a new grid using - an area weighted mean to determine the resulting data values. - - .. note:: - - Elements in the returned array that lie either partially - or entirely outside of the extent of the source grid will - be masked irrespective of the value of mdtol. - - Args: - - * src_data: - An N-dimensional NumPy array. - * x_dim: - The X dimension within `src_data`. - * y_dim: - The Y dimension within `src_data`. - * weights_info: - The area weights information to be used for area-weighted - regridding. - - Kwargs: - - * mdtol: - Tolerance of missing data. The value returned in each element of the - returned array will be masked if the fraction of missing data exceeds - mdtol. This fraction is calculated based on the area of masked cells - within each target cell. mdtol=0 means no missing data is tolerated - while mdtol=1 will mean the resulting element will be masked if and - only if all the overlapping elements of the source grid are masked. - Defaults to 0. - - Returns: - The regridded data as an N-dimensional NumPy array. The lengths - of the X and Y dimensions will now match those of the target - grid. - - """ - ( - blank_weights, - src_area_weights, - new_data_mask_basis, - ) = weights_info - - ( - result_x_extent, - result_y_extent, - square_data_indices_y, - square_data_indices_x, - src_area_datas_required, - ) = index_info - - # Ensure we have x_dim and y_dim. - x_dim_orig = x_dim - y_dim_orig = y_dim - if y_dim is None: - src_data = np.expand_dims(src_data, axis=src_data.ndim) - y_dim = src_data.ndim - 1 - if x_dim is None: - src_data = np.expand_dims(src_data, axis=src_data.ndim) - x_dim = src_data.ndim - 1 - # Move y_dim and x_dim to last dimensions - if not x_dim == src_data.ndim - 1: - src_data = np.moveaxis(src_data, x_dim, -1) - if not y_dim == src_data.ndim - 2: - if x_dim < y_dim: - # note: y_dim was shifted along by one position when - # x_dim was moved to the last dimension - src_data = np.moveaxis(src_data, y_dim - 1, -2) - elif x_dim > y_dim: - src_data = np.moveaxis(src_data, y_dim, -2) - x_dim = src_data.ndim - 1 - y_dim = src_data.ndim - 2 - - # Create empty "pre-averaging" data array that will enable the - # src_data data corresponding to a given target grid point, - # to be stacked per point. - # Note that dtype is not preserved and that the array mask - # allows for regions that do not overlap. - new_shape = list(src_data.shape) - new_shape[x_dim] = result_x_extent - new_shape[y_dim] = result_y_extent - - # Use input cube dtype or convert values to the smallest possible float - # dtype when necessary. - dtype = np.promote_types(src_data.dtype, np.float16) - - # Axes of data over which the weighted mean is calculated. 
- axis = (y_dim, x_dim) - - # Use previously established indices - - src_area_datas_square = src_data[ - ..., square_data_indices_y, square_data_indices_x - ] - - _, src_area_datas_required = np.broadcast_arrays( - src_area_datas_square, src_area_datas_required - ) - - src_area_datas = np.where( - src_area_datas_required, src_area_datas_square, 0 - ) - - # Flag to indicate whether the original data was a masked array. - src_masked = src_data.mask.any() if ma.isMaskedArray(src_data) else False - if src_masked: - src_area_masks_square = src_data.mask[ - ..., square_data_indices_y, square_data_indices_x - ] - src_area_masks = np.where( - src_area_datas_required, src_area_masks_square, True - ) - - else: - # If the weights were originally blank, set the weights to all 1 to - # avoid divide by 0 error and set the new data mask for making the - # values 0 - src_area_weights = np.where(blank_weights, 1, src_area_weights) - - new_data_mask = np.broadcast_to(new_data_mask_basis, new_shape) - - # Broadcast the weights array to allow numpy's ma.average - # to be called. - # Assign new shape to raise error on copy. - src_area_weights.shape = src_area_datas.shape[-3:] - # Broadcast weights to match shape of data. - _, src_area_weights = np.broadcast_arrays(src_area_datas, src_area_weights) - - # Mask the data points - if src_masked: - src_area_datas = np.ma.array(src_area_datas, mask=src_area_masks) - - # Calculate weighted mean taking into account missing data. - new_data = _weighted_mean_with_mdtol( - src_area_datas, weights=src_area_weights, axis=axis, mdtol=mdtol - ) - new_data = new_data.reshape(new_shape) - if src_masked: - new_data_mask = new_data.mask - - # Mask the data if originally masked or if the result has masked points - if ma.isMaskedArray(src_data): - new_data = ma.array( - new_data, - mask=new_data_mask, - fill_value=src_data.fill_value, - dtype=dtype, - ) - elif new_data_mask.any(): - new_data = ma.array(new_data, mask=new_data_mask, dtype=dtype) - else: - new_data = new_data.astype(dtype) - - # Restore data to original form - if x_dim_orig is None and y_dim_orig is None: - new_data = np.squeeze(new_data, axis=x_dim) - new_data = np.squeeze(new_data, axis=y_dim) - elif y_dim_orig is None: - new_data = np.squeeze(new_data, axis=y_dim) - new_data = np.moveaxis(new_data, -1, x_dim_orig) - elif x_dim_orig is None: - new_data = np.squeeze(new_data, axis=x_dim) - new_data = np.moveaxis(new_data, -1, y_dim_orig) - elif x_dim_orig < y_dim_orig: - # move the x_dim back first, so that the y_dim will - # then be moved to its original position - new_data = np.moveaxis(new_data, -1, x_dim_orig) - new_data = np.moveaxis(new_data, -1, y_dim_orig) - else: - # move the y_dim back first, so that the x_dim will - # then be moved to its original position - new_data = np.moveaxis(new_data, -2, y_dim_orig) - new_data = np.moveaxis(new_data, -1, x_dim_orig) - - return new_data + return coord.units.convert( + coord.contiguous_bounds().astype(dtype), units + ).astype(dtype) def _regrid_area_weighted_rectilinear_src_and_grid__prepare( @@ -775,290 +323,51 @@ def _regrid_area_weighted_rectilinear_src_and_grid__prepare( # Create 2d meshgrids as required by _create_cube func. meshgrid_x, meshgrid_y = _meshgrid(grid_x.points, grid_y.points) - # Determine whether target grid bounds are decreasing. This must - # be determined prior to wrap_lons being called. - grid_x_decreasing = grid_x_bounds[-1, 0] < grid_x_bounds[0, 0] - grid_y_decreasing = grid_y_bounds[-1, 0] < grid_y_bounds[0, 0] - # Wrapping of longitudes. 
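+    # Descriptive note: the unit modulus (e.g. 360.0 for longitudes in
+    # degrees) is only defined for spherical coordinate systems; it is
+    # passed down to the weights calculation so that wrapped longitude
+    # ranges can be aligned with one another.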
if spherical: - base = np.min(src_x_bounds) modulus = x_units.modulus - # Only wrap if necessary to avoid introducing floating - # point errors. - if np.min(grid_x_bounds) < base or np.max(grid_x_bounds) > ( - base + modulus - ): - grid_x_bounds = iris.analysis.cartography.wrap_lons( - grid_x_bounds, base, modulus - ) - - # Determine whether the src_x coord has periodic boundary conditions. - circular = getattr(src_x, "circular", False) - - # Use simple cartesian area function or one that takes into - # account the curved surface if coord system is spherical. - if spherical: - area_func = _spherical_area else: - area_func = _cartesian_area + modulus = None def _calculate_regrid_area_weighted_weights( src_x_bounds, src_y_bounds, grid_x_bounds, grid_y_bounds, - grid_x_decreasing, - grid_y_decreasing, - area_func, - circular=False, + spherical, + modulus=None, ): - """ - Compute the area weights used for area-weighted regridding. - Args: - * src_x_bounds: - A NumPy array of bounds along the X axis defining the source grid. - * src_y_bounds: - A NumPy array of bounds along the Y axis defining the source grid. - * grid_x_bounds: - A NumPy array of bounds along the X axis defining the new grid. - * grid_y_bounds: - A NumPy array of bounds along the Y axis defining the new grid. - * grid_x_decreasing: - Boolean indicating whether the X coordinate of the new grid is - in descending order. - * grid_y_decreasing: - Boolean indicating whether the Y coordinate of the new grid is - in descending order. - * area_func: - A function that returns an (p, q) array of weights given an (p, 2) - shaped array of Y bounds and an (q, 2) shaped array of X bounds. - Kwargs: - * circular: - A boolean indicating whether the `src_x_bounds` are periodic. - Default is False. - Returns: - The area weights to be used for area-weighted regridding. - """ - # Determine which grid bounds are within src extent. - y_within_bounds = _within_bounds( - src_y_bounds, grid_y_bounds, grid_y_decreasing - ) - x_within_bounds = _within_bounds( - src_x_bounds, grid_x_bounds, grid_x_decreasing + """Return weights matrix to be used in regridding.""" + src_shape = (len(src_x_bounds) - 1, len(src_y_bounds) - 1) + tgt_shape = (len(grid_x_bounds) - 1, len(grid_y_bounds) - 1) + + if spherical: + # Changing the dtype here replicates old regridding behaviour. 
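+            # Descriptive note: after the sine transform below,
+            # differences of the y bounds are proportional to spherical
+            # cell areas, so explicit area weights are never needed.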
+ dtype = np.float64 + src_x_bounds = src_x_bounds.astype(dtype) + src_y_bounds = src_y_bounds.astype(dtype) + grid_x_bounds = grid_x_bounds.astype(dtype) + grid_y_bounds = grid_y_bounds.astype(dtype) + + src_y_bounds = np.sin(src_y_bounds) + grid_y_bounds = np.sin(grid_y_bounds) + x_info = _get_coord_to_coord_matrix_info( + src_x_bounds, grid_x_bounds, circular=spherical, mod=modulus ) - - # Cache which src_bounds are within grid bounds - cached_x_bounds = [] - cached_x_indices = [] - max_x_indices = 0 - for x_0, x_1 in grid_x_bounds: - if grid_x_decreasing: - x_0, x_1 = x_1, x_0 - x_bounds, x_indices = _cropped_bounds(src_x_bounds, x_0, x_1) - cached_x_bounds.append(x_bounds) - cached_x_indices.append(x_indices) - # Keep record of the largest slice - if isinstance(x_indices, slice): - x_indices_size = np.sum(x_indices.stop - x_indices.start) - else: # is tuple of indices - x_indices_size = len(x_indices) - if x_indices_size > max_x_indices: - max_x_indices = x_indices_size - - # Cache which y src_bounds areas and weights are within grid bounds - cached_y_indices = [] - cached_weights = [] - max_y_indices = 0 - for j, (y_0, y_1) in enumerate(grid_y_bounds): - # Reverse lower and upper if dest grid is decreasing. - if grid_y_decreasing: - y_0, y_1 = y_1, y_0 - y_bounds, y_indices = _cropped_bounds(src_y_bounds, y_0, y_1) - cached_y_indices.append(y_indices) - # Keep record of the largest slice - if isinstance(y_indices, slice): - y_indices_size = np.sum(y_indices.stop - y_indices.start) - else: # is tuple of indices - y_indices_size = len(y_indices) - if y_indices_size > max_y_indices: - max_y_indices = y_indices_size - - weights_i = [] - for i, (x_0, x_1) in enumerate(grid_x_bounds): - # Reverse lower and upper if dest grid is decreasing. - if grid_x_decreasing: - x_0, x_1 = x_1, x_0 - x_bounds = cached_x_bounds[i] - x_indices = cached_x_indices[i] - - # Determine whether element i, j overlaps with src and hence - # an area weight should be computed. - # If x_0 > x_1 then we want [0]->x_1 and x_0->[0] + mod in the case - # of wrapped longitudes. However if the src grid is not global - # (i.e. circular) this new cell would include a region outside of - # the extent of the src grid and thus the weight is therefore - # invalid. - outside_extent = x_0 > x_1 and not circular - if ( - outside_extent - or not y_within_bounds[j] - or not x_within_bounds[i] - ): - weights = False - else: - # Calculate weights based on areas of cropped bounds. - if isinstance(x_indices, tuple) and isinstance( - y_indices, tuple - ): - raise RuntimeError( - "Cannot handle split bounds " "in both x and y." 
- ) - weights = area_func(y_bounds, x_bounds) - weights_i.append(weights) - cached_weights.append(weights_i) - return ( - tuple(cached_x_indices), - tuple(cached_y_indices), - max_x_indices, - max_y_indices, - tuple(cached_weights), + y_info = _get_coord_to_coord_matrix_info(src_y_bounds, grid_y_bounds) + weights_matrix = _combine_xy_weights( + x_info, y_info, src_shape, tgt_shape ) + return weights_matrix - ( - cached_x_indices, - cached_y_indices, - max_x_indices, - max_y_indices, - cached_weights, - ) = _calculate_regrid_area_weighted_weights( + weights = _calculate_regrid_area_weighted_weights( src_x_bounds, src_y_bounds, grid_x_bounds, grid_y_bounds, - grid_x_decreasing, - grid_y_decreasing, - area_func, - circular, - ) - - # Go further, calculating the full weights array that we'll need in the - # perform step and the indices we'll need to extract from the cube we're - # regridding (src_data) - - result_y_extent = len(grid_y_bounds) - result_x_extent = len(grid_x_bounds) - - # Total number of points - num_target_pts = result_y_extent * result_x_extent - - # Create empty array to hold weights - src_area_weights = np.zeros( - list((max_y_indices, max_x_indices, num_target_pts)) + spherical, + modulus, ) - - # Built for the case where the source cube isn't masked - blank_weights = np.zeros((num_target_pts,)) - new_data_mask_basis = np.full( - (len(cached_y_indices), len(cached_x_indices)), False, dtype=np.bool_ - ) - - # To permit fancy indexing, we need to store our data in an array whose - # first two dimensions represent the indices needed for the target cell. - # Since target cells can require a different number of indices, the size of - # these dimensions should be the maximum of this number. - # This means we need to track whether the data in - # that array is actually required and build those squared-off arrays - # TODO: Consider if a proper mask would be better - src_area_datas_required = np.full( - (max_y_indices, max_x_indices, num_target_pts), False - ) - square_data_indices_y = np.zeros( - (max_y_indices, max_x_indices, num_target_pts), dtype=int - ) - square_data_indices_x = np.zeros( - (max_y_indices, max_x_indices, num_target_pts), dtype=int - ) - - # Stack the weights for each target point and build the indices we'll need - # to extract the src_area_data - target_pt_ji = -1 - for j, y_indices in enumerate(cached_y_indices): - for i, x_indices in enumerate(cached_x_indices): - target_pt_ji += 1 - # Determine whether to mask element i, j based on whether - # there are valid weights. - weights = cached_weights[j][i] - if weights is False: - # Prepare for the src_data not being masked by storing the - # information that will let us fill the data with zeros and - # weights as one. The weighted average result will be the same, - # but we avoid dividing by zero. 
- blank_weights[target_pt_ji] = True - new_data_mask_basis[j, i] = True - else: - # Establish which indices are actually in y_indices and x_indices - if isinstance(y_indices, slice): - y_indices = list( - range( - y_indices.start, - y_indices.stop, - y_indices.step or 1, - ) - ) - else: - y_indices = list(y_indices) - - if isinstance(x_indices, slice): - x_indices = list( - range( - x_indices.start, - x_indices.stop, - x_indices.step or 1, - ) - ) - else: - x_indices = list(x_indices) - - # For the weights, we just need the lengths of these as we're - # dropping them into a pre-made array - - len_y = len(y_indices) - len_x = len(x_indices) - - src_area_weights[0:len_y, 0:len_x, target_pt_ji] = weights - - # To build the indices for the source cube, we need equal - # shaped array so we pad with 0s and record the need to mask - # them in src_area_datas_required - padded_y_indices = y_indices + [0] * (max_y_indices - len_y) - padded_x_indices = x_indices + [0] * (max_x_indices - len_x) - - square_data_indices_y[..., target_pt_ji] = np.array( - padded_y_indices - )[:, np.newaxis] - square_data_indices_x[..., target_pt_ji] = padded_x_indices - - src_area_datas_required[0:len_y, 0:len_x, target_pt_ji] = True - - # Package up the return data - - weights_info = ( - blank_weights, - src_area_weights, - new_data_mask_basis, - ) - - index_info = ( - result_x_extent, - result_y_extent, - square_data_indices_y, - square_data_indices_x, - src_area_datas_required, - ) - - # Now return it - return ( src_x, src_y, @@ -1068,8 +377,7 @@ def _calculate_regrid_area_weighted_weights( grid_y, meshgrid_x, meshgrid_y, - weights_info, - index_info, + weights, ) @@ -1091,17 +399,18 @@ def _regrid_area_weighted_rectilinear_src_and_grid__perform( grid_y, meshgrid_x, meshgrid_y, - weights_info, - index_info, + weights, ) = regrid_info + tgt_shape = (len(grid_y.points), len(grid_x.points)) + # Calculate new data array for regridded cube. regrid = functools.partial( - _regrid_area_weighted_array, + _regrid_along_dims, x_dim=src_x_dim, y_dim=src_y_dim, - weights_info=weights_info, - index_info=index_info, + weights=weights, + tgt_shape=tgt_shape, mdtol=mdtol, ) @@ -1120,9 +429,9 @@ def _regrid_area_weighted_rectilinear_src_and_grid__perform( ) # TODO: investigate if an area weighted callback would be more appropriate. # _regrid_callback = functools.partial( - # _regrid_area_weighted_array, - # weights_info=weights_info, - # index_info=index_info, + # _regrid_along_dims, + # weights=weights, + # tgt_shape=tgt_shape, # mdtol=mdtol, # ) @@ -1149,3 +458,263 @@ def regrid_callback(*args, **kwargs): new_cube = new_cube[tuple(indices)] return new_cube + + +def _get_coord_to_coord_matrix_info( + src_bounds, tgt_bounds, circular=False, mod=None +): + """ + First part of weight calculation. + + Calculate the weights contribution from a single pair of + coordinate bounds. Search for pairs of overlapping source and + target bounds and associate weights with them. + + Note: this assumes that the bounds are monotonic. + """ + # Calculate the number of cells represented by the bounds. + m = len(tgt_bounds) - 1 + n = len(src_bounds) - 1 + + # Ensure bounds are strictly increasing. + src_decreasing = src_bounds[0] > src_bounds[1] + tgt_decreasing = tgt_bounds[0] > tgt_bounds[1] + if src_decreasing: + src_bounds = src_bounds[::-1] + if tgt_decreasing: + tgt_bounds = tgt_bounds[::-1] + + if circular: + # For circular coordinates (e.g. longitude) account for source and + # target bounds which span different ranges (e.g. 
(-180, 180) vs + # (0, 360)). We ensure that all possible overlaps between source and + # target bounds are accounted for by including two copies of the + # source bounds, shifted appropriately by the modulus. + adjust = (tgt_bounds.min() - src_bounds.min()) // mod + src_bounds = src_bounds + (mod * adjust) + src_bounds = np.append(src_bounds, src_bounds + mod) + nn = (2 * n) + 1 + else: + nn = n + + # Before iterating through pairs of overlapping bounds, find an + # appropriate place to start iteration. Note that this assumes that + # the bounds are increasing. + i = max(np.searchsorted(tgt_bounds, src_bounds[0], side="right") - 1, 0) + j = max(np.searchsorted(src_bounds, tgt_bounds[0], side="right") - 1, 0) + + data = [] + rows = [] + cols = [] + + # Iterate through overlapping cells in the source and target bounds. + # For the sake of calculations, we keep track of the minimum value of + # the intersection of each cell. + floor = max(tgt_bounds[i], src_bounds[j]) + while i < m and j < nn: + # Record the current indices. + rows.append(i) + cols.append(j) + + # Determine the next indices and floor. + if tgt_bounds[i + 1] < src_bounds[j + 1]: + next_floor = tgt_bounds[i + 1] + next_i = i + 1 + elif tgt_bounds[i + 1] == src_bounds[j + 1]: + next_floor = tgt_bounds[i + 1] + next_i = i + 1 + j += 1 + else: + next_floor = src_bounds[j + 1] + next_i = i + j += 1 + + # Calculate and record the weight for the current overlapping cells. + weight = (next_floor - floor) / (tgt_bounds[i + 1] - tgt_bounds[i]) + data.append(weight) + + # Update indices and floor + i = next_i + floor = next_floor + + data = np.array(data) + rows = np.array(rows) + cols = np.array(cols) + + if circular: + # Remove out of bounds points. When the source bounds were duplicated + # an "out of bounds" cell was introduced between the two copies. + oob = np.where(cols == n) + data = np.delete(data, oob) + rows = np.delete(rows, oob) + cols = np.delete(cols, oob) + + # Wrap indices. Since we duplicated the source bounds there may be + # indices which are greater than n which will need to be corrected. + cols = cols % (n + 1) + + # Correct indices which were flipped due to reversing decreasing bounds. + if src_decreasing: + cols = n - cols - 1 + if tgt_decreasing: + rows = m - rows - 1 + + return data, rows, cols + + +def _combine_xy_weights(x_info, y_info, src_shape, tgt_shape): + """ + Second part of weight calculation. + + Combine the weights contributions from both pairs of coordinate + bounds (i.e. the source/target pairs for the x and y coords). + Return the result as a sparse array. + """ + x_src, y_src = src_shape + x_tgt, y_tgt = tgt_shape + src_size = x_src * y_src + tgt_size = x_tgt * y_tgt + x_weight, x_rows, x_cols = x_info + y_weight, y_rows, y_cols = y_info + + # Regridding weights will be applied to a flattened (y, x) array. + # Weights and indices are constructed in a way to account for this. + # Weights of the combined matrix are constructed by broadcasting + # the x_weights and y_weights. The resulting array contains every + # combination of x weight and y weight. Then we flatten this array. + xy_weight = y_weight[:, np.newaxis] * x_weight[np.newaxis, :] + xy_weight = xy_weight.flatten() + + # Given the x index and y index associated with a weight, calculate + # the equivalent index in the flattened (y, x) array. 
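+    # For a cell in row r, column c of a C-ordered (ny, nx) grid, the
+    # flat index is r * nx + c; hence x_tgt is used below for target
+    # indices and x_src for source indices.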
+    xy_rows = (y_rows[:, np.newaxis] * x_tgt) + x_rows[np.newaxis, :]
+    xy_rows = xy_rows.flatten()
+    xy_cols = (y_cols[:, np.newaxis] * x_src) + x_cols[np.newaxis, :]
+    xy_cols = xy_cols.flatten()
+
+    # Create a sparse matrix for efficient weight application.
+    combined_weights = csr_array(
+        (xy_weight, (xy_rows, xy_cols)), shape=(tgt_size, src_size)
+    )
+    return combined_weights
+
+
+def _standard_regrid_no_masks(data, weights, tgt_shape):
+    """
+    Regrid unmasked data to an unmasked result.
+
+    Assumes that the last two dimensions are the x-y grid.
+    """
+    # Reshape data to a form suitable for matrix multiplication.
+    extra_shape = data.shape[:-2]
+    data = data.reshape(-1, np.prod(data.shape[-2:]))
+
+    # Apply regridding weights.
+    # The order of matrix multiplication is chosen to be consistent
+    # with existing regridding code.
+    result = data @ weights.T
+
+    # Reshape result to a suitable form.
+    result = result.reshape(*(extra_shape + tgt_shape))
+    return result
+
+
+def _standard_regrid(data, weights, tgt_shape, mdtol):
+    """
+    Regrid data and handle masks.
+
+    Assumes that the last two dimensions are the x-y grid.
+    """
+    # This is set to stay consistent with legacy behaviour.
+    # It is likely to become switchable in the future, see:
+    # https://github.com/SciTools/iris/issues/5461
+    oob_invalid = True
+
+    data_shape = data.shape
+    if ma.is_masked(data):
+        unmasked = ~ma.getmaskarray(data)
+        # Calculate contribution from unmasked sources to each target point.
+        weight_sums = _standard_regrid_no_masks(unmasked, weights, tgt_shape)
+    else:
+        # If there are no masked points then all contributions will be
+        # from unmasked sources, so we can skip this calculation.
+        weight_sums = np.ones(data_shape[:-2] + tgt_shape)
+    mdtol = max(mdtol, 1e-8)
+    tgt_mask = weight_sums > 1 - mdtol
+    # If out-of-bounds sources are treated the same as masked sources,
+    # this will already have been calculated above, so we can skip this
+    # calculation.
+    if oob_invalid or not ma.is_masked(data):
+        # Calculate the proportion of each target cell which is covered by
+        # the source. For the sake of efficiency, this is calculated for a
+        # 2D slice which is then broadcast.
+        inbound_sums = _standard_regrid_no_masks(
+            np.ones(data_shape[-2:]), weights, tgt_shape
+        )
+        if oob_invalid:
+            # Legacy behaviour: if the full area of a target cell does not
+            # lie in bounds, it will be masked.
+            oob_mask = inbound_sums > 1 - 1e-8
+        else:
+            # Note: this branch is currently unreachable. It exists to lay
+            # the groundwork for future work which will make out-of-bounds
+            # behaviour switchable.
+            oob_mask = inbound_sums > 1 - mdtol
+        # Broadcast the mask to the shape of the full array.
+        oob_slice = ((np.newaxis,) * len(data.shape[:-2])) + np.s_[:, :]
+        tgt_mask = tgt_mask * oob_mask[oob_slice]
+
+    # Calculate normalisations.
+    normalisations = tgt_mask.astype(weight_sums.dtype)
+    normalisations[tgt_mask] /= weight_sums[tgt_mask]
+
+    # Mask points in the result.
+    if ma.isMaskedArray(data):
+        # If the source is masked, the result should have a similar mask.
+        fill_value = data.fill_value
+        normalisations = ma.array(
+            normalisations, mask=~tgt_mask, fill_value=fill_value
+        )
+    elif np.any(~tgt_mask):
+        normalisations = ma.array(normalisations, mask=~tgt_mask)
+
+    # Use the input cube dtype or convert values to the smallest possible
+    # float dtype when necessary.
+    dtype = np.promote_types(data.dtype, np.float16)
+
+    # Perform regridding on unmasked data.
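+    # Masked source points are filled with zeros so they contribute
+    # nothing to the sums; the normalisations computed above rescale
+    # each target cell by the unmasked weight it actually received.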
+    result = _standard_regrid_no_masks(
+        ma.filled(data, 0.0), weights, tgt_shape
+    )
+    # Apply normalisations and masks to the regridded data.
+    result = result * normalisations
+    result = result.astype(dtype)
+    return result
+
+
+def _regrid_along_dims(data, x_dim, y_dim, weights, tgt_shape, mdtol):
+    """Regrid data, handling masks and dimensions."""
+    # Handle scalar coordinates.
+    # Note: scalar source coordinates are only handled when their
+    # corresponding target coordinate is also scalar.
+    num_scalar_dims = 0
+    if x_dim is None:
+        num_scalar_dims += 1
+        data = np.expand_dims(data, -1)
+        x_dim = -1
+    if y_dim is None:
+        num_scalar_dims += 1
+        data = np.expand_dims(data, -1)
+        y_dim = -1
+    if num_scalar_dims == 2:
+        y_dim = -2
+
+    # Standard regridding expects the last two dimensions to belong
+    # to the y and x coordinates and will output as such.
+    # Axes are moved to account for an arbitrary dimension ordering.
+    data = np.moveaxis(data, [y_dim, x_dim], [-2, -1])
+    result = _standard_regrid(data, weights, tgt_shape, mdtol)
+    result = np.moveaxis(result, [-2, -1], [y_dim, x_dim])
+
+    for _ in range(num_scalar_dims):
+        result = np.squeeze(result, axis=-1)
+    return result
diff --git a/lib/iris/tests/experimental/regrid/test_regrid_area_weighted_rectilinear_src_and_grid.py b/lib/iris/tests/experimental/regrid/test_regrid_area_weighted_rectilinear_src_and_grid.py
index 93b1a6d3e6..9190548b15 100644
--- a/lib/iris/tests/experimental/regrid/test_regrid_area_weighted_rectilinear_src_and_grid.py
+++ b/lib/iris/tests/experimental/regrid/test_regrid_area_weighted_rectilinear_src_and_grid.py
@@ -601,6 +601,20 @@ def test_circular_subset(self):
 
     @tests.skip_data
     def test_non_circular_subset(self):
+        """
+        Test regridding behaviour when the source grid has circular longitude.
+
+        This tests the specific case when the longitude coordinate of the
+        source grid has the `circular` attribute as `False` but otherwise
+        spans the full 360 degrees.
+
+        Note: the previous behaviour was to always mask target cells when they
+        spanned the boundary of max/min longitude and `circular` was `False`.
+        This has been changed so that such cells are only masked when there is
+        a gap between the maximum and minimum longitude. In this test these
+        cells are expected to be unmasked, so the result will be equal to the
+        above test for circular longitudes.
+        """
         src = iris.tests.stock.global_pp()
         src.coord("latitude").guess_bounds()
         src.coord("longitude").guess_bounds()
@@ -619,9 +633,53 @@ def test_non_circular_subset(self):
         dest.add_dim_coord(dest_lat, 0)
         dest.add_dim_coord(dest_lon, 1)
 
+        res = regrid_area_weighted(src, dest)
+        self.assertArrayShapeStats(res, (40, 7), 285.653960, 15.212710)
+
+    @tests.skip_data
+    def test__proper_non_circular_subset(self):
+        """
+        Test regridding behaviour when the source grid has circular longitude.
+
+        This tests the specific case when the longitude coordinate of the
+        source grid does not span the full 360 degrees. Target cells which
+        span the boundary of max/min longitude contain a section which is
+        out of bounds of the source grid and are therefore masked.
+        """
+        src = iris.tests.stock.global_pp()
+        src.coord("latitude").guess_bounds()
+        src.coord("longitude").guess_bounds()
+        src_lon_bounds = src.coord("longitude").bounds.copy()
+        # Leave a small gap between the first and last longitude value.
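+        # With this gap the longitudes no longer span the full 360
+        # degrees, so target cells straddling the seam are partly out
+        # of bounds of the source grid.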
+        src_lon_bounds[0, 0] += 0.001
+        src_lon = src.coord("longitude").copy(
+            points=src.coord("longitude").points, bounds=src_lon_bounds
+        )
+        src.remove_coord("longitude")
+        src.add_dim_coord(src_lon, 1)
+        dest_lat = src.coord("latitude")[0:40]
+        dest_lon = iris.coords.DimCoord(
+            [-15.0, -10.0, -5.0, 0.0, 5.0, 10.0, 15.0],
+            standard_name="longitude",
+            units="degrees",
+            coord_system=dest_lat.coord_system,
+        )
+        # Note: the target grid is in (-180, 180), the source in (0, 360).
+        dest_lon.guess_bounds()
+        data = np.zeros((dest_lat.shape[0], dest_lon.shape[0]))
+        dest = iris.cube.Cube(data)
+        dest.add_dim_coord(dest_lat, 0)
+        dest.add_dim_coord(dest_lon, 1)
+
         res = regrid_area_weighted(src, dest)
         self.assertArrayShapeStats(res, (40, 7), 285.550814, 15.190245)
 
+        # The target cells straddling the gap between min and max source
+        # longitude should be masked.
+        expected_mask = np.zeros(res.shape)
+        expected_mask[:, 3] = 1
+        assert np.array_equal(expected_mask, res.data.mask)
+
 
 if __name__ == "__main__":
     tests.main()
diff --git a/lib/iris/tests/unit/analysis/area_weighted/test_AreaWeightedRegridder.py b/lib/iris/tests/unit/analysis/area_weighted/test_AreaWeightedRegridder.py
index 2d873ad011..789426e11b 100644
--- a/lib/iris/tests/unit/analysis/area_weighted/test_AreaWeightedRegridder.py
+++ b/lib/iris/tests/unit/analysis/area_weighted/test_AreaWeightedRegridder.py
@@ -50,7 +50,7 @@ def check_mdtol(self, mdtol=None):
         _regrid_info = _regrid_area_weighted_rectilinear_src_and_grid__prepare(
             src_grid, target_grid
        )
-        self.assertEqual(len(_regrid_info), 10)
+        self.assertEqual(len(_regrid_info), 9)
        with mock.patch(
            "iris.analysis._area_weighted."
            "_regrid_area_weighted_rectilinear_src_and_grid__prepare",
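
For reference, the core idea of the refactor (replacing per-target-cell Python
loops with a single sparse matrix product) can be sketched in standalone
NumPy/SciPy. The snippet below is illustrative only and is not part of the
patch: it builds the (target, source) overlap-fraction matrix for one 1-D
coordinate with a naive double loop, whereas `_get_coord_to_coord_matrix_info`
above does the same job in a single monotonic sweep, and the full 2-D matrix
is the outer product of the x and y contributions (`_combine_xy_weights`).
Mask handling and normalisation (`_standard_regrid`) are omitted. Assumes
SciPy >= 1.8, which provides `scipy.sparse.csr_array`.

import numpy as np
from scipy.sparse import csr_array

src_bounds = np.array([0.0, 1.0, 2.0, 3.0, 4.0])  # 4 source cells
tgt_bounds = np.array([0.5, 2.5, 3.5])  # 2 target cells

vals, rows, cols = [], [], []
for i in range(len(tgt_bounds) - 1):  # target cells
    for j in range(len(src_bounds) - 1):  # source cells
        # Length of the overlap between target cell i and source cell j.
        overlap = min(tgt_bounds[i + 1], src_bounds[j + 1]) - max(
            tgt_bounds[i], src_bounds[j]
        )
        if overlap > 0:
            # Weight: the fraction of target cell i covered by source cell j.
            vals.append(overlap / (tgt_bounds[i + 1] - tgt_bounds[i]))
            rows.append(i)
            cols.append(j)

weights = csr_array(
    (vals, (rows, cols)),
    shape=(len(tgt_bounds) - 1, len(src_bounds) - 1),
)

# Each row of `weights` sums to 1 for fully covered target cells, so the
# product below is an area-weighted mean of the source data.
src_data = np.array([1.0, 2.0, 3.0, 4.0])
print(weights @ src_data)  # -> [2.  3.5]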