diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 264c07f562b..431ad0b5458 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -33,6 +33,13 @@ New Features - Added zarr backends for :py:func:`open_groups` (:issue:`9430`, :pull:`9469`). By `Eni Awowale `_. +Performance +~~~~~~~~~~~ + +- Small optimizations to help reduce indexing speed of datasets (:pull:`9003`). + By `Mark Harfouche `_. + + Breaking changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 08885e3cd8d..6ca90a2c1e4 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -3039,20 +3039,40 @@ def isel( coord_names = self._coord_names.copy() indexes, index_variables = isel_indexes(self.xindexes, indexers) + all_keys = set(indexers.keys()) for name, var in self._variables.items(): # preserve variable order if name in index_variables: var = index_variables[name] - else: - var_indexers = {k: v for k, v in indexers.items() if k in var.dims} - if var_indexers: + dims.update(zip(var.dims, var.shape)) + # Fastpath, skip all this metadata analysis for variables + # with no dimensions + # Keep the result of var.dims cached for future accesss to it + # + # Optimization Note from hmaarrfk - 2024/06 + # https://github.com/pydata/xarray/pull/9003#discussion_r1592767493 + # It was found that accessing var.dims is faster than + # using var.shape or var.ndim since resolving both is typically + # left to the underlying array that each Xarray structure wraps. + # By using var.dims, we can avoid the cost of resolving the + # underlying array's shape and ndim since the dims are already + # cached by the Variable + elif len(var_dims := var.dims): + # Large datasets with alot of metadata may have many scalars + # without any relevant dimensions for slicing. + # Pick those out quickly and avoid paying the cost below + # of resolving the var_indexers variables + if var_indexer_keys := all_keys.intersection(var_dims): + var_indexers = {k: indexers[k] for k in var_indexer_keys} var = var.isel(var_indexers) if drop and var.ndim == 0 and name in coord_names: coord_names.remove(name) continue + # Update our reference to `var_dims` after the call to isel + var_dims = var.dims + dims.update(zip(var_dims, var.shape)) variables[name] = var - dims.update(zip(var.dims, var.shape, strict=True)) return self._construct_direct( variables=variables,