From 5a7c9039c80803c6d915066566cfd246ab334dd8 Mon Sep 17 00:00:00 2001 From: Mark Harfouche Date: Sun, 5 May 2024 21:41:58 -0400 Subject: [PATCH] Micro optimize dataset.isel for speed on large datasets This targets optimization for datasets with many "scalar" variables (that is variables without any dimensions). This can happen in the context where you have many pieces of small metadata that relate to various facts about an experimental condition. For example, we have about 80 of these in our datasets (and I want to increase this number) Our datasets are quite large (On the order of 1TB uncompressed) so we often have one dimension that is in the 10's of thousands. However, it has become quite slow to index in the dataset. We therefore often "carefully slice out the metadata we need" prior to doing anything with our dataset, but that isn't quite possible when you want to orchestrate things with a parent application. These optimizations are likely "minor" but considering the results of the benchmark, I think they are quite worthwhile: * main (as of #9001) - 2.5k its/s * With #9002 - 4.2k its/s * With this Pull Request (on top of #9002) -- 6.1k its/s Thanks for considering. 
--- xarray/core/dataset.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 872cb482fe8..fe430aac860 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -2981,20 +2981,30 @@ def isel( coord_names = self._coord_names.copy() indexes, index_variables = isel_indexes(self.xindexes, indexers) + all_keys = set(indexers.keys()) for name, var in self._variables.items(): # preserve variable order if name in index_variables: var = index_variables[name] - else: - var_indexers = {k: v for k, v in indexers.items() if k in var.dims} - if var_indexers: + dims.update(zip(var.dims, var.shape)) + # Fastpath, skip all of this for variables with no dimensions + # Keep the result cached for future dictionary update + elif var_dims := var.dims: + # Large datasets with a lot of metadata may have many scalars + # without any relevant dimensions for slicing. + # Pick those out quickly and avoid paying the cost below + # of resolving the var_indexers variables + if var_indexer_keys := all_keys.intersection(var_dims): + var_indexers = {k: indexers[k] for k in var_indexer_keys} var = var.isel(var_indexers) if drop and var.ndim == 0 and name in coord_names: coord_names.remove(name) continue + # Update our reference to `var_dims` after the call to isel + var_dims = var.dims + dims.update(zip(var_dims, var.shape)) variables[name] = var - dims.update(zip(var.dims, var.shape)) return self._construct_direct( variables=variables,