Ideas for micro optimization #29

hmaarrfk · 2024-05-05T22:49:00Z

What is your issue?

the call to self._parse_dimensions(dims) should be avoidable for internal routines.

However, it was giving me trouble in tests.

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 6f5826a5..311aac22 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -396,7 +396,10 @@ class Variable(NamedArray, AbstractArray, VariableArithmetic):
             unrecognized encoding items.
         """
         super().__init__(
-            dims=dims, data=as_compatible_data(data, fastpath=fastpath), attrs=attrs
+            dims=dims,
+            data=as_compatible_data(data, fastpath=fastpath),
+            attrs=attrs,
+            fastpath=fastpath,
         )

         self._encoding = None
@@ -2622,6 +2625,9 @@ class IndexVariable(Variable):

     def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
         super().__init__(dims, data, attrs, encoding, fastpath)
+        if fastpath:
+            return
+
         if self.ndim != 1:
             raise ValueError(f"{type(self).__name__} objects must be 1-dimensional")

diff --git a/xarray/namedarray/core.py b/xarray/namedarray/core.py
index 135dabc0..adceaa65 100644
--- a/xarray/namedarray/core.py
+++ b/xarray/namedarray/core.py
@@ -259,9 +259,14 @@ class NamedArray(NamedArrayAggregations, Generic[_ShapeType_co, _DType_co]):
         dims: _DimsLike,
         data: duckarray[Any, _DType_co],
         attrs: _AttrsLike = None,
+        *,
+        fastpath=True,
     ):
         self._data = data
-        self._dims = self._parse_dimensions(dims)
+        if fastpath:
+            self._dims = dims
+        else:
+            self._dims = self._parse_dimensions(dims)
         self._attrs = dict(attrs) if attrs else None

     def __init_subclass__(cls, **kwargs: Any) -> None:

The text was updated successfully, but these errors were encountered:

hmaarrfk · 2024-05-05T23:02:39Z

_apply_index is rather slow because of pandas multi index compatibility.

I'm not sure how to fix that:

From 1a659bcf8e0eced8e4309621f079a7f0ddf22fac Mon Sep 17 00:00:00 2001
From: Mark Harfouche <[email protected]>
Date: Sun, 5 May 2024 14:52:25 -0400
Subject: [PATCH] 12k->14.5k

---
 xarray/core/indexes.py  | 69 ++++++++++++++++++++++++++++++++++-------
 xarray/core/indexing.py |  7 +++--
 2 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py
index e71c4a6f073..22bdb0777d2 100644
--- a/xarray/core/indexes.py
+++ b/xarray/core/indexes.py
@@ -161,7 +161,10 @@ def unstack(self) -> tuple[dict[Hashable, Index], pd.MultiIndex]:
         raise NotImplementedError()
 
     def create_variables(
-        self, variables: Mapping[Any, Variable] | None = None
+        self,
+        variables: Mapping[Any, Variable] | None = None,
+        *,
+        fastpath=False,
     ) -> IndexVars:
         """Maybe create new coordinate variables from this index.
 
@@ -575,13 +578,19 @@ class PandasIndex(Index):
 
     __slots__ = ("index", "dim", "coord_dtype")
 
-    def __init__(self, array: Any, dim: Hashable, coord_dtype: Any = None):
-        # make a shallow copy: cheap and because the index name may be updated
-        # here or in other constructors (cannot use pd.Index.rename as this
-        # constructor is also called from PandasMultiIndex)
-        index = safe_cast_to_index(array).copy()
+    def __init__(
+        self, array: Any, dim: Hashable, coord_dtype: Any = None, *, fastpath=False
+    ):
+        if fastpath:
+            index = array
+        else:
+            index = safe_cast_to_index(array)
 
         if index.name is None:
+            # make a shallow copy: cheap and because the index name may be updated
+            # here or in other constructors (cannot use pd.Index.rename as this
+            # constructor is also called from PandasMultiIndex)
+            index = index.copy()
             index.name = dim
 
         self.index = index
@@ -596,7 +605,7 @@ def _replace(self, index, dim=None, coord_dtype=None):
             dim = self.dim
         if coord_dtype is None:
             coord_dtype = self.coord_dtype
-        return type(self)(index, dim, coord_dtype)
+        return type(self)(index, dim, coord_dtype, fastpath=True)
 
     @classmethod
     def from_variables(
@@ -641,6 +650,8 @@ def from_variables(
 
         obj = cls(data, dim, coord_dtype=var.dtype)
         assert not isinstance(obj.index, pd.MultiIndex)
+        # Rename safely
+        obj.index = obj.index.copy()
         obj.index.name = name
 
         return obj
@@ -684,7 +695,7 @@ def concat(
         return cls(new_pd_index, dim=dim, coord_dtype=coord_dtype)
 
     def create_variables(
-        self, variables: Mapping[Any, Variable] | None = None
+        self, variables: Mapping[Any, Variable] | None = None, *, fastpath=False
     ) -> IndexVars:
         from xarray.core.variable import IndexVariable
 
@@ -701,7 +712,9 @@ def create_variables(
             encoding = None
 
         data = PandasIndexingAdapter(self.index, dtype=self.coord_dtype)
-        var = IndexVariable(self.dim, data, attrs=attrs, encoding=encoding)
+        var = IndexVariable(
+            self.dim, data, attrs=attrs, encoding=encoding, fastpath=fastpath
+        )
         return {name: var}
 
     def to_pandas_index(self) -> pd.Index:
@@ -1122,7 +1135,7 @@ def reorder_levels(
         return self._replace(index, level_coords_dtype=level_coords_dtype)
 
     def create_variables(
-        self, variables: Mapping[Any, Variable] | None = None
+        self, variables: Mapping[Any, Variable] | None = None, *, fastpath=False
     ) -> IndexVars:
         from xarray.core.variable import IndexVariable
 
@@ -1772,6 +1785,37 @@ def check_variables():
     return not not_equal
 
 
+def _apply_indexes_fast(indexes: Indexes[Index], args: Mapping[Any, Any], func: str):
+    # This function avoids the call to indexes.group_by_index
+    # which is really slow when repeatidly iterating through
+    # an array. However, it fails to return the correct ID for
+    # multi-index arrays
+    indexes_fast, coords = indexes._indexes, indexes._variables
+
+    new_indexes: dict[Hashable, Index] = {k: v for k, v in indexes_fast.items()}
+    new_index_variables: dict[Hashable, Variable] = {}
+    for name, index in indexes_fast.items():
+        coord = coords[name]
+        if hasattr(coord, "_indexes"):
+            index_vars = {n: coords[n] for n in coord._indexes}
+        else:
+            index_vars = {name: coord}
+        index_dims = {d for var in index_vars.values() for d in var.dims}
+        index_args = {k: v for k, v in args.items() if k in index_dims}
+
+        if index_args:
+            new_index = getattr(index, func)(index_args)
+            if new_index is not None:
+                new_indexes.update({k: new_index for k in index_vars})
+                new_index_vars = new_index.create_variables(index_vars, fastpath=True)
+                new_index_variables.update(new_index_vars)
+                new_index_variables.update(new_index_vars)
+            else:
+                for k in index_vars:
+                    new_indexes.pop(k, None)
+    return new_indexes, new_index_variables
+
+
 def _apply_indexes(
     indexes: Indexes[Index],
     args: Mapping[Any, Any],
@@ -1800,7 +1844,10 @@ def isel_indexes(
     indexes: Indexes[Index],
     indexers: Mapping[Any, Any],
 ) -> tuple[dict[Hashable, Index], dict[Hashable, Variable]]:
-    return _apply_indexes(indexes, indexers, "isel")
+    if any(isinstance(v, PandasMultiIndex) for v in indexes._indexes.values()):
+        return _apply_indexes(indexes, indexers, "isel")
+    else:
+        return _apply_indexes_fast(indexes, indexers, "isel")
 
 
 def roll_indexes(
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
index 4909d506d7b..02f1fb262c5 100644
--- a/xarray/core/indexing.py
+++ b/xarray/core/indexing.py
@@ -1662,10 +1662,13 @@ class PandasIndexingAdapter(ExplicitlyIndexedNDArrayMixin):
 
     __slots__ = ("array", "_dtype")
 
-    def __init__(self, array: pd.Index, dtype: DTypeLike = None):
+    def __init__(self, array: pd.Index, dtype: DTypeLike = None, *, fastpath=False):
         from xarray.core.indexes import safe_cast_to_index
 
-        self.array = safe_cast_to_index(array)
+        if fastpath:
+            self.array = array
+        else:
+            self.array = safe_cast_to_index(array)
 
         if dtype is None:
             self._dtype = get_valid_numpy_dtype(array)

hmaarrfk mentioned this issue May 5, 2024

Micro optimizations to improve indexing pydata/xarray#9002

Merged

4 tasks

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Ideas for micro optimization #29

Ideas for micro optimization #29

hmaarrfk commented May 5, 2024

hmaarrfk commented May 5, 2024

Ideas for micro optimization #29

Ideas for micro optimization #29

Comments

hmaarrfk commented May 5, 2024

What is your issue?

hmaarrfk commented May 5, 2024