Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ideas for micro optimization #29

Open
hmaarrfk opened this issue May 5, 2024 · 1 comment
Open

Ideas for micro optimization #29

hmaarrfk opened this issue May 5, 2024 · 1 comment

Comments

@hmaarrfk
Copy link
Owner

hmaarrfk commented May 5, 2024

What is your issue?

the call to self._parse_dimensions(dims) should be avoidable for internal routines.

However, it was giving me trouble in tests.

diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 6f5826a5..311aac22 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -396,7 +396,10 @@ class Variable(NamedArray, AbstractArray, VariableArithmetic):
             unrecognized encoding items.
         """
         super().__init__(
-            dims=dims, data=as_compatible_data(data, fastpath=fastpath), attrs=attrs
+            dims=dims,
+            data=as_compatible_data(data, fastpath=fastpath),
+            attrs=attrs,
+            fastpath=fastpath,
         )

         self._encoding = None
@@ -2622,6 +2625,9 @@ class IndexVariable(Variable):

     def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
         super().__init__(dims, data, attrs, encoding, fastpath)
+        if fastpath:
+            return
+
         if self.ndim != 1:
             raise ValueError(f"{type(self).__name__} objects must be 1-dimensional")

diff --git a/xarray/namedarray/core.py b/xarray/namedarray/core.py
index 135dabc0..adceaa65 100644
--- a/xarray/namedarray/core.py
+++ b/xarray/namedarray/core.py
@@ -259,9 +259,14 @@ class NamedArray(NamedArrayAggregations, Generic[_ShapeType_co, _DType_co]):
         dims: _DimsLike,
         data: duckarray[Any, _DType_co],
         attrs: _AttrsLike = None,
+        *,
+        fastpath=True,
     ):
         self._data = data
-        self._dims = self._parse_dimensions(dims)
+        if fastpath:
+            self._dims = dims
+        else:
+            self._dims = self._parse_dimensions(dims)
         self._attrs = dict(attrs) if attrs else None

     def __init_subclass__(cls, **kwargs: Any) -> None:
@hmaarrfk
Copy link
Owner Author

hmaarrfk commented May 5, 2024

_apply_index is rather slow because of pandas multi index compatibility.

I'm not sure how to fix that:

From 1a659bcf8e0eced8e4309621f079a7f0ddf22fac Mon Sep 17 00:00:00 2001
From: Mark Harfouche <[email protected]>
Date: Sun, 5 May 2024 14:52:25 -0400
Subject: [PATCH] 12k->14.5k

---
 xarray/core/indexes.py  | 69 ++++++++++++++++++++++++++++++++++-------
 xarray/core/indexing.py |  7 +++--
 2 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/xarray/core/indexes.py b/xarray/core/indexes.py
index e71c4a6f073..22bdb0777d2 100644
--- a/xarray/core/indexes.py
+++ b/xarray/core/indexes.py
@@ -161,7 +161,10 @@ def unstack(self) -> tuple[dict[Hashable, Index], pd.MultiIndex]:
         raise NotImplementedError()
 
     def create_variables(
-        self, variables: Mapping[Any, Variable] | None = None
+        self,
+        variables: Mapping[Any, Variable] | None = None,
+        *,
+        fastpath=False,
     ) -> IndexVars:
         """Maybe create new coordinate variables from this index.
 
@@ -575,13 +578,19 @@ class PandasIndex(Index):
 
     __slots__ = ("index", "dim", "coord_dtype")
 
-    def __init__(self, array: Any, dim: Hashable, coord_dtype: Any = None):
-        # make a shallow copy: cheap and because the index name may be updated
-        # here or in other constructors (cannot use pd.Index.rename as this
-        # constructor is also called from PandasMultiIndex)
-        index = safe_cast_to_index(array).copy()
+    def __init__(
+        self, array: Any, dim: Hashable, coord_dtype: Any = None, *, fastpath=False
+    ):
+        if fastpath:
+            index = array
+        else:
+            index = safe_cast_to_index(array)
 
         if index.name is None:
+            # make a shallow copy: cheap and because the index name may be updated
+            # here or in other constructors (cannot use pd.Index.rename as this
+            # constructor is also called from PandasMultiIndex)
+            index = index.copy()
             index.name = dim
 
         self.index = index
@@ -596,7 +605,7 @@ def _replace(self, index, dim=None, coord_dtype=None):
             dim = self.dim
         if coord_dtype is None:
             coord_dtype = self.coord_dtype
-        return type(self)(index, dim, coord_dtype)
+        return type(self)(index, dim, coord_dtype, fastpath=True)
 
     @classmethod
     def from_variables(
@@ -641,6 +650,8 @@ def from_variables(
 
         obj = cls(data, dim, coord_dtype=var.dtype)
         assert not isinstance(obj.index, pd.MultiIndex)
+        # Rename safely
+        obj.index = obj.index.copy()
         obj.index.name = name
 
         return obj
@@ -684,7 +695,7 @@ def concat(
         return cls(new_pd_index, dim=dim, coord_dtype=coord_dtype)
 
     def create_variables(
-        self, variables: Mapping[Any, Variable] | None = None
+        self, variables: Mapping[Any, Variable] | None = None, *, fastpath=False
     ) -> IndexVars:
         from xarray.core.variable import IndexVariable
 
@@ -701,7 +712,9 @@ def create_variables(
             encoding = None
 
         data = PandasIndexingAdapter(self.index, dtype=self.coord_dtype)
-        var = IndexVariable(self.dim, data, attrs=attrs, encoding=encoding)
+        var = IndexVariable(
+            self.dim, data, attrs=attrs, encoding=encoding, fastpath=fastpath
+        )
         return {name: var}
 
     def to_pandas_index(self) -> pd.Index:
@@ -1122,7 +1135,7 @@ def reorder_levels(
         return self._replace(index, level_coords_dtype=level_coords_dtype)
 
     def create_variables(
-        self, variables: Mapping[Any, Variable] | None = None
+        self, variables: Mapping[Any, Variable] | None = None, *, fastpath=False
     ) -> IndexVars:
         from xarray.core.variable import IndexVariable
 
@@ -1772,6 +1785,37 @@ def check_variables():
     return not not_equal
 
 
+def _apply_indexes_fast(indexes: Indexes[Index], args: Mapping[Any, Any], func: str):
+    # This function avoids the call to indexes.group_by_index
+    # which is really slow when repeatidly iterating through
+    # an array. However, it fails to return the correct ID for
+    # multi-index arrays
+    indexes_fast, coords = indexes._indexes, indexes._variables
+
+    new_indexes: dict[Hashable, Index] = {k: v for k, v in indexes_fast.items()}
+    new_index_variables: dict[Hashable, Variable] = {}
+    for name, index in indexes_fast.items():
+        coord = coords[name]
+        if hasattr(coord, "_indexes"):
+            index_vars = {n: coords[n] for n in coord._indexes}
+        else:
+            index_vars = {name: coord}
+        index_dims = {d for var in index_vars.values() for d in var.dims}
+        index_args = {k: v for k, v in args.items() if k in index_dims}
+
+        if index_args:
+            new_index = getattr(index, func)(index_args)
+            if new_index is not None:
+                new_indexes.update({k: new_index for k in index_vars})
+                new_index_vars = new_index.create_variables(index_vars, fastpath=True)
+                new_index_variables.update(new_index_vars)
+                new_index_variables.update(new_index_vars)
+            else:
+                for k in index_vars:
+                    new_indexes.pop(k, None)
+    return new_indexes, new_index_variables
+
+
 def _apply_indexes(
     indexes: Indexes[Index],
     args: Mapping[Any, Any],
@@ -1800,7 +1844,10 @@ def isel_indexes(
     indexes: Indexes[Index],
     indexers: Mapping[Any, Any],
 ) -> tuple[dict[Hashable, Index], dict[Hashable, Variable]]:
-    return _apply_indexes(indexes, indexers, "isel")
+    if any(isinstance(v, PandasMultiIndex) for v in indexes._indexes.values()):
+        return _apply_indexes(indexes, indexers, "isel")
+    else:
+        return _apply_indexes_fast(indexes, indexers, "isel")
 
 
 def roll_indexes(
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
index 4909d506d7b..02f1fb262c5 100644
--- a/xarray/core/indexing.py
+++ b/xarray/core/indexing.py
@@ -1662,10 +1662,13 @@ class PandasIndexingAdapter(ExplicitlyIndexedNDArrayMixin):
 
     __slots__ = ("array", "_dtype")
 
-    def __init__(self, array: pd.Index, dtype: DTypeLike = None):
+    def __init__(self, array: pd.Index, dtype: DTypeLike = None, *, fastpath=False):
         from xarray.core.indexes import safe_cast_to_index
 
-        self.array = safe_cast_to_index(array)
+        if fastpath:
+            self.array = array
+        else:
+            self.array = safe_cast_to_index(array)
 
         if dtype is None:
             self._dtype = get_valid_numpy_dtype(array)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant