From 8169cd8026495538c6a7e4b4459e32c77bc1344e Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sun, 15 Oct 2023 22:38:23 -0600 Subject: [PATCH 1/6] Enable numbagg for reductions --- xarray/core/nputils.py | 31 ++++++++++++++++++++++++++++--- xarray/core/options.py | 7 +++++++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index c49a06dfc9c..b2c7a31bf13 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -18,11 +18,20 @@ try: import bottleneck as bn - _USE_BOTTLENECK = True + _BOTTLENECK_AVAILABLE = True except ImportError: # use numpy methods instead bn = np - _USE_BOTTLENECK = False + _BOTTLENECK_AVAILABLE = False + +try: + import numbagg + + _NUMBAGG_AVAILABLE = True +except ImportError: + # use numpy methods instead + numbagg = np + _NUMBAGG_AVAILABLE = False def _select_along_axis(values, idx, axis): @@ -165,9 +174,25 @@ def _create_bottleneck_method(name, npmodule=np): def f(values, axis=None, **kwargs): dtype = kwargs.get("dtype", None) bn_func = getattr(bn, name, None) + nba_func = getattr(numbagg, name, None) if ( - _USE_BOTTLENECK + _NUMBAGG_AVAILABLE + and OPTIONS["use_numbagg"] + and isinstance(values, np.ndarray) + and nba_func is not None + # numbagg uses ddof=1 only + and "var" not in name + # TODO: bool? + and values.dtype.kind in "uifc" + # and values.dtype.isnative + and (dtype is None or np.dtype(dtype) == values.dtype) + ): + # bottleneck does not take care dtype, min_count + kwargs.pop("dtype", None) + result = nba_func(values, axis=axis, **kwargs) + elif ( + _BOTTLENECK_AVAILABLE and OPTIONS["use_bottleneck"] and isinstance(values, np.ndarray) and bn_func is not None diff --git a/xarray/core/options.py b/xarray/core/options.py index a197cb4da10..118a67559ad 100644 --- a/xarray/core/options.py +++ b/xarray/core/options.py @@ -27,6 +27,7 @@ "keep_attrs", "warn_for_unclosed_files", "use_bottleneck", + "use_numbagg", "use_flox", ] @@ -50,6 +51,7 @@ class T_Options(TypedDict): warn_for_unclosed_files: bool use_bottleneck: bool use_flox: bool + use_numbagg: bool OPTIONS: T_Options = { @@ -72,6 +74,7 @@ class T_Options(TypedDict): "warn_for_unclosed_files": False, "use_bottleneck": True, "use_flox": True, + "use_numbagg": True, } _JOIN_OPTIONS = frozenset(["inner", "outer", "left", "right", "exact"]) @@ -98,6 +101,7 @@ def _positive_integer(value: int) -> bool: "file_cache_maxsize": _positive_integer, "keep_attrs": lambda choice: choice in [True, False, "default"], "use_bottleneck": lambda value: isinstance(value, bool), + "use_numbagg": lambda value: isinstance(value, bool), "use_flox": lambda value: isinstance(value, bool), "warn_for_unclosed_files": lambda value: isinstance(value, bool), } @@ -230,6 +234,9 @@ class set_options: use_flox : bool, default: True Whether to use ``numpy_groupies`` and `flox`` to accelerate groupby and resampling reductions. + use_numbagg : bool, default: True + Whether to use ``numbagg`` to accelerate reductions. + Takes precedence over ``use_bottleneck`` when both are True. warn_for_unclosed_files : bool, default: False Whether or not to issue a warning when unclosed files are deallocated. This is mostly useful for debugging. From f507678a80b8b4f0b05c3e2c0bf38f84c51c1386 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sun, 15 Oct 2023 23:05:42 -0600 Subject: [PATCH 2/6] Skip std too --- xarray/core/nputils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index b2c7a31bf13..5051df81c9e 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -183,6 +183,7 @@ def f(values, axis=None, **kwargs): and nba_func is not None # numbagg uses ddof=1 only and "var" not in name + and "std" not in name # TODO: bool? and values.dtype.kind in "uifc" # and values.dtype.isnative From f65dfacf6dc26eb596c5f107d085806529de4afb Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sun, 15 Oct 2023 23:19:20 -0600 Subject: [PATCH 3/6] Better ddof check --- xarray/core/nputils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 5051df81c9e..ca6e4858d8b 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -182,8 +182,7 @@ def f(values, axis=None, **kwargs): and isinstance(values, np.ndarray) and nba_func is not None # numbagg uses ddof=1 only - and "var" not in name - and "std" not in name + and (("var" in name or "std" not in name) and kwargs.get("ddof", 0) == 1) # TODO: bool? and values.dtype.kind in "uifc" # and values.dtype.isnative From 5b99ab1c9d44c1df5b4ab89679f1a7046a2401a4 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sun, 15 Oct 2023 23:25:23 -0600 Subject: [PATCH 4/6] better ddof detection --- xarray/core/nputils.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index ca6e4858d8b..9678df41a7e 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -170,7 +170,7 @@ def __setitem__(self, key, value): self._array[key] = np.moveaxis(value, vindex_positions, mixed_positions) -def _create_bottleneck_method(name, npmodule=np): +def _create_method(name, npmodule=np): def f(values, axis=None, **kwargs): dtype = kwargs.get("dtype", None) bn_func = getattr(bn, name, None) @@ -182,14 +182,15 @@ def f(values, axis=None, **kwargs): and isinstance(values, np.ndarray) and nba_func is not None # numbagg uses ddof=1 only - and (("var" in name or "std" not in name) and kwargs.get("ddof", 0) == 1) + and (("var" in name or "std" in name) and kwargs.get("ddof", 0) == 1) # TODO: bool? and values.dtype.kind in "uifc" # and values.dtype.isnative and (dtype is None or np.dtype(dtype) == values.dtype) ): - # bottleneck does not take care dtype, min_count + # numbagg does not take care dtype, ddof kwargs.pop("dtype", None) + kwargs.pop("ddof", None) result = nba_func(values, axis=axis, **kwargs) elif ( _BOTTLENECK_AVAILABLE @@ -258,14 +259,14 @@ def least_squares(lhs, rhs, rcond=None, skipna=False): return coeffs, residuals -nanmin = _create_bottleneck_method("nanmin") -nanmax = _create_bottleneck_method("nanmax") -nanmean = _create_bottleneck_method("nanmean") -nanmedian = _create_bottleneck_method("nanmedian") -nanvar = _create_bottleneck_method("nanvar") -nanstd = _create_bottleneck_method("nanstd") -nanprod = _create_bottleneck_method("nanprod") -nancumsum = _create_bottleneck_method("nancumsum") -nancumprod = _create_bottleneck_method("nancumprod") -nanargmin = _create_bottleneck_method("nanargmin") -nanargmax = _create_bottleneck_method("nanargmax") +nanmin = _create_method("nanmin") +nanmax = _create_method("nanmax") +nanmean = _create_method("nanmean") +nanmedian = _create_method("nanmedian") +nanvar = _create_method("nanvar") +nanstd = _create_method("nanstd") +nanprod = _create_method("nanprod") +nancumsum = _create_method("nancumsum") +nancumprod = _create_method("nancumprod") +nanargmin = _create_method("nanargmin") +nanargmax = _create_method("nanargmax") From 9a2599d7cd3717631bd6ec9ec32b175d218d40d9 Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Sun, 15 Oct 2023 23:33:37 -0600 Subject: [PATCH 5/6] numbagg uses ddof=0 as default! --- xarray/core/nputils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 9678df41a7e..244f5d9faf6 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -182,7 +182,7 @@ def f(values, axis=None, **kwargs): and isinstance(values, np.ndarray) and nba_func is not None # numbagg uses ddof=1 only - and (("var" in name or "std" in name) and kwargs.get("ddof", 0) == 1) + and (("var" in name or "std" in name) and kwargs.get("ddof", 1) == 0) # TODO: bool? and values.dtype.kind in "uifc" # and values.dtype.isnative From d42128ae7aba486e88e4536e250490abdefed2af Mon Sep 17 00:00:00 2001 From: Deepak Cherian Date: Tue, 17 Oct 2023 12:12:19 -0600 Subject: [PATCH 6/6] Min version 0.5.0 --- xarray/core/nputils.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/xarray/core/nputils.py b/xarray/core/nputils.py index 244f5d9faf6..316a77ead6a 100644 --- a/xarray/core/nputils.py +++ b/xarray/core/nputils.py @@ -5,6 +5,7 @@ import numpy as np import pandas as pd from numpy.core.multiarray import normalize_axis_index # type: ignore[attr-defined] +from packaging.version import Version # remove once numpy 2.0 is the oldest supported version try: @@ -27,11 +28,11 @@ try: import numbagg - _NUMBAGG_AVAILABLE = True + _HAS_NUMBAGG = Version(numbagg.__version__) >= Version("0.5.0") except ImportError: # use numpy methods instead numbagg = np - _NUMBAGG_AVAILABLE = False + _HAS_NUMBAGG = False def _select_along_axis(values, idx, axis): @@ -177,12 +178,12 @@ def f(values, axis=None, **kwargs): nba_func = getattr(numbagg, name, None) if ( - _NUMBAGG_AVAILABLE + _HAS_NUMBAGG and OPTIONS["use_numbagg"] and isinstance(values, np.ndarray) and nba_func is not None - # numbagg uses ddof=1 only - and (("var" in name or "std" in name) and kwargs.get("ddof", 1) == 0) + # numbagg uses ddof=1 only, but numpy uses ddof=0 by default + and (("var" in name or "std" in name) and kwargs.get("ddof", 0) == 1) # TODO: bool? and values.dtype.kind in "uifc" # and values.dtype.isnative