From c3b5ead8cdbce38157265fd449a2a641cc118066 Mon Sep 17 00:00:00 2001
From: Anderson Banihirwe <13301940+andersy005@users.noreply.github.com>
Date: Wed, 27 Sep 2023 08:23:20 -0700
Subject: [PATCH] initial refactor for NamedArray (#8075)

* initial prototype for NamedArray
* move NDArrayMixin and NdimSizeLenMixin inside named_array
* vendor is_duck_dask_array
* vendor Frozen object
* update import
* move _default sentinel value
* rename subpackage to namedarray per @TomNicholas suggestion
* Remove NdimSizeLenMixin
* fix typing
* add annotations
* Remove NDArrayMixin
* Apply suggestions from code review

  Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>

* fix typing
* fix return type
* revert NDArrayMixin
* [WIP] as_compatible_data refactor
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* duplicate sentinel value and leave the original sentinel object alone
* Apply suggestions from code review

  Co-authored-by: Stephan Hoyer

* use DuckArray
* Apply suggestions from code review

  Co-authored-by: Stephan Hoyer

* use sentinel value from xarray
* remove unused code
* fix variable constructor
* fix as_compatible_data utility function
* move _to_dense and _non_zero to NamedArray
* more typing
* add initial tests
* Apply suggestions from code review

  Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* attempt to fix some mypy errors
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* Update core.py
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* Update core.py
* All input data can be arraylike
* Update core.py
* Update core.py
* get and set attrs at the same level.
* data doesn't have to be ndarray
* avoid redefining typing use new variable names instead
* import on runtime as well to be able to cast
* requires ufunc and function to be a valid duck array
* Add array_namespace
* Update test_dataset.py
* Update test_dataset.py
* remove Frozen
* update tests
* update tests
* switch to functional API
* add fastpath
* Test making sizes dict[Hashable, int]
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* A lot of errors... Try Mapping instead
* Update groupby.py
* Update types.py
* Apply suggestions from code review

  Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
  Co-authored-by: Deepak Cherian

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* update docstrings
* update error messages
* update tests
* test explicitly index array
* update tests
* remove unused types
* Update xarray/tests/test_namedarray.py

  Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

* use Self

---------

Co-authored-by: Illviljan <14371165+Illviljan@users.noreply.github.com>
Co-authored-by: dcherian
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Stephan Hoyer
Co-authored-by: Deepak Cherian
---
 xarray/core/common.py           |   2 +-
 xarray/core/groupby.py          |   4 +-
 xarray/core/types.py            |   2 +-
 xarray/core/variable.py         | 297 +--------------------
 xarray/namedarray/__init__.py   |   0
 xarray/namedarray/core.py       | 447 ++++++++++++++++++++++++++++++++
 xarray/namedarray/utils.py      |  68 +++++
 xarray/tests/test_dataset.py    |   6 +-
 xarray/tests/test_formatting.py |   2 +-
 xarray/tests/test_namedarray.py | 165 ++++++++++++
 10 files changed, 700 insertions(+), 293 deletions(-)
 create mode 100644 xarray/namedarray/__init__.py
 create mode 100644 xarray/namedarray/core.py
 create mode 100644 xarray/namedarray/utils.py
 create mode 100644 xarray/tests/test_namedarray.py

diff --git a/xarray/core/common.py b/xarray/core/common.py
index e4e3e60e815..db9b2aead23 100644
--- a/xarray/core/common.py
+++ b/xarray/core/common.py
@@ -223,7 +223,7 @@ def _get_axis_num(self: Any, dim: Hashable) -> int:
         raise ValueError(f"{dim!r} not found in array dimensions {self.dims!r}")
 
     @property
-    def sizes(self: Any) -> Frozen[Hashable, int]:
+    def sizes(self: Any) -> Mapping[Hashable, int]:
         """Ordered mapping from dimension names to lengths.
 
         Immutable.
diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py
index 9894a4a4daf..e9ddf044568 100644
--- a/xarray/core/groupby.py
+++ b/xarray/core/groupby.py
@@ -699,7 +699,7 @@ class GroupBy(Generic[T_Xarray]):
     _groups: dict[GroupKey, GroupIndex] | None
     _dims: tuple[Hashable, ...] | Frozen[Hashable, int] | None
-    _sizes: Frozen[Hashable, int] | None
+    _sizes: Mapping[Hashable, int] | None
 
     def __init__(
         self,
@@ -746,7 +746,7 @@ def __init__(
         self._sizes = None
 
     @property
-    def sizes(self) -> Frozen[Hashable, int]:
+    def sizes(self) -> Mapping[Hashable, int]:
         """Ordered mapping from dimension names to lengths.
 
         Immutable.
diff --git a/xarray/core/types.py b/xarray/core/types.py
index 073121b13b1..bbcda7ca240 100644
--- a/xarray/core/types.py
+++ b/xarray/core/types.py
@@ -106,7 +106,7 @@ def dims(self) -> Frozen[Hashable, int] | tuple[Hashable, ...]:
         ...
 
     @property
-    def sizes(self) -> Frozen[Hashable, int]:
+    def sizes(self) -> Mapping[Hashable, int]:
         ...
 
     @property
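The three hunks above loosen the sizes annotation from the concrete Frozen[Hashable, int] wrapper to the abstract collections.abc.Mapping, so that NamedArray.sizes (introduced below as a plain dict) satisfies the same interface. A minimal sketch of why the abstract annotation is the right contract; the helper name make_sizes is hypothetical:

    from collections.abc import Hashable, Mapping

    def make_sizes(
        dims: tuple[Hashable, ...], shape: tuple[int, ...]
    ) -> Mapping[Hashable, int]:
        # Any read-only mapping satisfies Mapping: a plain dict,
        # xarray's Frozen wrapper, or types.MappingProxyType all work.
        return dict(zip(dims, shape))

    assert make_sizes(("x", "y"), (2, 3)) == {"x": 2, "y": 3}
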
diff --git a/xarray/core/variable.py b/xarray/core/variable.py
index 2571b093450..0e6e45d4929 100644
--- a/xarray/core/variable.py
+++ b/xarray/core/variable.py
@@ -26,10 +26,7 @@
     as_indexable,
 )
 from xarray.core.options import OPTIONS, _get_keep_attrs
-from xarray.core.parallelcompat import (
-    get_chunked_array_type,
-    guess_chunkmanager,
-)
+from xarray.core.parallelcompat import get_chunked_array_type, guess_chunkmanager
 from xarray.core.pycompat import (
     array_type,
     integer_types,
@@ -38,8 +35,6 @@
     is_duck_dask_array,
 )
 from xarray.core.utils import (
-    Frozen,
-    NdimSizeLenMixin,
     OrderedSet,
     _default,
     decode_numpy_dict_values,
@@ -50,6 +45,7 @@
     is_duck_array,
     maybe_coerce_to_str,
 )
+from xarray.namedarray.core import NamedArray
 
 NON_NUMPY_SUPPORTED_ARRAY_TYPES = (
     indexing.ExplicitlyIndexed,
@@ -268,7 +264,7 @@ def as_compatible_data(
         data = np.timedelta64(getattr(data, "value", data), "ns")
 
     # we don't want nested self-described arrays
-    if isinstance(data, (pd.Series, pd.Index, pd.DataFrame)):
+    if isinstance(data, (pd.Series, pd.DataFrame)):
         data = data.values
 
     if isinstance(data, np.ma.MaskedArray):
@@ -315,7 +311,7 @@ def _as_array_or_item(data):
     return data
 
 
-class Variable(AbstractArray, NdimSizeLenMixin, VariableArithmetic):
+class Variable(NamedArray, AbstractArray, VariableArithmetic):
     """A netcdf-like variable consisting of dimensions, data and attributes
     which describe a single Array. A single Variable object is not fully
     described outside the context of its parent Dataset (if you want such a
@@ -365,51 +361,14 @@ def __init__(
             Well-behaved code to serialize a Variable should ignore
             unrecognized encoding items.
         """
-        self._data: T_DuckArray = as_compatible_data(data, fastpath=fastpath)
-        self._dims = self._parse_dimensions(dims)
-        self._attrs: dict[Any, Any] | None = None
+        super().__init__(
+            dims=dims, data=as_compatible_data(data, fastpath=fastpath), attrs=attrs
+        )
+
         self._encoding = None
-        if attrs is not None:
-            self.attrs = attrs
         if encoding is not None:
             self.encoding = encoding
 
-    @property
-    def dtype(self) -> np.dtype:
-        """
-        Data-type of the array’s elements.
-
-        See Also
-        --------
-        ndarray.dtype
-        numpy.dtype
-        """
-        return self._data.dtype
-
-    @property
-    def shape(self) -> tuple[int, ...]:
-        """
-        Tuple of array dimensions.
-
-        See Also
-        --------
-        numpy.ndarray.shape
-        """
-        return self._data.shape
-
-    @property
-    def nbytes(self) -> int:
-        """
-        Total bytes consumed by the elements of the data array.
-
-        If the underlying data array does not include ``nbytes``, estimates
-        the bytes consumed based on the ``size`` and ``dtype``.
-        """
-        if hasattr(self._data, "nbytes"):
-            return self._data.nbytes
-        else:
-            return self.size * self.dtype.itemsize
-
     @property
     def _in_memory(self):
         return isinstance(
@@ -441,11 +400,7 @@ def data(self):
     @data.setter
     def data(self, data: T_DuckArray | ArrayLike) -> None:
         data = as_compatible_data(data)
-        if data.shape != self.shape:  # type: ignore[attr-defined]
-            raise ValueError(
-                f"replacement data must match the Variable's shape. "
-                f"replacement data has shape {data.shape}; Variable has shape {self.shape}"  # type: ignore[attr-defined]
-            )
+        self._check_shape(data)
         self._data = data
 
     def astype(
@@ -571,41 +526,6 @@ def compute(self, **kwargs):
         new = self.copy(deep=False)
         return new.load(**kwargs)
 
-    def __dask_tokenize__(self):
-        # Use v.data, instead of v._data, in order to cope with the wrappers
-        # around NetCDF and the like
-        from dask.base import normalize_token
-
-        return normalize_token((type(self), self._dims, self.data, self.attrs))
-
-    def __dask_graph__(self):
-        if is_duck_dask_array(self._data):
-            return self._data.__dask_graph__()
-        else:
-            return None
-
-    def __dask_keys__(self):
-        return self._data.__dask_keys__()
-
-    def __dask_layers__(self):
-        return self._data.__dask_layers__()
-
-    @property
-    def __dask_optimize__(self):
-        return self._data.__dask_optimize__
-
-    @property
-    def __dask_scheduler__(self):
-        return self._data.__dask_scheduler__
-
-    def __dask_postcompute__(self):
-        array_func, array_args = self._data.__dask_postcompute__()
-        return self._dask_finalize, (array_func,) + array_args
-
-    def __dask_postpersist__(self):
-        array_func, array_args = self._data.__dask_postpersist__()
-        return self._dask_finalize, (array_func,) + array_args
-
     def _dask_finalize(self, results, array_func, *args, **kwargs):
         data = array_func(results, *args, **kwargs)
         return Variable(self._dims, data, attrs=self._attrs, encoding=self._encoding)
@@ -667,27 +587,6 @@ def to_dict(
 
         return item
 
-    @property
-    def dims(self) -> tuple[Hashable, ...]:
-        """Tuple of dimension names with which this variable is associated."""
-        return self._dims
-
-    @dims.setter
-    def dims(self, value: str | Iterable[Hashable]) -> None:
-        self._dims = self._parse_dimensions(value)
-
-    def _parse_dimensions(self, dims: str | Iterable[Hashable]) -> tuple[Hashable, ...]:
-        if isinstance(dims, str):
-            dims = (dims,)
-        else:
-            dims = tuple(dims)
-        if len(dims) != self.ndim:
-            raise ValueError(
-                f"dimensions {dims} must have the same length as the "
-                f"number of data dimensions, ndim={self.ndim}"
-            )
-        return dims
-
     def _item_key_to_tuple(self, key):
         if utils.is_dict_like(key):
             return tuple(key.get(dim, slice(None)) for dim in self.dims)
@@ -820,13 +719,6 @@ def _broadcast_indexes_outer(self, key):
 
         return dims, OuterIndexer(tuple(new_key)), None
 
-    def _nonzero(self):
-        """Equivalent numpy's nonzero but returns a tuple of Variables."""
-        # TODO we should replace dask's native nonzero
-        # after https://github.com/dask/dask/issues/1076 is implemented.
-        nonzeros = np.nonzero(self.data)
-        return tuple(Variable((dim), nz) for nz, dim in zip(nonzeros, self.dims))
-
     def _broadcast_indexes_vectorized(self, key):
         variables = []
         out_dims_set = OrderedSet()
@@ -976,17 +868,6 @@ def __setitem__(self, key, value):
         indexable = as_indexable(self._data)
         indexable[index_tuple] = value
 
-    @property
-    def attrs(self) -> dict[Any, Any]:
-        """Dictionary of local attributes on this variable."""
-        if self._attrs is None:
-            self._attrs = {}
-        return self._attrs
-
-    @attrs.setter
-    def attrs(self, value: Mapping[Any, Any]) -> None:
-        self._attrs = dict(value)
-
     @property
     def encoding(self) -> dict[Any, Any]:
         """Dictionary of encodings on this variable."""
@@ -1005,66 +886,6 @@ def reset_encoding(self) -> Self:
         """Return a new Variable without encoding."""
         return self._replace(encoding={})
 
-    def copy(
-        self, deep: bool = True, data: T_DuckArray | ArrayLike | None = None
-    ) -> Self:
-        """Returns a copy of this object.
-
-        If `deep=True`, the data array is loaded into memory and copied onto
-        the new object. Dimensions, attributes and encodings are always copied.
-
-        Use `data` to create a new object with the same structure as
-        original but entirely new data.
-
-        Parameters
-        ----------
-        deep : bool, default: True
-            Whether the data array is loaded into memory and copied onto
-            the new object. Default is True.
-        data : array_like, optional
-            Data to use in the new object. Must have same shape as original.
-            When `data` is used, `deep` is ignored.
-
-        Returns
-        -------
-        object : Variable
-            New object with dimensions, attributes, encodings, and optionally
-            data copied from original.
-
-        Examples
-        --------
-        Shallow copy versus deep copy
-
-        >>> var = xr.Variable(data=[1, 2, 3], dims="x")
-        >>> var.copy()
-        <xarray.Variable (x: 3)>
-        array([1, 2, 3])
-        >>> var_0 = var.copy(deep=False)
-        >>> var_0[0] = 7
-        >>> var_0
-        <xarray.Variable (x: 3)>
-        array([7, 2, 3])
-        >>> var
-        <xarray.Variable (x: 3)>
-        array([7, 2, 3])
-
-        Changing the data using the ``data`` argument maintains the
-        structure of the original object, but with the new data. Original
-        object is unaffected.
-
-        >>> var.copy(data=[0.1, 0.2, 0.3])
-        <xarray.Variable (x: 3)>
-        array([0.1, 0.2, 0.3])
-        >>> var
-        <xarray.Variable (x: 3)>
-        array([7, 2, 3])
-
-        See Also
-        --------
-        pandas.DataFrame.copy
-        """
-        return self._copy(deep=deep, data=data)
-
     def _copy(
         self,
         deep: bool = True,
@@ -1111,57 +932,11 @@ def _replace(
             data = copy.copy(self.data)
         if attrs is _default:
             attrs = copy.copy(self._attrs)
+
         if encoding is _default:
             encoding = copy.copy(self._encoding)
         return type(self)(dims, data, attrs, encoding, fastpath=True)
 
-    def __copy__(self) -> Self:
-        return self._copy(deep=False)
-
-    def __deepcopy__(self, memo: dict[int, Any] | None = None) -> Self:
-        return self._copy(deep=True, memo=memo)
-
-    # mutable objects should not be hashable
-    # https://github.com/python/mypy/issues/4266
-    __hash__ = None  # type: ignore[assignment]
-
-    @property
-    def chunks(self) -> tuple[tuple[int, ...], ...] | None:
-        """
-        Tuple of block lengths for this dataarray's data, in order of dimensions, or None if
-        the underlying data is not a dask array.
-
-        See Also
-        --------
-        Variable.chunk
-        Variable.chunksizes
-        xarray.unify_chunks
-        """
-        return getattr(self._data, "chunks", None)
-
-    @property
-    def chunksizes(self) -> Mapping[Any, tuple[int, ...]]:
-        """
-        Mapping from dimension names to block lengths for this variable's data, or None if
-        the underlying data is not a dask array.
-        Cannot be modified directly, but can be modified by calling .chunk().
-
-        Differs from variable.chunks because it returns a mapping of dimensions to chunk shapes
-        instead of a tuple of chunk shapes.
-
-        See Also
-        --------
-        Variable.chunk
-        Variable.chunks
-        xarray.unify_chunks
-        """
-        if hasattr(self._data, "chunks"):
-            return Frozen({dim: c for dim, c in zip(self.dims, self.data.chunks)})
-        else:
-            return {}
-
-    _array_counter = itertools.count()
-
     def chunk(
         self,
         chunks: (
@@ -1312,36 +1087,6 @@ def as_numpy(self) -> Self:
         """Coerces wrapped data into a numpy array, returning a Variable."""
         return self._replace(data=self.to_numpy())
 
-    def _as_sparse(self, sparse_format=_default, fill_value=dtypes.NA):
-        """
-        use sparse-array as backend.
-        """
-        import sparse
-
-        # TODO: what to do if dask-backended?
-        if fill_value is dtypes.NA:
-            dtype, fill_value = dtypes.maybe_promote(self.dtype)
-        else:
-            dtype = dtypes.result_type(self.dtype, fill_value)
-
-        if sparse_format is _default:
-            sparse_format = "coo"
-        try:
-            as_sparse = getattr(sparse, f"as_{sparse_format.lower()}")
-        except AttributeError:
-            raise ValueError(f"{sparse_format} is not a valid sparse format")
-
-        data = as_sparse(self.data.astype(dtype), fill_value=fill_value)
-        return self._replace(data=data)
-
-    def _to_dense(self):
-        """
-        Change backend from sparse to np.array
-        """
-        if hasattr(self._data, "todense"):
-            return self._replace(data=self._data.todense())
-        return self.copy(deep=False)
-
     def isel(
         self,
         indexers: Mapping[Any, Any] | None = None,
@@ -2649,28 +2394,6 @@ def notnull(self, keep_attrs: bool | None = None):
             keep_attrs=keep_attrs,
         )
 
-    @property
-    def real(self):
-        """
-        The real part of the variable.
-
-        See Also
-        --------
-        numpy.ndarray.real
-        """
-        return self._replace(data=self.data.real)
-
-    @property
-    def imag(self):
-        """
-        The imaginary part of the variable.
-
-        See Also
-        --------
-        numpy.ndarray.imag
-        """
-        return self._replace(data=self.data.imag)
-
     def __array_wrap__(self, obj, context=None):
         return Variable(self.dims, obj)
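With the methods above removed from Variable, the dims/data/attrs handling now lives on NamedArray (added below), and Variable layers indexing, arithmetic, and encoding on top of it. A small runnable sketch of the inherited behaviour on this branch; the example values are illustrative:

    import numpy as np

    from xarray.namedarray.core import NamedArray

    # Construction coerces array-like input via as_compatible_data and
    # validates that len(dims) matches data.ndim (ValueError otherwise).
    arr = NamedArray(("x", "y"), np.zeros((2, 3)), attrs={"units": "m"})
    assert arr.sizes == {"x": 2, "y": 3}
    assert arr.nbytes == arr.size * arr.dtype.itemsize  # 6 * 8 bytes here
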
diff --git a/xarray/namedarray/__init__.py b/xarray/namedarray/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/xarray/namedarray/core.py b/xarray/namedarray/core.py
new file mode 100644
index 00000000000..16a7b422f1b
--- /dev/null
+++ b/xarray/namedarray/core.py
@@ -0,0 +1,447 @@
+from __future__ import annotations
+
+import copy
+import math
+import sys
+import typing
+from collections.abc import Hashable, Iterable, Mapping
+
+import numpy as np
+
+# TODO: get rid of this after migrating this class to array API
+from xarray.core import dtypes
+from xarray.core.indexing import ExplicitlyIndexed
+from xarray.core.utils import Default, _default
+from xarray.namedarray.utils import (
+    T_DuckArray,
+    is_duck_array,
+    is_duck_dask_array,
+    to_0d_object_array,
+)
+
+if typing.TYPE_CHECKING:
+    T_NamedArray = typing.TypeVar("T_NamedArray", bound="NamedArray")
+    DimsInput = typing.Union[str, Iterable[Hashable]]
+    Dims = tuple[Hashable, ...]
+
+
+try:
+    if sys.version_info >= (3, 11):
+        from typing import Self
+    else:
+        from typing_extensions import Self
+except ImportError:
+    if typing.TYPE_CHECKING:
+        raise
+    else:
+        Self: typing.Any = None
+
+
+# TODO: Add tests!
+def as_compatible_data(
+    data: T_DuckArray | np.typing.ArrayLike, fastpath: bool = False
+) -> T_DuckArray:
+    if fastpath and getattr(data, "ndim", 0) > 0:
+        # can't use fastpath (yet) for scalars
+        return typing.cast(T_DuckArray, data)
+
+    if isinstance(data, np.ma.MaskedArray):
+        mask = np.ma.getmaskarray(data)
+        if mask.any():
+            # TODO: requires refactoring/vendoring xarray.core.dtypes and xarray.core.duck_array_ops
+            raise NotImplementedError("MaskedArray is not supported yet")
+        else:
+            return typing.cast(T_DuckArray, np.asarray(data))
+    if is_duck_array(data):
+        return data
+    if isinstance(data, NamedArray):
+        return typing.cast(T_DuckArray, data.data)
+
+    if isinstance(data, ExplicitlyIndexed):
+        # TODO: better that is_duck_array(ExplicitlyIndexed) -> True
+        return typing.cast(T_DuckArray, data)
+
+    if isinstance(data, tuple):
+        data = to_0d_object_array(data)
+
+    # validate whether the data is valid data types.
+    return typing.cast(T_DuckArray, np.asarray(data))
+
+
+class NamedArray:
+    """A lightweight wrapper around duck arrays with named dimensions and
+    attributes which describe a single Array. Numeric operations on this
+    object implement array broadcasting and dimension alignment based on
+    dimension names, rather than axis order."""
+
+    __slots__ = ("_dims", "_data", "_attrs")
+
+    def __init__(
+        self,
+        dims: DimsInput,
+        data: T_DuckArray | np.typing.ArrayLike,
+        attrs: dict | None = None,
+        fastpath: bool = False,
+    ):
+        """
+        Parameters
+        ----------
+        dims : str or iterable of str
+            Name(s) of the dimension(s).
+        data : T_DuckArray or np.typing.ArrayLike
+            The actual data that populates the array. Should match the shape specified by `dims`.
+        attrs : dict, optional
+            A dictionary containing any additional information or attributes you want to store with the array.
+            Default is None, meaning no attributes will be stored.
+        fastpath : bool, optional
+            A flag to indicate if certain validations should be skipped for performance reasons.
+            Should only be True if you are certain about the integrity of the input data.
+            Default is False.
+
+        Raises
+        ------
+        ValueError
+            If the `dims` length does not match the number of data dimensions (ndim).
+        """
+        self._data: T_DuckArray = as_compatible_data(data, fastpath=fastpath)
+        self._dims: Dims = self._parse_dimensions(dims)
+        self._attrs: dict | None = dict(attrs) if attrs else None
+
+    @property
+    def ndim(self) -> int:
+        """
+        Number of array dimensions.
+
+        See Also
+        --------
+        numpy.ndarray.ndim
+        """
+        return len(self.shape)
+
+    @property
+    def size(self) -> int:
+        """
+        Number of elements in the array.
+
+        Equal to ``np.prod(a.shape)``, i.e., the product of the array’s dimensions.
+
+        See Also
+        --------
+        numpy.ndarray.size
+        """
+        return math.prod(self.shape)
+
+    def __len__(self) -> int:
+        try:
+            return self.shape[0]
+        except Exception as exc:
+            raise TypeError("len() of unsized object") from exc
+
+    @property
+    def dtype(self) -> np.dtype:
+        """
+        Data-type of the array’s elements.
+
+        See Also
+        --------
+        ndarray.dtype
+        numpy.dtype
+        """
+        return self._data.dtype
+
+    @property
+    def shape(self) -> tuple[int, ...]:
+        """
+        Tuple of array dimensions.
+
+        Returns
+        -------
+        shape : tuple of ints
+            Tuple of array dimensions.
+
+        See Also
+        --------
+        numpy.ndarray.shape
+        """
+        return self._data.shape
+
+    @property
+    def nbytes(self) -> int:
+        """
+        Total bytes consumed by the elements of the data array.
+
+        If the underlying data array does not include ``nbytes``, estimates
+        the bytes consumed based on the ``size`` and ``dtype``.
+        """
+        if hasattr(self._data, "nbytes"):
+            return self._data.nbytes
+        else:
+            return self.size * self.dtype.itemsize
+
+    @property
+    def dims(self) -> Dims:
+        """Tuple of dimension names with which this NamedArray is associated."""
+        return self._dims
+
+    @dims.setter
+    def dims(self, value: DimsInput) -> None:
+        self._dims = self._parse_dimensions(value)
+
+    def _parse_dimensions(self, dims: DimsInput) -> Dims:
+        dims = (dims,) if isinstance(dims, str) else tuple(dims)
+        if len(dims) != self.ndim:
+            raise ValueError(
+                f"dimensions {dims} must have the same length as the "
+                f"number of data dimensions, ndim={self.ndim}"
+            )
+        return dims
+
+    @property
+    def attrs(self) -> dict[typing.Any, typing.Any]:
+        """Dictionary of local attributes on this NamedArray."""
+        if self._attrs is None:
+            self._attrs = {}
+        return self._attrs
+
+    @attrs.setter
+    def attrs(self, value: Mapping) -> None:
+        self._attrs = dict(value)
+
+    def _check_shape(self, new_data: T_DuckArray) -> None:
+        if new_data.shape != self.shape:
+            raise ValueError(
+                f"replacement data must match the {self.__class__.__name__}'s shape. "
+                f"replacement data has shape {new_data.shape}; {self.__class__.__name__} has shape {self.shape}"
+            )
+
+    @property
+    def data(self):
+        """
+        The NamedArray's data as an array. The underlying array type
+        (e.g. dask, sparse, pint) is preserved.
+        """
+        return self._data
+
+    @data.setter
+    def data(self, data: T_DuckArray | np.typing.ArrayLike) -> None:
+        data = as_compatible_data(data)
+        self._check_shape(data)
+        self._data = data
+
+    @property
+    def real(self) -> Self:
+        """
+        The real part of the NamedArray.
+
+        See Also
+        --------
+        numpy.ndarray.real
+        """
+        return self._replace(data=self.data.real)
+
+    @property
+    def imag(self) -> Self:
+        """
+        The imaginary part of the NamedArray.
+
+        See Also
+        --------
+        numpy.ndarray.imag
+        """
+        return self._replace(data=self.data.imag)
+
+    def __dask_tokenize__(self):
+        # Use v.data, instead of v._data, in order to cope with the wrappers
+        # around NetCDF and the like
+        from dask.base import normalize_token
+
+        return normalize_token((type(self), self._dims, self.data, self.attrs))
+
+    def __dask_graph__(self):
+        return self._data.__dask_graph__() if is_duck_dask_array(self._data) else None
+
+    def __dask_keys__(self):
+        return self._data.__dask_keys__()
+
+    def __dask_layers__(self):
+        return self._data.__dask_layers__()
+
+    @property
+    def __dask_optimize__(self) -> typing.Callable:
+        return self._data.__dask_optimize__
+
+    @property
+    def __dask_scheduler__(self) -> typing.Callable:
+        return self._data.__dask_scheduler__
+
+    def __dask_postcompute__(
+        self,
+    ) -> tuple[typing.Callable, tuple[typing.Any, ...]]:
+        array_func, array_args = self._data.__dask_postcompute__()
+        return self._dask_finalize, (array_func,) + array_args
+
+    def __dask_postpersist__(
+        self,
+    ) -> tuple[typing.Callable, tuple[typing.Any, ...]]:
+        array_func, array_args = self._data.__dask_postpersist__()
+        return self._dask_finalize, (array_func,) + array_args
+
+    def _dask_finalize(self, results, array_func, *args, **kwargs) -> Self:
+        data = array_func(results, *args, **kwargs)
+        return type(self)(self._dims, data, attrs=self._attrs)
+
+    @property
+    def chunks(self) -> tuple[tuple[int, ...], ...] | None:
+        """
+        Tuple of block lengths for this NamedArray's data, in order of dimensions, or None if
+        the underlying data is not a dask array.
+
+        See Also
+        --------
+        NamedArray.chunk
+        NamedArray.chunksizes
+        xarray.unify_chunks
+        """
+        return getattr(self._data, "chunks", None)
+
+    @property
+    def chunksizes(
+        self,
+    ) -> typing.Mapping[typing.Any, tuple[int, ...]]:
+        """
+        Mapping from dimension names to block lengths for this NamedArray's data, or None if
+        the underlying data is not a dask array.
+        Cannot be modified directly, but can be modified by calling .chunk().
+
+        Differs from NamedArray.chunks because it returns a mapping of dimensions to chunk shapes
+        instead of a tuple of chunk shapes.
+
+        See Also
+        --------
+        NamedArray.chunk
+        NamedArray.chunks
+        xarray.unify_chunks
+        """
+        if hasattr(self._data, "chunks"):
+            return dict(zip(self.dims, self.data.chunks))
+        else:
+            return {}
+
+    @property
+    def sizes(self) -> dict[Hashable, int]:
+        """Ordered mapping from dimension names to lengths."""
+        return dict(zip(self.dims, self.shape))
+
+    def _replace(self, dims=_default, data=_default, attrs=_default) -> Self:
+        if dims is _default:
+            dims = copy.copy(self._dims)
+        if data is _default:
+            data = copy.copy(self._data)
+        if attrs is _default:
+            attrs = copy.copy(self._attrs)
+        return type(self)(dims, data, attrs)
+
+    def _copy(
+        self,
+        deep: bool = True,
+        data: T_DuckArray | np.typing.ArrayLike | None = None,
+        memo: dict[int, typing.Any] | None = None,
+    ) -> Self:
+        if data is None:
+            ndata = self._data
+            if deep:
+                ndata = copy.deepcopy(ndata, memo=memo)
+        else:
+            ndata = as_compatible_data(data)
+            self._check_shape(ndata)
+
+        attrs = (
+            copy.deepcopy(self._attrs, memo=memo) if deep else copy.copy(self._attrs)
+        )
+
+        return self._replace(data=ndata, attrs=attrs)
+
+    def __copy__(self) -> Self:
+        return self._copy(deep=False)
+
+    def __deepcopy__(self, memo: dict[int, typing.Any] | None = None) -> Self:
+        return self._copy(deep=True, memo=memo)
+
+    def copy(
+        self,
+        deep: bool = True,
+        data: T_DuckArray | np.typing.ArrayLike | None = None,
+    ) -> Self:
+        """Returns a copy of this object.
+
+        If `deep=True`, the data array is loaded into memory and copied onto
+        the new object. Dimensions, attributes and encodings are always copied.
+
+        Use `data` to create a new object with the same structure as
+        original but entirely new data.
+
+        Parameters
+        ----------
+        deep : bool, default: True
+            Whether the data array is loaded into memory and copied onto
+            the new object. Default is True.
+        data : array_like, optional
+            Data to use in the new object. Must have same shape as original.
+            When `data` is used, `deep` is ignored.
+
+        Returns
+        -------
+        object : NamedArray
+            New object with dimensions, attributes, and optionally
+            data copied from original.
+        """
+        return self._copy(deep=deep, data=data)
+
+    def _nonzero(self) -> tuple[Self, ...]:
+        """Equivalent to numpy's nonzero but returns a tuple of NamedArrays."""
+        # TODO we should replace dask's native nonzero
+        # after https://github.com/dask/dask/issues/1076 is implemented.
+        nonzeros = np.nonzero(self.data)
+        return tuple(type(self)((dim,), nz) for nz, dim in zip(nonzeros, self.dims))
+
+    def _as_sparse(
+        self,
+        sparse_format: str | Default = _default,
+        fill_value=dtypes.NA,
+    ) -> Self:
+        """
+        Use sparse-array as backend.
+        """
+        import sparse
+
+        # TODO: what to do if dask-backed?
+        if fill_value is dtypes.NA:
+            dtype, fill_value = dtypes.maybe_promote(self.dtype)
+        else:
+            dtype = dtypes.result_type(self.dtype, fill_value)
+
+        if sparse_format is _default:
+            sparse_format = "coo"
+        try:
+            as_sparse = getattr(sparse, f"as_{sparse_format.lower()}")
+        except AttributeError as exc:
+            raise ValueError(f"{sparse_format} is not a valid sparse format") from exc
+
+        data = as_sparse(self.data.astype(dtype), fill_value=fill_value)
+        return self._replace(data=data)
+
+    def _to_dense(self) -> Self:
+        """
+        Change backend from sparse to np.array.
+        """
+        if hasattr(self._data, "todense"):
+            return self._replace(data=self._data.todense())
+        return self.copy(deep=False)
diff --git a/xarray/namedarray/utils.py b/xarray/namedarray/utils.py
new file mode 100644
index 00000000000..1495e111d85
--- /dev/null
+++ b/xarray/namedarray/utils.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+import importlib
+import sys
+import typing
+
+import numpy as np
+
+if typing.TYPE_CHECKING:
+    if sys.version_info >= (3, 10):
+        from typing import TypeGuard
+    else:
+        from typing_extensions import TypeGuard
+
+# temporary placeholder for indicating an array api compliant type.
+# hopefully in the future we can narrow this down more
+T_DuckArray = typing.TypeVar("T_DuckArray", bound=typing.Any)
+
+
+def module_available(module: str) -> bool:
+    """Checks whether a module is installed without importing it.
+
+    Use this for a lightweight check and lazy imports.
+
+    Parameters
+    ----------
+    module : str
+        Name of the module.
+
+    Returns
+    -------
+    available : bool
+        Whether the module is installed.
+    """
+    return importlib.util.find_spec(module) is not None
+
+
+def is_dask_collection(x: typing.Any) -> bool:
+    if module_available("dask"):
+        from dask.base import is_dask_collection
+
+        return is_dask_collection(x)
+    return False
+
+
+def is_duck_array(value: typing.Any) -> TypeGuard[T_DuckArray]:
+    if isinstance(value, np.ndarray):
+        return True
+    return (
+        hasattr(value, "ndim")
+        and hasattr(value, "shape")
+        and hasattr(value, "dtype")
+        and (
+            (hasattr(value, "__array_function__") and hasattr(value, "__array_ufunc__"))
+            or hasattr(value, "__array_namespace__")
+        )
+    )
+
+
+def is_duck_dask_array(x: typing.Any) -> bool:
+    return is_duck_array(x) and is_dask_collection(x)
+
+
+def to_0d_object_array(value: typing.Any) -> np.ndarray:
+    """Given a value, wrap it in a 0-D numpy.ndarray with dtype=object."""
+    result = np.empty((), dtype=object)
+    result[()] = value
+    return result
diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py
index 3fb29e01ebb..ac641c4abc3 100644
--- a/xarray/tests/test_dataset.py
+++ b/xarray/tests/test_dataset.py
@@ -411,10 +411,14 @@ def test_repr_nep18(self) -> None:
         class Array:
             def __init__(self):
                 self.shape = (2,)
+                self.ndim = 1
                 self.dtype = np.dtype(np.float64)
 
             def __array_function__(self, *args, **kwargs):
-                pass
+                return NotImplemented
+
+            def __array_ufunc__(self, *args, **kwargs):
+                return NotImplemented
 
             def __repr__(self):
                 return "Custom\nArray"
diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py
index 7670b77322c..5ca134503e8 100644
--- a/xarray/tests/test_formatting.py
+++ b/xarray/tests/test_formatting.py
@@ -549,7 +549,7 @@ def _repr_inline_(self, width):
 
         return formatted
 
-    def __array_function__(self, *args, **kwargs):
+    def __array_namespace__(self, *args, **kwargs):
         return NotImplemented
 
     @property
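The two test changes above track the stricter duck-array check: is_duck_array now requires ndim/shape/dtype plus either the NEP-18 pair (__array_function__ and __array_ufunc__) or __array_namespace__. A minimal class that passes the new check; the class name MinimalDuck is made up for illustration:

    import numpy as np

    from xarray.namedarray.utils import is_duck_array

    class MinimalDuck:
        ndim = 1
        shape = (2,)
        dtype = np.dtype(np.float64)

        # NEP-18 hooks; returning NotImplemented keeps the example inert
        def __array_function__(self, *args, **kwargs):
            return NotImplemented

        def __array_ufunc__(self, *args, **kwargs):
            return NotImplemented

    assert is_duck_array(MinimalDuck())
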
diff --git a/xarray/tests/test_namedarray.py b/xarray/tests/test_namedarray.py
new file mode 100644
index 00000000000..0871a0c6fb9
--- /dev/null
+++ b/xarray/tests/test_namedarray.py
@@ -0,0 +1,165 @@
+import numpy as np
+import pytest
+
+import xarray as xr
+from xarray.namedarray.core import NamedArray, as_compatible_data
+from xarray.namedarray.utils import T_DuckArray
+
+
+@pytest.fixture
+def random_inputs() -> np.ndarray:
+    return np.arange(3 * 4 * 5, dtype=np.float32).reshape((3, 4, 5))
+
+
+@pytest.mark.parametrize(
+    "input_data, expected_output",
+    [
+        ([1, 2, 3], np.array([1, 2, 3])),
+        (np.array([4, 5, 6]), np.array([4, 5, 6])),
+        (NamedArray("time", np.array([1, 2, 3])), np.array([1, 2, 3])),
+        (2, np.array(2)),
+    ],
+)
+def test_as_compatible_data(
+    input_data: T_DuckArray, expected_output: T_DuckArray
+) -> None:
+    output: T_DuckArray = as_compatible_data(input_data)
+    assert np.array_equal(output, expected_output)
+
+
+def test_as_compatible_data_with_masked_array() -> None:
+    masked_array = np.ma.array([1, 2, 3], mask=[False, True, False])
+    with pytest.raises(NotImplementedError):
+        as_compatible_data(masked_array)
+
+
+def test_as_compatible_data_with_0d_object() -> None:
+    data = np.empty((), dtype=object)
+    data[()] = (10, 12, 12)
+    assert np.array_equal(as_compatible_data(data), data)
+
+
+def test_as_compatible_data_with_explicitly_indexed(random_inputs) -> None:
+    # TODO: Make xr.core.indexing.ExplicitlyIndexed pass is_duck_array and remove this test.
+    class CustomArray(xr.core.indexing.NDArrayMixin):
+        def __init__(self, array):
+            self.array = array
+
+    class CustomArrayIndexable(CustomArray, xr.core.indexing.ExplicitlyIndexed):
+        pass
+
+    array = CustomArray(random_inputs)
+    output = as_compatible_data(array)
+    assert isinstance(output, np.ndarray)
+
+    array = CustomArrayIndexable(random_inputs)
+    output = as_compatible_data(array)
+    assert isinstance(output, CustomArrayIndexable)
+
+
+def test_properties() -> None:
+    data = 0.5 * np.arange(10).reshape(2, 5)
+    named_array = NamedArray(["x", "y"], data, {"key": "value"})
+    assert named_array.dims == ("x", "y")
+    assert np.array_equal(named_array.data, data)
+    assert named_array.attrs == {"key": "value"}
+    assert named_array.ndim == 2
+    assert named_array.sizes == {"x": 2, "y": 5}
+    assert named_array.size == 10
+    assert named_array.nbytes == 80
+    assert len(named_array) == 2
+
+
+def test_attrs() -> None:
+    named_array = NamedArray(["x", "y"], np.arange(10).reshape(2, 5))
+    assert named_array.attrs == {}
+    named_array.attrs["key"] = "value"
+    assert named_array.attrs == {"key": "value"}
+    named_array.attrs = {"key": "value2"}
+    assert named_array.attrs == {"key": "value2"}
+
+
+def test_data(random_inputs) -> None:
+    named_array = NamedArray(["x", "y", "z"], random_inputs)
+    assert np.array_equal(named_array.data, random_inputs)
+    with pytest.raises(ValueError):
+        named_array.data = np.random.random((3, 4)).astype(np.float64)
+
+
+@pytest.mark.parametrize(
+    "data, dtype",
+    [
+        ("foo", np.dtype("U3")),
+        (np.bytes_("foo"), np.dtype("S3")),
+    ],
+)
+def test_0d_string(data, dtype: np.typing.DTypeLike) -> None:
+    named_array = NamedArray([], data)
+    assert named_array.data == data
+    assert named_array.dims == ()
+    assert named_array.sizes == {}
+    assert named_array.attrs == {}
+    assert named_array.ndim == 0
+    assert named_array.size == 1
+    assert named_array.dtype == dtype
+
+
+def test_0d_object() -> None:
+    named_array = NamedArray([], (10, 12, 12))
+    expected_data = np.empty((), dtype=object)
+    expected_data[()] = (10, 12, 12)
+    assert np.array_equal(named_array.data, expected_data)
+
+    assert named_array.dims == ()
+    assert named_array.sizes == {}
+    assert named_array.attrs == {}
+    assert named_array.ndim == 0
+    assert named_array.size == 1
+    assert named_array.dtype == np.dtype("O")
+
+
+def test_0d_datetime() -> None:
+    named_array = NamedArray([], np.datetime64("2000-01-01"))
+    assert named_array.dtype == np.dtype("datetime64[D]")
+
+
+@pytest.mark.parametrize(
+    "timedelta, expected_dtype",
+    [
+        (np.timedelta64(1, "D"), np.dtype("timedelta64[D]")),
+        (np.timedelta64(1, "s"), np.dtype("timedelta64[s]")),
+        (np.timedelta64(1, "m"), np.dtype("timedelta64[m]")),
+        (np.timedelta64(1, "h"), np.dtype("timedelta64[h]")),
+        (np.timedelta64(1, "us"), np.dtype("timedelta64[us]")),
+        (np.timedelta64(1, "ns"), np.dtype("timedelta64[ns]")),
+        (np.timedelta64(1, "ps"), np.dtype("timedelta64[ps]")),
+        (np.timedelta64(1, "fs"), np.dtype("timedelta64[fs]")),
+        (np.timedelta64(1, "as"), np.dtype("timedelta64[as]")),
+    ],
+)
+def test_0d_timedelta(timedelta, expected_dtype: np.dtype) -> None:
+    named_array = NamedArray([], timedelta)
+    assert named_array.dtype == expected_dtype
+    assert named_array.data == timedelta
+
+
+@pytest.mark.parametrize(
+    "dims, data_shape, new_dims, raises",
+    [
+        (["x", "y", "z"], (2, 3, 4), ["a", "b", "c"], False),
+        (["x", "y", "z"], (2, 3, 4), ["a", "b"], True),
+        (["x", "y", "z"], (2, 4, 5), ["a", "b", "c", "d"], True),
+        ([], [], (), False),
+        ([], [], ("x",), True),
+    ],
+)
+def test_dims_setter(dims, data_shape, new_dims, raises: bool) -> None:
+    named_array = NamedArray(dims, np.random.random(data_shape))
+    assert named_array.dims == tuple(dims)
+    if raises:
+        with pytest.raises(ValueError):
+            named_array.dims = new_dims
+    else:
+        named_array.dims = new_dims
+        assert named_array.dims == tuple(new_dims)
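Taken together, the tests above pin down the constructor contract: dims may be a string or any iterable of hashables, and reassigning dims re-validates the length against ndim. A closing usage sketch mirroring test_dims_setter; the example values are illustrative:

    import numpy as np
    import pytest

    from xarray.namedarray.core import NamedArray

    arr = NamedArray(("x", "y", "z"), np.zeros((2, 3, 4)))
    arr.dims = ("a", "b", "c")  # same length as data.ndim: accepted
    with pytest.raises(ValueError):
        arr.dims = ("a", "b")  # wrong length: rejected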