Commit

Merge branch 'main' into namedarray-ops
andersy005 authored Oct 26, 2023
2 parents 657e7af + bb489fa commit 7ac44e0
Showing 53 changed files with 3,030 additions and 679 deletions.
32 changes: 32 additions & 0 deletions asv_bench/benchmarks/dataset.py
@@ -0,0 +1,32 @@
import numpy as np

from xarray import Dataset

from . import requires_dask


class DatasetBinaryOp:
    def setup(self):
        self.ds = Dataset(
            {
                "a": (("x", "y"), np.ones((300, 400))),
                "b": (("x", "y"), np.ones((300, 400))),
            }
        )
        self.mean = self.ds.mean()
        self.std = self.ds.std()

    def time_normalize(self):
        (self.ds - self.mean) / self.std


class DatasetChunk:
    def setup(self):
        requires_dask()
        self.ds = Dataset()
        array = np.ones(1000)
        for i in range(250):
            self.ds[f"var{i}"] = ("x", array)

    def time_chunk(self):
        self.ds.chunk(x=(1,) * 1000)
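For readers unfamiliar with airspeed velocity (asv) conventions: ``setup`` runs before timing, and methods named ``time_*`` are what get timed. A standalone sketch of roughly what ``time_normalize`` exercises (shapes taken from the benchmark above):

import numpy as np
from xarray import Dataset

ds = Dataset(
    {
        "a": (("x", "y"), np.ones((300, 400))),
        "b": (("x", "y"), np.ones((300, 400))),
    }
)
mean, std = ds.mean(), ds.std()
normalized = (ds - mean) / std  # the dataset binary ops being timed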
4 changes: 2 additions & 2 deletions asv_bench/benchmarks/dataset_io.py
@@ -593,7 +593,7 @@ def load(self) -> tuple:
        n_variables = 2000

        # Important to have a shape and dtype for lazy loading.
        shape = (1,)
        shape = (1000,)
        dtype = np.dtype(int)
        variables = {
            f"long_variable_name_{v}": xr.Variable(
@@ -643,7 +643,7 @@ def open_dataset(

        self.engine = PerformanceBackend

    @parameterized(["chunks"], ([None, {}]))
    @parameterized(["chunks"], ([None, {}, {"time": 10}]))
    def time_open_dataset(self, chunks):
        """
        Time how fast xr.open_dataset is without the slow data reading part.
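For orientation, the three parameterized ``chunks`` cases correspond roughly to the following calls (a sketch; the file path is illustrative):

import xarray as xr

xr.open_dataset("data.nc", chunks=None)          # default: no dask, lazy backend arrays
xr.open_dataset("data.nc", chunks={})            # dask, one chunk per variable
xr.open_dataset("data.nc", chunks={"time": 10})  # dask, 10-element chunks along "time"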
1 change: 1 addition & 0 deletions ci/requirements/doc.yml
@@ -34,6 +34,7 @@ dependencies:
- sphinx-book-theme >= 0.3.0
- sphinx-copybutton
- sphinx-design
- sphinx-inline-tabs
- sphinx>=5.0
- zarr>=2.10
- pip:
30 changes: 30 additions & 0 deletions doc/api-hidden.rst
@@ -351,6 +351,36 @@
IndexVariable.sizes
IndexVariable.values


namedarray.core.NamedArray.all
namedarray.core.NamedArray.any
namedarray.core.NamedArray.attrs
namedarray.core.NamedArray.chunks
namedarray.core.NamedArray.chunksizes
namedarray.core.NamedArray.copy
namedarray.core.NamedArray.count
namedarray.core.NamedArray.cumprod
namedarray.core.NamedArray.cumsum
namedarray.core.NamedArray.data
namedarray.core.NamedArray.dims
namedarray.core.NamedArray.dtype
namedarray.core.NamedArray.get_axis_num
namedarray.core.NamedArray.max
namedarray.core.NamedArray.mean
namedarray.core.NamedArray.median
namedarray.core.NamedArray.min
namedarray.core.NamedArray.nbytes
namedarray.core.NamedArray.ndim
namedarray.core.NamedArray.prod
namedarray.core.NamedArray.reduce
namedarray.core.NamedArray.shape
namedarray.core.NamedArray.size
namedarray.core.NamedArray.sizes
namedarray.core.NamedArray.std
namedarray.core.NamedArray.sum
namedarray.core.NamedArray.var
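For orientation, a hedged sketch of the (experimental, at the time of this commit) ``NamedArray`` API that these entries document; exact signatures may differ:

.. code-block:: python

    import numpy as np
    from xarray.namedarray.core import NamedArray

    arr = NamedArray(dims=("x",), data=np.arange(4))
    print(arr.sizes)       # {'x': 4}
    print(arr.sum().data)  # 6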


plot.plot
plot.line
plot.step
2 changes: 2 additions & 0 deletions doc/conf.py
@@ -84,6 +84,7 @@
"sphinx_copybutton",
"sphinxext.rediraffe",
"sphinx_design",
"sphinx_inline_tabs",
]


@@ -230,6 +231,7 @@
# canonical_url="",
repository_url="https://github.com/pydata/xarray",
repository_branch="main",
navigation_with_keys=False, # pydata/pydata-sphinx-theme#1492
path_to_docs="doc",
use_edit_page_button=True,
use_repository_button=True,
2 changes: 1 addition & 1 deletion doc/examples/apply_ufunc_vectorize_1d.ipynb
@@ -11,7 +11,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This example will illustrate how to conveniently apply an unvectorized function `func` to xarray objects using `apply_ufunc`. `func` expects 1D numpy arrays and returns a 1D numpy array. Our goal is to coveniently apply this function along a dimension of xarray objects that may or may not wrap dask arrays with a signature.\n",
"This example will illustrate how to conveniently apply an unvectorized function `func` to xarray objects using `apply_ufunc`. `func` expects 1D numpy arrays and returns a 1D numpy array. Our goal is to conveniently apply this function along a dimension of xarray objects that may or may not wrap dask arrays with a signature.\n",
"\n",
"We will illustrate this using `np.interp`: \n",
"\n",
58 changes: 37 additions & 21 deletions doc/internals/how-to-add-new-backend.rst
@@ -9,7 +9,8 @@ to integrate any code in Xarray; all you need to do is:
- Create a class that inherits from Xarray :py:class:`~xarray.backends.BackendEntrypoint`
  and implements the method ``open_dataset``, see :ref:`RST backend_entrypoint` (a minimal
  sketch follows below)

- Declare this class as an external plugin in your ``setup.py``, see :ref:`RST backend_registration`
- Declare this class as an external plugin in your project configuration, see :ref:`RST
backend_registration`

If you also want to support lazy loading and dask, see :ref:`RST lazy_loading`.
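As a concrete starting point, a minimal sketch of such a class (the class name and file extension are illustrative; the exact requirements are spelled out in the sections below):

.. code-block:: python

    from xarray.backends import BackendEntrypoint

    class MyBackendEntrypoint(BackendEntrypoint):
        open_dataset_parameters = ["filename_or_obj", "drop_variables"]

        def open_dataset(self, filename_or_obj, *, drop_variables=None):
            # parse filename_or_obj and return an xarray.Dataset here
            raise NotImplementedError

        def guess_can_open(self, filename_or_obj):
            return str(filename_or_obj).endswith(".my_format")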

@@ -267,42 +268,57 @@ interface only the boolean keywords related to the supported decoders.
How to register a backend
+++++++++++++++++++++++++

Define a new entrypoint in your ``setup.py`` (or ``setup.cfg``) with:
Define a new entrypoint in your ``pyproject.toml`` (or ``setup.cfg/setup.py`` for older
configurations), with:

- group: ``xarray.backends``
- name: the name to be passed to :py:meth:`~xarray.open_dataset` as ``engine``
- object reference: the reference of the class that you have implemented.

You can declare the entrypoint in ``setup.py`` using the following syntax:

.. code-block::

    setuptools.setup(
        entry_points={
            "xarray.backends": ["my_engine=my_package.my_module:MyBackendEntryClass"],
        },
    )

in ``setup.cfg``:

.. code-block:: cfg

    [options.entry_points]
    xarray.backends =
        my_engine = my_package.my_module:MyBackendEntryClass

See https://packaging.python.org/specifications/entry-points/#data-model
for more information

If you are using `Poetry <https://python-poetry.org/>`_ for your build system, you can accomplish the same thing using "plugins". In this case you would need to add the following to your ``pyproject.toml`` file:

.. code-block:: toml

    [tool.poetry.plugins."xarray.backends"]
    "my_engine" = "my_package.my_module:MyBackendEntryClass"

See https://python-poetry.org/docs/pyproject/#plugins for more information on Poetry plugins.

You can declare the entrypoint in your project configuration like so:

.. tab:: pyproject.toml

    .. code:: toml

        [project.entry-points."xarray.backends"]
        my_engine = "my_package.my_module:MyBackendEntrypoint"

.. tab:: pyproject.toml [Poetry]

    .. code-block:: toml

        [tool.poetry.plugins."xarray.backends"]
        my_engine = "my_package.my_module:MyBackendEntrypoint"

.. tab:: setup.cfg

    .. code-block:: cfg

        [options.entry_points]
        xarray.backends =
            my_engine = my_package.my_module:MyBackendEntrypoint

.. tab:: setup.py

    .. code-block::

        setuptools.setup(
            entry_points={
                "xarray.backends": [
                    "my_engine=my_package.my_module:MyBackendEntrypoint"
                ],
            },
        )

See the `Python Packaging User Guide
<https://packaging.python.org/specifications/entry-points/#data-model>`_ for more
information on entrypoints and details of the syntax.

If you're using Poetry, note that the table name in ``pyproject.toml`` is slightly different.
See `the Poetry docs <https://python-poetry.org/docs/pyproject/#plugins>`_ for more
information on plugins.
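Once your package is installed, one way to check that the plugin was picked up is to list the engines xarray can see (``list_engines`` is available in recent xarray versions):

.. code-block:: python

    import xarray as xr

    print(xr.backends.list_engines())  # should include "my_engine"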

.. _RST lazy_loading:

2 changes: 2 additions & 0 deletions doc/internals/how-to-create-custom-index.rst
@@ -1,5 +1,7 @@
.. currentmodule:: xarray

.. _internals.custom indexes:

How to create a custom index
============================

Expand Down
3 changes: 2 additions & 1 deletion doc/internals/index.rst
@@ -19,9 +19,10 @@ The pages in this section are intended for:
:hidden:

internal-design
interoperability
duck-arrays-integration
chunked-arrays
extending-xarray
zarr-encoding-spec
how-to-add-new-backend
how-to-create-custom-index
zarr-encoding-spec
8 changes: 4 additions & 4 deletions doc/internals/internal-design.rst
@@ -59,9 +59,9 @@ which is used as the basic building block behind xarray's
- ``data``: The N-dimensional array (typically a NumPy or Dask array) storing
the Variable's data. It must have the same number of dimensions as the length
of ``dims``.
- ``attrs``: An ordered dictionary of metadata associated with this array. By
- ``attrs``: A dictionary of metadata associated with this array. By
convention, xarray's built-in operations never use this metadata.
- ``encoding``: Another ordered dictionary used to store information about how
- ``encoding``: Another dictionary used to store information about how
this variable's data is represented on disk. See :ref:`io.encoding` for more
details.

@@ -95,7 +95,7 @@ all of which are implemented by forwarding on to the underlying ``Variable`` obj

In addition, a :py:class:`~xarray.DataArray` stores additional ``Variable`` objects in a dict under the private ``_coords`` attribute,
each of which is referred to as a "Coordinate Variable". These coordinate variable objects are only allowed to have ``dims`` that are a subset of the data variable's ``dims``,
and each dim has a specific length. This means that the full :py:attr:`~xarray.DataArray.size` of the dataarray can be represented by a dictionary mapping dimension names to integer sizes.
and each dim has a specific length. This means that the full :py:attr:`~xarray.DataArray.sizes` of the dataarray can be represented by a dictionary mapping dimension names to integer sizes.
The underlying data variable has this exact same size, and the attached coordinate variables have sizes which are some subset of the size of the data variable.
Another way of saying this is that all coordinate variables must be "alignable" with the data variable.
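A small sketch of the relationship between a dataarray's ``sizes`` and those of its coordinate variables:

.. code-block:: python

    import numpy as np
    import xarray as xr

    da = xr.DataArray(np.zeros((2, 3)), dims=("x", "y"), coords={"x": [10, 20]})
    print(dict(da.sizes))       # {'x': 2, 'y': 3}
    print(dict(da["x"].sizes))  # {'x': 2}, a subset of the data variable's sizes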

Expand Down Expand Up @@ -124,7 +124,7 @@ The :py:class:`~xarray.Dataset` class is a generalization of the :py:class:`~xar
Internally all data variables and coordinate variables are stored under a single ``variables`` dict, and coordinates are
specified by storing their names in a private ``_coord_names`` dict.

The dataset's dimensions are the set of all dims present across any variable, but (similar to in dataarrays) coordinate
The dataset's ``dims`` are the set of all dims present across any variable, but (similar to in dataarrays) coordinate
variables cannot have a dimension that is not present on any data variable.
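For example, a dataset's sizes are the union of the sizes of all its variables (a minimal sketch):

.. code-block:: python

    import numpy as np
    import xarray as xr

    ds = xr.Dataset(
        {"a": ("x", np.zeros(2)), "b": ("y", np.zeros(3))},
        coords={"x": [10, 20]},
    )
    print(dict(ds.sizes))  # {'x': 2, 'y': 3}, the union across all variables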

When a data variable or coordinate variable is accessed, a new ``DataArray`` is again constructed from all compatible
45 changes: 45 additions & 0 deletions doc/internals/interoperability.rst
@@ -0,0 +1,45 @@
.. _interoperability:

Interoperability of Xarray
==========================

Xarray is designed to be extremely interoperable, in many orthogonal ways.
Making xarray as flexible as possible is the common theme of most of the goals on our :ref:`roadmap`.

This interoperability comes via a set of flexible abstractions that users can plug in to. The current full list is:

- :ref:`Custom file backends <add_a_backend>` via the :py:class:`~xarray.backends.BackendEntrypoint` system,
- Numpy-like :ref:`"duck" array wrapping <internals.duckarrays>`, which supports the `Python Array API Standard <https://data-apis.org/array-api/latest/>`_,
- :ref:`Chunked distributed array computation <internals.chunkedarrays>` via the :py:class:`~xarray.core.parallelcompat.ChunkManagerEntrypoint` system,
- Custom :py:class:`~xarray.Index` objects for :ref:`flexible label-based lookups <internals.custom indexes>`,
- Extending xarray objects with domain-specific methods via :ref:`custom accessors <internals.accessors>` (see the sketch after this list).
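As one concrete example of the last point, a hedged sketch of a custom accessor (the ``geo`` name and the ``lon``/``lat`` variables are illustrative):

.. code-block:: python

    import xarray as xr

    @xr.register_dataset_accessor("geo")
    class GeoAccessor:
        def __init__(self, ds):
            self._ds = ds

        @property
        def center(self):
            # an illustrative domain-specific method
            return float(self._ds.lon.mean()), float(self._ds.lat.mean())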

.. warning::

One obvious way in which xarray could be more flexible is that whilst subclassing xarray objects is possible, we
currently don't support it in most transformations, instead recommending composition over inheritance. See the
:ref:`internal design page <internal design.subclassing>` for the rationale and look at the corresponding `GH issue <https://github.com/pydata/xarray/issues/3980>`_
if you're interested in improving support for subclassing!

.. note::

If you think there is another way in which xarray could become more generically flexible then please
tell us your ideas by `raising an issue to request the feature <https://github.com/pydata/xarray/issues/new/choose>`_!


Whilst xarray was originally designed specifically to open ``netCDF4`` files as :py:class:`numpy.ndarray` objects labelled by :py:class:`pandas.Index` objects,
it is entirely possible today to:

- lazily open an xarray object directly from a custom binary file format (e.g. using ``xarray.open_dataset(path, engine='my_custom_format')``),
- handle the data as any API-compliant numpy-like array type (e.g. sparse or GPU-backed; see the sketch after this list),
- distribute out-of-core computation across that array type in parallel (e.g. via :ref:`dask`),
- track the physical units of the data through computations (e.g via `pint-xarray <https://pint-xarray.readthedocs.io/en/stable/>`_),
- query the data via custom index logic optimized for specific applications (e.g. an :py:class:`~xarray.Index` object backed by a KDTree structure),
- attach domain-specific logic via accessor methods (e.g. to understand geographic Coordinate Reference System metadata),
- organize hierarchical groups of xarray data in a :py:class:`~datatree.DataTree` (e.g. to treat heterogeneous simulation and observational data together during analysis).
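As a sketch of the duck-array point above, wrapping a ``sparse`` array (this assumes the ``sparse`` package is installed):

.. code-block:: python

    import numpy as np
    import sparse
    import xarray as xr

    data = sparse.COO.from_numpy(np.eye(1000))
    da = xr.DataArray(data, dims=("x", "y"))
    print(type(da.data))  # sparse COO array, not a dense numpy.ndarray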

All of these features can be provided simultaneously, using libraries compatible with the rest of the scientific python ecosystem.
In this situation xarray would essentially be a thin wrapper acting as a pure-python framework, providing a common interface and
separation of concerns via various domain-agnostic abstractions.

Most of the remaining pages in the documentation of xarray's internals describe these various types of interoperability in more detail.
2 changes: 1 addition & 1 deletion doc/user-guide/io.rst
@@ -819,7 +819,7 @@ with ``mode='a'`` on a Dataset containing the new variables, passing in an
existing Zarr store or path to a Zarr store.

To resize and then append values along an existing dimension in a store, set
``append_dim``. This is a good option if data always arives in a particular
``append_dim``. This is a good option if data always arrives in a particular
order, e.g., for time-stepping a simulation:

.. ipython:: python
2 changes: 1 addition & 1 deletion doc/user-guide/terminology.rst
@@ -90,7 +90,7 @@ complete examples, please consult the relevant documentation.*
dimensions although in most cases it is also a :term:`Dimension
coordinate`. It may or may not be grouped with other indexed coordinates
depending on whether they share the same index. Indexed coordinates are
marked by ``*`` when printing a ``DataArray`` or ``Dataset``.
marked by an asterisk ``*`` when printing a ``DataArray`` or ``Dataset``.

Non-indexed coordinate
A coordinate which has no associated :term:`Index`. It may still
