Merge remote-tracking branch 'upstream/master' into dataset/quiver
* upstream/master:
  speed up the repr for big MultiIndex objects (pydata#4846)
  dim -> coord in DataArray.integrate (pydata#3993)
  WIP: backend interface, now it uses subclassing  (pydata#4836)
  weighted: small improvements (pydata#4818)
  Update related-projects.rst (pydata#4844)
  iris update doc url (pydata#4845)
  Faster unstacking (pydata#4746)
  Allow swap_dims to take kwargs (pydata#4841)
  Move skip ci instructions to contributing guide (pydata#4829)
  fix issues in drop_sel and drop_isel (pydata#4828)
  Bugfix in list_engine (pydata#4811)
  Add drop_isel (pydata#4819)
  Fix RST.
  Remove the references to `_file_obj` outside low level code paths, change to `_close` (pydata#4809)
dcherian committed Jan 29, 2021
2 parents e795672 + 39048f9 commit e0f227f
Showing 34 changed files with 1,065 additions and 601 deletions.
8 changes: 0 additions & 8 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -5,11 +5,3 @@
- [ ] Passes `pre-commit run --all-files`
- [ ] User visible changes (including notable bug fixes) are documented in `whats-new.rst`
- [ ] New functions/methods are listed in `api.rst`


<sub>
<h3>
Overriding CI behaviors
</h3>
By default, the upstream dev CI is disabled on pull request and push events. You can override this behavior per commit by adding a <tt>[test-upstream]</tt> tag to the first line of the commit message. For documentation-only commits, you can skip the CI per commit by adding a <tt>[skip-ci]</tt> tag to the first line of the commit message
</sub>
18 changes: 18 additions & 0 deletions asv_bench/benchmarks/repr.py
@@ -0,0 +1,18 @@
import pandas as pd

import xarray as xr


class ReprMultiIndex:
def setup(self, key):
index = pd.MultiIndex.from_product(
[range(10000), range(10000)], names=("level_0", "level_1")
)
series = pd.Series(range(100000000), index=index)
self.da = xr.DataArray(series)

def time_repr(self):
repr(self.da)

def time_repr_html(self):
self.da._repr_html_()
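
For reference, a quick standalone sketch (not part of this commit) of what the ReprMultiIndex benchmark above measures; the 1,000 x 1,000 index size is an assumption chosen to keep it fast interactively, versus 10,000 x 10,000 in the benchmark itself.

import timeit

import pandas as pd
import xarray as xr

# Build a DataArray wrapping a large MultiIndex-backed pandas Series,
# the case that pydata#4846 speeds up.
index = pd.MultiIndex.from_product(
    [range(1_000), range(1_000)], names=("level_0", "level_1")
)
da = xr.DataArray(pd.Series(range(1_000_000), index=index))

# Time the plain repr and the notebook (HTML) repr once each.
print("repr:      ", timeit.timeit(lambda: repr(da), number=1))
print("repr_html: ", timeit.timeit(lambda: da._repr_html_(), number=1))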
15 changes: 10 additions & 5 deletions asv_bench/benchmarks/unstacking.py
@@ -7,18 +7,23 @@

class Unstacking:
def setup(self):
data = np.random.RandomState(0).randn(1, 1000, 500)
self.ds = xr.DataArray(data).stack(flat_dim=["dim_1", "dim_2"])
data = np.random.RandomState(0).randn(500, 1000)
self.da_full = xr.DataArray(data, dims=list("ab")).stack(flat_dim=[...])
self.da_missing = self.da_full[:-1]
self.df_missing = self.da_missing.to_pandas()

def time_unstack_fast(self):
self.ds.unstack("flat_dim")
self.da_full.unstack("flat_dim")

def time_unstack_slow(self):
self.ds[:, ::-1].unstack("flat_dim")
self.da_missing.unstack("flat_dim")

def time_unstack_pandas_slow(self):
self.df_missing.unstack()


class UnstackingDask(Unstacking):
def setup(self, *args, **kwargs):
requires_dask()
super().setup(**kwargs)
self.ds = self.ds.chunk({"flat_dim": 50})
self.da_full = self.da_full.chunk({"flat_dim": 50})
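
A hedged, standalone sketch of the stack/unstack round-trip these benchmarks time, including the missing-value case that pydata#4746 makes faster on numpy-backed arrays; the array sizes here are assumptions, smaller than the benchmark's.

import numpy as np
import xarray as xr

# Stack a 2-D array into a single "flat_dim" MultiIndex dimension.
data = np.random.RandomState(0).randn(100, 200)
da_full = xr.DataArray(data, dims=("a", "b")).stack(flat_dim=["a", "b"])

# Dropping the last stacked element means the unstacked result has holes,
# which is the slower code path exercised by time_unstack_slow.
da_missing = da_full[:-1]

unstacked_full = da_full.unstack("flat_dim")        # no missing values
unstacked_missing = da_missing.unstack("flat_dim")  # missing entries become NaN

# Equivalent pandas operation, mirroring time_unstack_pandas_slow.
unstacked_df = da_missing.to_pandas().unstack()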
2 changes: 2 additions & 0 deletions doc/api.rst
@@ -126,6 +126,7 @@ Indexing
Dataset.isel
Dataset.sel
Dataset.drop_sel
Dataset.drop_isel
Dataset.head
Dataset.tail
Dataset.thin
@@ -308,6 +309,7 @@ Indexing
DataArray.isel
DataArray.sel
DataArray.drop_sel
DataArray.drop_isel
DataArray.head
DataArray.tail
DataArray.thin
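
The newly listed ``drop_isel`` methods work like ``drop_sel`` but take integer positions instead of labels; a short usage sketch (the dimension names, indices, and values below are illustrative).

import numpy as np
import xarray as xr

ds = xr.Dataset({"var": (("x", "y"), np.arange(12).reshape(3, 4))})

# Drop positions 0 and 2 along "x"; the positional counterpart of drop_sel,
# added in pydata#4819. Works even when "x" has no coordinate labels.
smaller = ds.drop_isel(x=[0, 2])

# The DataArray method behaves the same way.
row_dropped = ds["var"].drop_isel(y=0)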
2 changes: 1 addition & 1 deletion doc/conf.py
@@ -411,7 +411,7 @@
intersphinx_mapping = {
"python": ("https://docs.python.org/3/", None),
"pandas": ("https://pandas.pydata.org/pandas-docs/stable", None),
"iris": ("https://scitools.org.uk/iris/docs/latest", None),
"iris": ("https://scitools-iris.readthedocs.io/en/latest", None),
"numpy": ("https://numpy.org/doc/stable", None),
"scipy": ("https://docs.scipy.org/doc/scipy/reference", None),
"numba": ("https://numba.pydata.org/numba-doc/latest", None),
1 change: 1 addition & 0 deletions doc/contributing.rst
@@ -836,6 +836,7 @@ PR checklist
- Write new tests if needed. See `"Test-driven development/code writing" <https://xarray.pydata.org/en/stable/contributing.html#test-driven-development-code-writing>`_.
- Test the code using `Pytest <http://doc.pytest.org/en/latest/>`_. Running all tests (type ``pytest`` in the root directory) takes a while, so feel free to only run the tests you think are needed based on your PR (example: ``pytest xarray/tests/test_dataarray.py``). CI will catch any failing tests.
- By default, the upstream dev CI is disabled on pull request and push events. You can override this behavior per commit by adding a ``[test-upstream]`` tag to the first line of the commit message. For documentation-only commits, you can skip the CI per commit by adding a ``[skip-ci]`` tag to the first line of the commit message.
- **Properly format your code** and verify that it passes the formatting guidelines set by `Black <https://black.readthedocs.io/en/stable/>`_ and `Flake8 <http://flake8.pycqa.org/en/latest/>`_. See `"Code formatting" <https://xarray.pydata.org/en/stable/contributing.html#code-formatting>`_. You can use `pre-commit <https://pre-commit.com/>`_ to run these automatically on each commit.
2 changes: 1 addition & 1 deletion doc/faq.rst
@@ -166,7 +166,7 @@ different approaches to handling metadata: Iris strictly interprets
`CF conventions`_. Iris particularly shines at mapping, thanks to its
integration with Cartopy_.

.. _Iris: http://scitools.org.uk/iris/
.. _Iris: https://scitools-iris.readthedocs.io/en/stable/
.. _Cartopy: http://scitools.org.uk/cartopy/docs/latest/

`UV-CDAT`__ is another Python library that implements in-memory netCDF-like
1 change: 1 addition & 0 deletions doc/related-projects.rst
@@ -15,6 +15,7 @@ Geosciences
- `aospy <https://aospy.readthedocs.io>`_: Automated analysis and management of gridded climate data.
- `climpred <https://climpred.readthedocs.io>`_: Analysis of ensemble forecast models for climate prediction.
- `geocube <https://corteva.github.io/geocube>`_: Tool to convert geopandas vector data into rasterized xarray data.
- `GeoWombat <https://github.com/jgrss/geowombat>`_: Utilities for analysis of remotely sensed and gridded raster data at scale (easily tame Landsat, Sentinel, Quickbird, and PlanetScope).
- `infinite-diff <https://github.com/spencerahill/infinite-diff>`_: xarray-based finite-differencing, focused on gridded climate/meteorology data
- `marc_analysis <https://github.com/darothen/marc_analysis>`_: Analysis package for CESM/MARC experiments and output.
- `MetPy <https://unidata.github.io/MetPy/dev/index.html>`_: A collection of tools in Python for reading, visualizing, and performing calculations with weather data.
25 changes: 22 additions & 3 deletions doc/whats-new.rst
@@ -17,7 +17,7 @@ What's New
.. _whats-new.0.16.3:

v0.16.3 (unreleased)
v0.17.0 (unreleased)
--------------------

Breaking changes
@@ -39,16 +39,32 @@ Breaking changes
always be set such that ``int64`` values can be used. In the past, no units
finer than "seconds" were chosen, which would sometimes mean that ``float64``
values were required, which would lead to inaccurate I/O round-trips.
- remove deprecated ``autoclose`` kwargs from :py:func:`open_dataset` (:pull: `4725`).
  By `Aureliana Barghini <https://github.com/aurghs>`_
- remove deprecated ``autoclose`` kwargs from :py:func:`open_dataset` (:pull:`4725`).
  By `Aureliana Barghini <https://github.com/aurghs>`_.

Deprecations
~~~~~~~~~~~~

- ``dim`` argument to :py:meth:`DataArray.integrate` is being deprecated in
favour of a ``coord`` argument, for consistency with :py:meth:`Dataset.integrate`.
  For now using ``dim`` issues a ``FutureWarning``. By `Tom Nicholas <https://github.com/TomNicholas>`_.


New Features
~~~~~~~~~~~~
- Significantly higher ``unstack`` performance on numpy-backed arrays which
contain missing values; 8x faster in our benchmark, and 2x faster than pandas.
(:pull:`4746`);
  By `Maximilian Roos <https://github.com/max-sixty>`_.

- Performance improvement when constructing DataArrays. Significantly speeds up repr for Datasets with a large number of variables.
  By `Deepak Cherian <https://github.com/dcherian>`_
- Add :py:meth:`Dataset.plot.quiver` for quiver plots with :py:class:`Dataset` variables.
  By `Deepak Cherian <https://github.com/dcherian>`_
  By `Deepak Cherian <https://github.com/dcherian>`_.
- :py:meth:`DataArray.swap_dims` & :py:meth:`Dataset.swap_dims` now accept dims
  in the form of kwargs as well as a dict, like most similar methods.
  By `Maximilian Roos <https://github.com/max-sixty>`_.

Bug fixes
~~~~~~~~~
@@ -82,6 +98,7 @@ Bug fixes
- Expand user directory paths (e.g. ``~/``) in :py:func:`open_mfdataset` and
:py:meth:`Dataset.to_zarr` (:issue:`4783`, :pull:`4795`).
  By `Julien Seguinot <https://github.com/juseg>`_.
- Add :py:meth:`Dataset.drop_isel` and :py:meth:`DataArray.drop_isel` (:issue:`4658`, :pull:`4819`). By `Daniel Mesejo <https://github.com/mesejo>`_.

Documentation
~~~~~~~~~~~~~
@@ -110,6 +127,8 @@ Internal Changes
  By `Maximilian Roos <https://github.com/max-sixty>`_.
- Speed up attribute style access (e.g. ``ds.somevar`` instead of ``ds["somevar"]``) and tab completion
  in ipython (:issue:`4741`, :pull:`4742`). By `Richard Kleijn <https://github.com/rhkleijn>`_.
- Added the ``set_close`` method to ``Dataset`` and ``DataArray`` for backends to specify how to voluntarily release
  all resources (:pull:`4809`). By `Alessandro Amici <https://github.com/alexamici>`_.

.. _whats-new.0.16.2:

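
Two of the user-facing changes listed above, sketched as hedged examples (the array values and coordinate names are made up): ``swap_dims`` now accepts keyword arguments, and ``DataArray.integrate`` takes ``coord`` while ``dim`` is deprecated.

import numpy as np
import xarray as xr

da = xr.DataArray(
    np.arange(4.0),
    dims="x",
    coords={"x": [0.0, 0.1, 0.3, 0.6], "t": ("x", [10, 20, 30, 40])},
)

# swap_dims kwargs form (pydata#4841); equivalent to da.swap_dims({"x": "t"}).
swapped = da.swap_dims(x="t")

# integrate now takes ``coord``; passing ``dim`` issues a FutureWarning (pydata#3993).
integral = da.integrate(coord="x")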
25 changes: 9 additions & 16 deletions xarray/backends/api.py
@@ -522,7 +522,7 @@ def maybe_decode_store(store, chunks):

else:
ds2 = ds
ds2._file_obj = ds._file_obj
ds2.set_close(ds._close)
return ds2

filename_or_obj = _normalize_path(filename_or_obj)
@@ -701,7 +701,7 @@ def open_dataarray(
else:
(data_array,) = dataset.data_vars.values()

data_array._file_obj = dataset._file_obj
data_array.set_close(dataset._close)

# Reset names if they were changed during saving
# to ensure that we can 'roundtrip' perfectly
@@ -715,17 +715,6 @@
return data_array


class _MultiFileCloser:
__slots__ = ("file_objs",)

def __init__(self, file_objs):
self.file_objs = file_objs

def close(self):
for f in self.file_objs:
f.close()


def open_mfdataset(
paths,
chunks=None,
@@ -918,14 +907,14 @@ def open_mfdataset(
getattr_ = getattr

datasets = [open_(p, **open_kwargs) for p in paths]
file_objs = [getattr_(ds, "_file_obj") for ds in datasets]
closers = [getattr_(ds, "_close") for ds in datasets]
if preprocess is not None:
datasets = [preprocess(ds) for ds in datasets]

if parallel:
# calling compute here will return the datasets/file_objs lists,
# the underlying datasets will still be stored as dask arrays
datasets, file_objs = dask.compute(datasets, file_objs)
datasets, closers = dask.compute(datasets, closers)

# Combine all datasets, closing them in case of a ValueError
try:
@@ -963,7 +952,11 @@ def open_mfdataset(
ds.close()
raise

combined._file_obj = _MultiFileCloser(file_objs)
def multi_file_closer():
for closer in closers:
closer()

combined.set_close(multi_file_closer)

# read global attributes from the attrs_file or from the first dataset
if attrs_file is not None:
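
A hedged sketch of the new close hook shown in this diff: user-visible objects now carry a ``_close`` callable registered via ``set_close``, and ``open_mfdataset`` combines per-file closers in a plain closure instead of the removed ``_MultiFileCloser`` class. The dataset contents below are made up.

import xarray as xr

# An in-memory Dataset; real backends would attach a closer that releases
# file handles or other resources.
ds = xr.Dataset({"t": ("x", [1.0, 2.0, 3.0])})

closed = []
ds.set_close(lambda: closed.append(True))  # register the close hook
ds.close()                                 # Dataset.close() now calls it
assert closed == [True]


# Combining several closers, in the style of the new open_mfdataset code.
def make_multi_file_closer(closers):
    def multi_file_closer():
        for close in closers:
            close()

    return multi_file_closer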
2 changes: 1 addition & 1 deletion xarray/backends/apiv2.py
@@ -90,7 +90,7 @@ def _dataset_from_backend_dataset(
**extra_tokens,
)

ds._file_obj = backend_ds._file_obj
ds.set_close(backend_ds._close)

# Ensure source filename always stored in dataset object (GH issue #2550)
if "source" not in ds.encoding:
124 changes: 68 additions & 56 deletions xarray/backends/cfgrib_.py
@@ -5,9 +5,22 @@
from ..core import indexing
from ..core.utils import Frozen, FrozenDict, close_on_error
from ..core.variable import Variable
from .common import AbstractDataStore, BackendArray, BackendEntrypoint
from .common import (
BACKEND_ENTRYPOINTS,
AbstractDataStore,
BackendArray,
BackendEntrypoint,
)
from .locks import SerializableLock, ensure_lock
from .store import open_backend_dataset_store
from .store import StoreBackendEntrypoint

try:
import cfgrib

has_cfgrib = True
except ModuleNotFoundError:
has_cfgrib = False


# FIXME: Add a dedicated lock, even if ecCodes is supposed to be thread-safe
# in most circumstances. See:
@@ -38,7 +51,6 @@ class CfGribDataStore(AbstractDataStore):
"""

def __init__(self, filename, lock=None, **backend_kwargs):
import cfgrib

if lock is None:
lock = ECCODES_LOCK
@@ -74,58 +86,58 @@ def get_encoding(self):
return encoding


def guess_can_open_cfgrib(store_spec):
try:
_, ext = os.path.splitext(store_spec)
except TypeError:
return False
return ext in {".grib", ".grib2", ".grb", ".grb2"}


def open_backend_dataset_cfgrib(
filename_or_obj,
*,
mask_and_scale=True,
decode_times=None,
concat_characters=None,
decode_coords=None,
drop_variables=None,
use_cftime=None,
decode_timedelta=None,
lock=None,
indexpath="{path}.{short_hash}.idx",
filter_by_keys={},
read_keys=[],
encode_cf=("parameter", "time", "geography", "vertical"),
squeeze=True,
time_dims=("time", "step"),
):

store = CfGribDataStore(
class CfgribfBackendEntrypoint(BackendEntrypoint):
def guess_can_open(self, store_spec):
try:
_, ext = os.path.splitext(store_spec)
except TypeError:
return False
return ext in {".grib", ".grib2", ".grb", ".grb2"}

def open_dataset(
self,
filename_or_obj,
indexpath=indexpath,
filter_by_keys=filter_by_keys,
read_keys=read_keys,
encode_cf=encode_cf,
squeeze=squeeze,
time_dims=time_dims,
lock=lock,
)

with close_on_error(store):
ds = open_backend_dataset_store(
store,
mask_and_scale=mask_and_scale,
decode_times=decode_times,
concat_characters=concat_characters,
decode_coords=decode_coords,
drop_variables=drop_variables,
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
*,
mask_and_scale=True,
decode_times=None,
concat_characters=None,
decode_coords=None,
drop_variables=None,
use_cftime=None,
decode_timedelta=None,
lock=None,
indexpath="{path}.{short_hash}.idx",
filter_by_keys={},
read_keys=[],
encode_cf=("parameter", "time", "geography", "vertical"),
squeeze=True,
time_dims=("time", "step"),
):

store = CfGribDataStore(
filename_or_obj,
indexpath=indexpath,
filter_by_keys=filter_by_keys,
read_keys=read_keys,
encode_cf=encode_cf,
squeeze=squeeze,
time_dims=time_dims,
lock=lock,
)
return ds


cfgrib_backend = BackendEntrypoint(
open_dataset=open_backend_dataset_cfgrib, guess_can_open=guess_can_open_cfgrib
)
store_entrypoint = StoreBackendEntrypoint()
with close_on_error(store):
ds = store_entrypoint.open_dataset(
store,
mask_and_scale=mask_and_scale,
decode_times=decode_times,
concat_characters=concat_characters,
decode_coords=decode_coords,
drop_variables=drop_variables,
use_cftime=use_cftime,
decode_timedelta=decode_timedelta,
)
return ds


if has_cfgrib:
BACKEND_ENTRYPOINTS["cfgrib"] = CfgribfBackendEntrypoint