xarray.open_mzarr: open multiple zarr files (in parallel) #4003

Closed
wants to merge 58 commits
Changes from 39 commits
58 commits
f55ed1c
create def for multiple zarr files and added commentary/definition, w…
Apr 23, 2020
49f6512
just as with ``xr.open_mfdatasets``, identify the paths as local dire…
Apr 23, 2020
f35a3e5
added error if no path
Apr 23, 2020
9f728aa
finished copying similar code from `xr.open_mfdatasets`
Apr 23, 2020
8d0a844
remove blank lines
Apr 23, 2020
b3b0f1d
fixed typo
Apr 23, 2020
2221943
added ``xr.open_mzarr()`` to the list of available modules to call
Apr 23, 2020
ac35e7c
imported missing function
Apr 23, 2020
64654f3
imported missing glob
Apr 23, 2020
d5a5cef
imported function from backend.api
Apr 23, 2020
4c0ef19
imported function to facilitate mzarr
Apr 23, 2020
d158c21
correctly imported functions from core to mzarr
Apr 23, 2020
5171420
imported to use on open_mzarr
Apr 23, 2020
e1e51bb
removed lock and autoclose since not taken by ``open_zarr``
Apr 23, 2020
b6bf2cf
fixed typo
Apr 23, 2020
3bc4be8
class is not needed since zarr stores don't remain open
Apr 23, 2020
a79b125
removed old behavior
Apr 23, 2020
2d3bbb5
set default
Apr 23, 2020
f7cf580
listed open_mzarr
Apr 25, 2020
53c8623
removed unused imported function
Apr 25, 2020
34d755e
imported Path - hadn't before
Apr 25, 2020
b39b37e
remove unnecessary comments
Apr 25, 2020
276006a
modified comments
Apr 25, 2020
6f04be6
isorted zarr
Apr 25, 2020
aa97e1a
isorted
Apr 25, 2020
06de16a
erased open_mzarr. Added capability to open_dataset to open zarr files
Apr 28, 2020
f94fc9f
removed imported but unused
Apr 28, 2020
16e08e3
comment to `zarr` engine
Apr 28, 2020
22828fc
added chunking code from `open_zarr`
Apr 28, 2020
021f2cc
remove import ``open_mzarr``
Apr 28, 2020
985f28c
removed ``open_mzarr`` from top-level function
Apr 28, 2020
e8ed887
missing return in nested function
Apr 29, 2020
d693514
moved outside of nested function, had trouble with reading before assi…
Apr 29, 2020
df34f18
added missing argument associated with zarr stores, onto the definiti…
Apr 29, 2020
98351c7
isort zarr.py
Apr 29, 2020
160bd67
removed blank lines, fixed typo on `chunks`
Apr 29, 2020
7e57e9b
removed imported but unused
Apr 29, 2020
ac0f093
restored conditional for `auto`
Apr 29, 2020
6a1516c
removed imported but unused `dask.array`
Apr 29, 2020
8999faf
added capabilities for file_or_obj to be a mutablemapping such as `fss…
Apr 29, 2020
5df0985
moved to a different conditional since file_or_obj is a mutablemappin…
Apr 29, 2020
2d94ea2
isort api.py
Apr 29, 2020
377ef53
restored the option for when file_or_obj is a str, such as a URL.
Apr 29, 2020
f48c84b
fixed relabel
Apr 29, 2020
8376cca
update open_dataset for zarr files
Apr 29, 2020
aed1cc5
remove open_zarr from tests, now open_dataset(engine="zarr")
Apr 29, 2020
b488363
remove extra file, and raise deprecation warning on open_zarr
Apr 29, 2020
bae7f10
added internal call to open_dataset from deprecated open_zarr
Apr 29, 2020
37ff214
defined engine="zarr"
Apr 29, 2020
b8b98f5
correct argument for open_dataset
Apr 29, 2020
5c37329
pass arguments as backend_kwargs
Apr 29, 2020
831f15b
pass backend_kwargs as argument
Apr 29, 2020
80dd7da
typo
Apr 29, 2020
4ebf380
set ``overwrite_encoded_chunks`` as backend_kwargs
Apr 29, 2020
4ce3007
do not pass as backend, use for chunking
Apr 29, 2020
89a780b
removed commented code
May 22, 2020
6f6eb23
moved definitions to zarr backends
May 22, 2020
62893ab
Merge pull request #1 from Mikejmnez/new_branch
May 22, 2020
Binary file added default.profraw
143 changes: 110 additions & 33 deletions xarray/backends/api.py
@@ -303,6 +303,7 @@ def open_dataset(
drop_variables=None,
backend_kwargs=None,
use_cftime=None,
overwrite_encoded_chunks=False,
):
"""Open and decode a dataset from a file or file-like object.

@@ -345,7 +346,7 @@ def open_dataset(
If True, decode the 'coordinates' attribute to identify coordinates in
the resulting dataset.
engine : {'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio', 'cfgrib', \
'pseudonetcdf'}, optional
'pseudonetcdf', 'zarr'}, optional
Engine to use when reading files. If not provided, the default engine
is chosen based on available dependencies, with a preference for
'netcdf4'.
@@ -383,6 +384,10 @@ def open_dataset(
represented using ``np.datetime64[ns]`` objects. If False, always
decode times to ``np.datetime64[ns]`` objects; if this is not possible
raise an error.
overwrite_encoded_chunks : bool, optional
Whether to drop the zarr chunks encoded for each variable when a
dataset is loaded with specified chunk sizes (default: False)
Member: Since this only applies to the zarr backend, would it make sense to pass this through backend_kwargs?

Contributor Author: You are right, it does make sense not to make it an argument for open_dataset, and instead pass it to open_dataset through backend_kwargs from the deprecated open_zarr (i.e. zarr.py). However, it is not an argument for ZarrStore.open_group; it is used within maybe_chunk. I had to pop it and use it before passing backend_kwargs to open_group in zarr.py.

Member: Could we add overwrite_encoded_chunks as a new argument in ZarrStore.open_group? That seems preferable to adding another highly backend-specific argument to open_dataset.

(To be honest, I'm not entirely sure why we need overwrite_encoded_chunks... it's not obvious from the description why it's useful)

Contributor: Quoting from #2530 (comment), where overwrite_encoded_chunks was initially added.

It seems reasonable that anyone manually specifying chunks may want to rewrite the dataset in those chunks, and the error that arises when the encoded Zarr chunks mismatch the variable Dask chunks may quickly get annoying. overwrite_encoded_chunks=True sets the encoded chunks to None so there is no clash.

As mentioned by @Mikejmnez, overwrite_encoded_chunks is not actually used by ZarrStore.open_group; it is used at the open_dataset level. So it's not strictly a 'backend_kwarg', but we can still put it in there, I suppose? Just looking through the code now to see how best to do it.
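A minimal sketch of the call pattern this thread converges on, assuming ``overwrite_encoded_chunks`` is routed through ``backend_kwargs`` and popped before the remaining kwargs reach ``ZarrStore.open_group`` (store path and chunk sizes are hypothetical):

    import xarray as xr

    # The zarr-only option rides along in backend_kwargs rather than
    # being a top-level open_dataset argument.
    ds = xr.open_dataset(
        "store.zarr",
        engine="zarr",
        chunks={"time": 10},
        backend_kwargs={"overwrite_encoded_chunks": True},
    )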



Returns
-------
@@ -409,7 +414,9 @@ def open_dataset(
"pynio",
"cfgrib",
"pseudonetcdf",
"zarr",
]

if engine not in engines:
raise ValueError(
"unrecognized engine for open_dataset: {}\n"
@@ -455,36 +462,7 @@ def maybe_decode_store(store, lock=False):

_protect_dataset_variables_inplace(ds, cache)

if chunks is not None:
from dask.base import tokenize

# if passed an actual file path, augment the token with
# the file modification time
if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj):
mtime = os.path.getmtime(filename_or_obj)
else:
mtime = None
token = tokenize(
filename_or_obj,
mtime,
group,
decode_cf,
mask_and_scale,
decode_times,
concat_characters,
decode_coords,
engine,
chunks,
drop_variables,
use_cftime,
)
name_prefix = "open_dataset-%s" % token
ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)
ds2._file_obj = ds._file_obj
else:
ds2 = ds

return ds2
return ds

if isinstance(filename_or_obj, Path):
filename_or_obj = str(filename_or_obj)
@@ -519,7 +497,14 @@ def maybe_decode_store(store, lock=False):
store = backends.CfGribDataStore(
filename_or_obj, lock=lock, **backend_kwargs
)

elif engine == "zarr":
# on ZarrStore, mode='r', synchronizer=None, group=None,
# consolidated=False.
store = backends.ZarrStore.open_group(
filename_or_obj,
group=group,
**backend_kwargs
)
else:
if engine not in [None, "scipy", "h5netcdf"]:
raise ValueError(
@@ -542,7 +527,99 @@ def maybe_decode_store(store, lock=False):
if isinstance(filename_or_obj, str):
ds.encoding["source"] = filename_or_obj

return ds
if chunks is not None:
from dask.base import tokenize
if engine != 'zarr':

# if passed an actual file path, augment the token with
# the file modification time
if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj):
mtime = os.path.getmtime(filename_or_obj)
else:
mtime = None
token = tokenize(
filename_or_obj,
mtime,
group,
decode_cf,
mask_and_scale,
decode_times,
concat_characters,
decode_coords,
engine,
chunks,
drop_variables,
use_cftime,
)
name_prefix = "open_dataset-%s" % token
ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token)
ds2._file_obj = ds._file_obj

else: # file is zarr!

# adapted from Dataset.Chunk() and taken from open_zarr
if not isinstance(chunks, (int, dict)):
if chunks != "auto":
raise ValueError(
"chunks must be an int, dict, 'auto', or None. "
"Instead found %s. " % chunks
)
if isinstance(chunks, int):
chunks = dict.fromkeys(ds.dims, chunks)

if isinstance(chunks, tuple) and len(chunks) == len(ds.dims):
chunks = dict(zip(ds.dims, chunks))
Member: I don't think this line is reachable. How could chunks ever be a tuple when line 561 checks against (int, dict)?

Contributor Author: Yeah, it doesn't look like it's reachable. I'll remove it.
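A standalone sketch of the reviewer's point, using the guard from the diff above (the tuple value is illustrative): a tuple fails the ``isinstance`` check and is not ``"auto"``, so it raises before the later ``isinstance(chunks, tuple)`` branch could ever run.

    chunks = (5, 10)  # hypothetical tuple input
    if not isinstance(chunks, (int, dict)):
        if chunks != "auto":
            # A tuple always lands here, so the tuple-handling branch
            # further down is dead code.
            raise ValueError(
                "chunks must be an int, dict, 'auto', or None. "
                "Instead found %s. " % (chunks,)
            )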


def get_chunk(name, var, chunks):
chunk_spec = dict(zip(var.dims, var.encoding.get("chunks")))

# Coordinate labels aren't chunked
if var.ndim == 1 and var.dims[0] == name:
return chunk_spec

if chunks == "auto":
return chunk_spec

for dim in var.dims:
if dim in chunks:
spec = chunks[dim]
if isinstance(spec, int):
spec = (spec,)
if isinstance(spec, (tuple, list)) and chunk_spec[dim]:
if any(s % chunk_spec[dim] for s in spec):
warnings.warn(
"Specified Dask chunks %r would "
"separate Zarr chunk shape %r for "
"dimension %r. This significantly "
"degrades performance. Consider "
"rechunking after loading instead."
% (chunks[dim], chunk_spec[dim], dim),
stacklevel=2,
)
chunk_spec[dim] = chunks[dim]
return chunk_spec

def maybe_chunk(name, var, chunks):
Member: I think I have a slight preference to relocate maybe_chunk and get_chunk to the zarr.py module. Just skimming here, but it doesn't seem like they are using local function state.

Contributor Author: It does make sense, and it would make the code cleaner and easier to read and/or debug. Do you think it should be implemented within ZarrStore.open_group?

chunk_spec = get_chunk(name, var, chunks)

if (var.ndim > 0) and (chunk_spec is not None):
# does this cause any data to be read?
token2 = tokenize(name, var._data)
name2 = "zarr-%s" % token2
var = var.chunk(chunk_spec, name=name2, lock=None)
if overwrite_encoded_chunks and var.chunks is not None:
var.encoding["chunks"] = tuple(x[0] for x in var.chunks)
return var
else:
return var

variables = {k: maybe_chunk(k, v, chunks) for k, v in ds.variables.items()}
ds2 = ds._replace_vars_and_dims(variables)
return ds2
else:
ds2 = ds

return ds2
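As an aside on the alignment warning in ``get_chunk`` above, a hedged example (store path and chunk sizes hypothetical): if a variable is encoded with zarr chunks of 10 along ``x``, requesting dask chunks of 7 triggers the warning, since ``7 % 10 != 0`` and the requested chunks would split the stored ones.

    import xarray as xr

    # Encoded zarr chunks along "x" are 10 in this hypothetical store;
    # 7 is not a multiple, so opening warns that the specified dask
    # chunks would separate zarr chunks and degrade performance.
    ds = xr.open_dataset("store.zarr", engine="zarr", chunks={"x": 7})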
Member: Since we return in the block above, I think it would be okay to also return in line 620.

Contributor Author: This part I am not so sure I completely understand. ds2 gets created if chunks is not None. If chunks is None then ds is passed, so we need ds2 = ds.

This brings up a question: can we just have

ds = ds.chunk(chunks, ...)

instead of

ds2 = ds.chunk(chunks, ...)
ds2._file_obj = ds._file_obj

That way I could just return ds on line 620.
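A minimal sketch of the rebinding the author is proposing, assuming the file handle still has to be carried across ``Dataset.chunk`` (names taken from the diff above):

    # Chunk and rebind to `ds`, preserving the underlying file handle,
    # so the function can end with a single `return ds`.
    file_obj = ds._file_obj
    ds = ds.chunk(chunks, name_prefix=name_prefix, token=token)
    ds._file_obj = file_obj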



def open_dataarray(