Skip to content

Commit

Permalink
Better chunking error messages for zarr backend (#3983)
Browse files Browse the repository at this point in the history
  • Loading branch information
dcherian authored Apr 22, 2020
1 parent 0cd14a5 commit e1f0f98
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 24 deletions.
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ Documentation

Internal Changes
~~~~~~~~~~~~~~~~
- Raise more informative error messages for chunk size conflicts when writing to zarr files.
By `Deepak Cherian <https://github.com/dcherian>`_.
- Run the ``isort`` pre-commit hook only on python source files
and update the ``flake8`` version. (:issue:`3750`, :pull:`3711`)
By `Justus Magin <https://github.com/keewis>`_.
Expand Down
50 changes: 29 additions & 21 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def __getitem__(self, key):
# could possibly have a work-around for 0d data here


def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):
def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name):
"""
Given encoding chunks (possibly None) and variable chunks (possibly None)
"""
Expand All @@ -88,15 +88,16 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):
if var_chunks and enc_chunks is None:
if any(len(set(chunks[:-1])) > 1 for chunks in var_chunks):
raise ValueError(
"Zarr requires uniform chunk sizes except for final chunk."
" Variable dask chunks %r are incompatible. Consider "
"rechunking using `chunk()`." % (var_chunks,)
"Zarr requires uniform chunk sizes except for final chunk. "
f"Variable named {name!r} has incompatible dask chunks: {var_chunks!r}. "
"Consider rechunking using `chunk()`."
)
if any((chunks[0] < chunks[-1]) for chunks in var_chunks):
raise ValueError(
"Final chunk of Zarr array must be the same size or smaller "
"than the first. Variable Dask chunks %r are incompatible. "
"Consider rechunking using `chunk()`." % var_chunks
f"than the first. Variable named {name!r} has incompatible Dask chunks {var_chunks!r}. "
"Consider either rechunking using `chunk()` or instead deleting "
"or modifying `encoding['chunks']`."
)
# return the first chunk for each dimension
return tuple(chunk[0] for chunk in var_chunks)
Expand All @@ -114,13 +115,15 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):

if len(enc_chunks_tuple) != ndim:
# throw away encoding chunks, start over
return _determine_zarr_chunks(None, var_chunks, ndim)
return _determine_zarr_chunks(None, var_chunks, ndim, name)

for x in enc_chunks_tuple:
if not isinstance(x, int):
raise TypeError(
"zarr chunks must be an int or a tuple of ints. "
"Instead found %r" % (enc_chunks_tuple,)
"zarr chunk sizes specified in `encoding['chunks']` "
"must be an int or a tuple of ints. "
f"Instead found encoding['chunks']={enc_chunks_tuple!r} "
f"for variable named {name!r}."
)

# if there are chunks in encoding and the variable data is a numpy array,
Expand All @@ -142,19 +145,22 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim):
for dchunk in dchunks[:-1]:
if dchunk % zchunk:
raise NotImplementedError(
"Specified zarr chunks %r would overlap multiple dask "
"chunks %r. This is not implemented in xarray yet. "
" Consider rechunking the data using "
"`chunk()` or specifying different chunks in encoding."
% (enc_chunks_tuple, var_chunks)
f"Specified zarr chunks encoding['chunks']={enc_chunks_tuple!r} for "
f"variable named {name!r} would overlap multiple dask chunks {var_chunks!r}. "
"This is not implemented in xarray yet. "
"Consider either rechunking using `chunk()` or instead deleting "
"or modifying `encoding['chunks']`."
)
if dchunks[-1] > zchunk:
raise ValueError(
"Final chunk of Zarr array must be the same size or "
"smaller than the first. The specified Zarr chunk "
"encoding is %r, but %r in variable Dask chunks %r is "
"incompatible. Consider rechunking using `chunk()`."
% (enc_chunks_tuple, dchunks, var_chunks)
"smaller than the first. "
f"Specified Zarr chunk encoding['chunks']={enc_chunks_tuple}, "
f"for variable named {name!r} "
f"but {dchunks} in the variable's Dask chunks {var_chunks} is "
"incompatible with this encoding. "
"Consider either rechunking using `chunk()` or instead deleting "
"or modifying `encoding['chunks']`."
)
return enc_chunks_tuple

Expand All @@ -177,7 +183,7 @@ def _get_zarr_dims_and_attrs(zarr_obj, dimension_key):
return dimensions, attributes


def extract_zarr_variable_encoding(variable, raise_on_invalid=False):
def extract_zarr_variable_encoding(variable, raise_on_invalid=False, name=None):
"""
Extract zarr encoding dictionary from xarray Variable
Expand Down Expand Up @@ -207,7 +213,7 @@ def extract_zarr_variable_encoding(variable, raise_on_invalid=False):
del encoding[k]

chunks = _determine_zarr_chunks(
encoding.get("chunks"), variable.chunks, variable.ndim
encoding.get("chunks"), variable.chunks, variable.ndim, name
)
encoding["chunks"] = chunks
return encoding
Expand Down Expand Up @@ -453,7 +459,9 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=No
writer.add(v.data, zarr_array, region=tuple(new_region))
else:
# new variable
encoding = extract_zarr_variable_encoding(v, raise_on_invalid=check)
encoding = extract_zarr_variable_encoding(
v, raise_on_invalid=check, name=vn
)
encoded_attrs = {}
# the magic for storing the hidden dimension data
encoded_attrs[DIMENSION_KEY] = dims
Expand Down
22 changes: 19 additions & 3 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -1685,11 +1685,27 @@ def test_chunk_encoding_with_dask(self):

# should fail if dask_chunks are irregular...
ds_chunk_irreg = ds.chunk({"x": (5, 4, 3)})
with pytest.raises(ValueError) as e_info:
with raises_regex(ValueError, "uniform chunk sizes."):
with self.roundtrip(ds_chunk_irreg) as actual:
pass
# make sure this error message is correct and not some other error
assert e_info.match("chunks")

# should fail if encoding["chunks"] clashes with dask_chunks
badenc = ds.chunk({"x": 4})
badenc.var1.encoding["chunks"] = (6,)
with raises_regex(NotImplementedError, "named 'var1' would overlap"):
with self.roundtrip(badenc) as actual:
pass

badenc.var1.encoding["chunks"] = (2,)
with raises_regex(ValueError, "Specified Zarr chunk encoding"):
with self.roundtrip(badenc) as actual:
pass

badenc = badenc.chunk({"x": (3, 3, 6)})
badenc.var1.encoding["chunks"] = (3,)
with raises_regex(ValueError, "incompatible with this encoding"):
with self.roundtrip(badenc) as actual:
pass

# ... except if the last chunk is smaller than the first
ds_chunk_irreg = ds.chunk({"x": (5, 5, 2)})
Expand Down

0 comments on commit e1f0f98

Please sign in to comment.