Skip to content

Commit

Permalink
Don't set encoding attributes on bounds variables. (#2965)
Browse files Browse the repository at this point in the history
* Don't set attributes on bounds variables.

Fixes #2921

1. Removes certain attributes from bounds variables on encode.
2. open_mfdataset: Sets encoding on variables based on encoding in first file.

* remove whitespace stuff.

* Make sure variable exists in first file before assigning encoding

* Make sure we iterate over coords too.

* lint fix.

* docs/comment fixes.

* mfdataset encoding test.

* time_bounds attrs test + allow for slight CF non-compliance.

* I need to deal with encoding!

* minor fixes.

* another minor fix.

* review fixes.

* lint fixes.

* Remove encoding changes and xfail test.

* Update whats-new.rst
  • Loading branch information
dcherian authored Jun 25, 2019
1 parent b054c31 commit 76adf13
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 12 deletions.
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ Enhancements
Bug fixes
~~~~~~~~~

- Don't set encoding attributes on bounds variables when writing to netCDF.
(:issue:`2921`)
By `Deepak Cherian <https://github.com/dcherian>`_.
- NetCDF4 output: variables with unlimited dimensions must be chunked (not
contiguous) on output. (:issue:`1849`)
By `James McCreight <https://github.com/jmccreight>`_.
Expand Down
85 changes: 75 additions & 10 deletions xarray/conventions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .coding import strings, times, variables
from .coding.variables import SerializationWarning
from .core import duck_array_ops, indexing
from .core.common import contains_cftime_datetimes
from .core.pycompat import dask_array_type
from .core.variable import IndexVariable, Variable, as_variable

Expand Down Expand Up @@ -355,6 +356,51 @@ def _update_bounds_attributes(variables):
bounds_attrs.setdefault('calendar', attrs['calendar'])


def _update_bounds_encoding(variables):
"""Adds time encoding to time bounds variables.
Variables handling time bounds ("Cell boundaries" in the CF
conventions) do not necessarily carry the necessary attributes to be
decoded. This copies the encoding from the time variable to the
associated bounds variable so that we write CF-compliant files.
See Also:
http://cfconventions.org/Data/cf-conventions/cf-conventions-1.7/
cf-conventions.html#cell-boundaries
https://github.com/pydata/xarray/issues/2565
"""

# For all time variables with bounds
for v in variables.values():
attrs = v.attrs
encoding = v.encoding
has_date_units = 'units' in encoding and 'since' in encoding['units']
is_datetime_type = (np.issubdtype(v.dtype, np.datetime64) or
contains_cftime_datetimes(v))

if (is_datetime_type and not has_date_units and
'bounds' in attrs and attrs['bounds'] in variables):
warnings.warn("Variable '{0}' has datetime type and a "
"bounds variable but {0}.encoding does not have "
"units specified. The units encodings for '{0}' "
"and '{1}' will be determined independently "
"and may not be equal, counter to CF-conventions. "
"If this is a concern, specify a units encoding for "
"'{0}' before writing to a file."
.format(v.name, attrs['bounds']),
UserWarning)

if has_date_units and 'bounds' in attrs:
if attrs['bounds'] in variables:
bounds_encoding = variables[attrs['bounds']].encoding
bounds_encoding.setdefault('units', encoding['units'])
if 'calendar' in encoding:
bounds_encoding.setdefault('calendar',
encoding['calendar'])


def decode_cf_variables(variables, attributes, concat_characters=True,
mask_and_scale=True, decode_times=True,
decode_coords=True, drop_variables=None,
Expand Down Expand Up @@ -492,8 +538,6 @@ def cf_decoder(variables, attributes,
"""
Decode a set of CF encoded variables and attributes.
See Also, decode_cf_variable
Parameters
----------
variables : dict
Expand All @@ -515,6 +559,10 @@ def cf_decoder(variables, attributes,
A dictionary mapping from variable name to xarray.Variable objects.
decoded_attributes : dict
A dictionary mapping from attribute name to values.
See also
--------
decode_cf_variable
"""
variables, attributes, _ = decode_cf_variables(
variables, attributes, concat_characters, mask_and_scale, decode_times)
Expand Down Expand Up @@ -595,14 +643,12 @@ def encode_dataset_coordinates(dataset):

def cf_encoder(variables, attributes):
"""
A function which takes a dicts of variables and attributes
and encodes them to conform to CF conventions as much
as possible. This includes masking, scaling, character
array handling, and CF-time encoding.
Decode a set of CF encoded variables and attributes.
Encode a set of CF encoded variables and attributes.
Takes a dicts of variables and attributes and encodes them
to conform to CF conventions as much as possible.
This includes masking, scaling, character array handling,
and CF-time encoding.
See Also, decode_cf_variable
Parameters
----------
Expand All @@ -618,8 +664,27 @@ def cf_encoder(variables, attributes):
encoded_attributes : dict
A dictionary mapping from attribute name to value
See also: encode_cf_variable
See also
--------
decode_cf_variable, encode_cf_variable
"""

# add encoding for time bounds variables if present.
_update_bounds_encoding(variables)

new_vars = OrderedDict((k, encode_cf_variable(v, name=k))
for k, v in variables.items())

# Remove attrs from bounds variables (issue #2921)
for var in new_vars.values():
bounds = var.attrs['bounds'] if 'bounds' in var.attrs else None
if bounds and bounds in new_vars:
# see http://cfconventions.org/cf-conventions/cf-conventions.html#cell-boundaries # noqa
for attr in ['units', 'standard_name', 'axis', 'positive',
'calendar', 'long_name', 'leap_month', 'leap_year',
'month_lengths']:
if attr in new_vars[bounds].attrs and attr in var.attrs:
if new_vars[bounds].attrs[attr] == var.attrs[attr]:
new_vars[bounds].attrs.pop(attr)

return new_vars, attributes
21 changes: 21 additions & 0 deletions xarray/tests/test_backends.py
Original file line number Diff line number Diff line change
Expand Up @@ -2486,6 +2486,27 @@ def test_attrs_mfdataset(self):
'no attribute'):
actual.test2

@pytest.mark.xfail(reason='mfdataset loses encoding currently.')
def test_encoding_mfdataset(self):
original = Dataset({'foo': ('t', np.random.randn(10)),
't': ('t', pd.date_range(start='2010-01-01',
periods=10,
freq='1D'))})
original.t.encoding['units'] = 'days since 2010-01-01'

with create_tmp_file() as tmp1:
with create_tmp_file() as tmp2:
ds1 = original.isel(t=slice(5))
ds2 = original.isel(t=slice(5, 10))
ds1.t.encoding['units'] = 'days since 2010-01-01'
ds2.t.encoding['units'] = 'days since 2000-01-01'
ds1.to_netcdf(tmp1)
ds2.to_netcdf(tmp2)
with open_mfdataset([tmp1, tmp2]) as actual:
assert actual.t.encoding['units'] == original.t.encoding['units'] # noqa
assert actual.t.encoding['units'] == ds1.t.encoding['units'] # noqa
assert actual.t.encoding['units'] != ds2.t.encoding['units'] # noqa

def test_preprocess_mfdataset(self):
original = Dataset({'foo': ('x', np.random.randn(10))})
with create_tmp_file() as tmp:
Expand Down
49 changes: 47 additions & 2 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
import pandas as pd
import pytest

from xarray import DataArray, Variable, coding, decode_cf
from xarray import DataArray, Dataset, Variable, coding, decode_cf
from xarray.coding.times import (
_import_cftime, cftime_to_nptime, decode_cf_datetime, encode_cf_datetime)
from xarray.coding.variables import SerializationWarning
from xarray.conventions import _update_bounds_attributes
from xarray.conventions import _update_bounds_attributes, cf_encoder
from xarray.core.common import contains_cftime_datetimes
from xarray.testing import assert_equal

Expand Down Expand Up @@ -671,6 +671,51 @@ def test_decode_cf_time_bounds():
_update_bounds_attributes(ds.variables)


@requires_cftime_or_netCDF4
def test_encode_time_bounds():

time = pd.date_range('2000-01-16', periods=1)
time_bounds = pd.date_range('2000-01-01', periods=2, freq='MS')
ds = Dataset(dict(time=time, time_bounds=time_bounds))
ds.time.attrs = {'bounds': 'time_bounds'}
ds.time.encoding = {'calendar': 'noleap',
'units': 'days since 2000-01-01'}

expected = dict()
# expected['time'] = Variable(data=np.array([15]), dims=['time'])
expected['time_bounds'] = Variable(data=np.array([0, 31]),
dims=['time_bounds'])

encoded, _ = cf_encoder(ds.variables, ds.attrs)
assert_equal(encoded['time_bounds'], expected['time_bounds'])
assert 'calendar' not in encoded['time_bounds'].attrs
assert 'units' not in encoded['time_bounds'].attrs

# if time_bounds attrs are same as time attrs, it doesn't matter
ds.time_bounds.encoding = {'calendar': 'noleap',
'units': 'days since 2000-01-01'}
encoded, _ = cf_encoder({k: ds[k] for k in ds.variables},
ds.attrs)
assert_equal(encoded['time_bounds'], expected['time_bounds'])
assert 'calendar' not in encoded['time_bounds'].attrs
assert 'units' not in encoded['time_bounds'].attrs

# for CF-noncompliant case of time_bounds attrs being different from
# time attrs; preserve them for faithful roundtrip
ds.time_bounds.encoding = {'calendar': 'noleap',
'units': 'days since 1849-01-01'}
encoded, _ = cf_encoder({k: ds[k] for k in ds.variables},
ds.attrs)
with pytest.raises(AssertionError):
assert_equal(encoded['time_bounds'], expected['time_bounds'])
assert 'calendar' not in encoded['time_bounds'].attrs
assert encoded['time_bounds'].attrs['units'] == ds.time_bounds.encoding['units'] # noqa

ds.time.encoding = {}
with pytest.warns(UserWarning):
cf_encoder(ds.variables, ds.attrs)


@pytest.fixture(params=_ALL_CALENDARS)
def calendar(request):
return request.param
Expand Down

0 comments on commit 76adf13

Please sign in to comment.