Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure maximum accuracy when encoding and decoding cftime.datetime values #4758

Merged
merged 12 commits into from
Feb 10, 2021
7 changes: 7 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,13 @@ Deprecations

New Features
~~~~~~~~~~~~
- Xarray now leverages updates as of cftime version 1.4.1, which enable exact I/O
rountripping of ``cftime.datetime`` objects (:pull:`4758`).
By `Spencer Clark <https://github.com/spencerkclark>`_.
- :py:meth:`~xarray.cftime_range` and :py:meth:`DataArray.resample` now support
millisecond (``"L"`` or ``"ms"``) and microsecond (``"U"`` or ``"us"``) frequencies
for ``cftime.datetime`` coordinates (:issue:`4097`, :pull:`4758`).
By `Spencer Clark <https://github.com/spencerkclark>`_.
- Significantly higher ``unstack`` performance on numpy-backed arrays which
contain missing values; 8x faster in our benchmark, and 2x faster than pandas.
(:pull:`4746`);
Expand Down
30 changes: 29 additions & 1 deletion xarray/coding/cftime_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -576,6 +576,26 @@ def __apply__(self, other):
return other + self.as_timedelta()


class Millisecond(BaseCFTimeOffset):
    """Offset of ``n`` milliseconds (frequency alias ``"L"``/``"ms"``)."""

    _freq = "L"

    def as_timedelta(self):
        """Express this offset as a ``datetime.timedelta``."""
        return timedelta(milliseconds=self.n)

    def __apply__(self, other):
        """Shift *other* forward by this offset."""
        delta = self.as_timedelta()
        return other + delta


class Microsecond(BaseCFTimeOffset):
    """Offset of ``n`` microseconds (frequency alias ``"U"``/``"us"``)."""

    _freq = "U"

    def as_timedelta(self):
        """Express this offset as a ``datetime.timedelta``."""
        return timedelta(microseconds=self.n)

    def __apply__(self, other):
        """Shift *other* forward by this offset."""
        delta = self.as_timedelta()
        return other + delta


_FREQUENCIES = {
"A": YearEnd,
"AS": YearBegin,
Expand All @@ -590,6 +610,10 @@ def __apply__(self, other):
"T": Minute,
"min": Minute,
"S": Second,
"L": Millisecond,
"ms": Millisecond,
"U": Microsecond,
"us": Microsecond,
"AS-JAN": partial(YearBegin, month=1),
"AS-FEB": partial(YearBegin, month=2),
"AS-MAR": partial(YearBegin, month=3),
Expand Down Expand Up @@ -824,7 +848,7 @@ def cftime_range(
`ISO-8601 format <https://en.wikipedia.org/wiki/ISO_8601>`_.
- It supports many, but not all, frequencies supported by
``pandas.date_range``. For example it does not currently support any of
the business-related, semi-monthly, or sub-second frequencies.
the business-related or semi-monthly frequencies.
- Compound sub-monthly frequencies are not supported, e.g. '1H1min', as
these can easily be written in terms of the finest common resolution,
e.g. '61min'.
Expand Down Expand Up @@ -855,6 +879,10 @@ def cftime_range(
+--------+--------------------------+
| S | Second frequency |
+--------+--------------------------+
| L, ms | Millisecond frequency |
+--------+--------------------------+
| U, us | Microsecond frequency |
+--------+--------------------------+

Any multiples of the following anchored offsets are also supported.

Expand Down
74 changes: 48 additions & 26 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import re
import warnings
from datetime import datetime
from datetime import datetime, timedelta
from distutils.version import LooseVersion
from functools import partial

Expand Down Expand Up @@ -35,6 +35,26 @@
"D": int(1e9) * 60 * 60 * 24,
}

# Number of microseconds in one unit of each netCDF time unit; used by
# _unit_timedelta_cftime to build datetime.timedelta ticks for cftime dates.
_US_PER_TIME_DELTA = {
    "microseconds": 1,
    "milliseconds": 1_000,
    "seconds": 1_000_000,
    "minutes": 60 * 1_000_000,
    "hours": 60 * 60 * 1_000_000,
    "days": 24 * 60 * 60 * 1_000_000,
}

# Candidate netCDF time units for cftime dates, ordered coarsest to finest;
# iterated by _infer_time_units_from_diff to pick the coarsest exact unit.
_NETCDF_TIME_UNITS_CFTIME = [
    "days",
    "hours",
    "minutes",
    "seconds",
    "milliseconds",
    "microseconds",
]

# numpy-backed (datetime64) dates additionally allow nanosecond units.
_NETCDF_TIME_UNITS_NUMPY = _NETCDF_TIME_UNITS_CFTIME + ["nanoseconds"]

TIME_UNITS = frozenset(
[
"days",
Expand Down Expand Up @@ -225,9 +245,7 @@ def decode_cf_datetime(num_dates, units, calendar=None, use_cftime=None):
if calendar in _STANDARD_CALENDARS:
dates = cftime_to_nptime(dates)
elif use_cftime:
dates = _decode_datetime_with_cftime(
flat_num_dates.astype(float), units, calendar
)
dates = _decode_datetime_with_cftime(flat_num_dates, units, calendar)
else:
dates = _decode_datetime_with_pandas(flat_num_dates, units, calendar)

Expand Down Expand Up @@ -262,25 +280,33 @@ def decode_cf_timedelta(num_timedeltas, units):
return result.reshape(num_timedeltas.shape)


def _unit_timedelta_cftime(units):
    """Return one tick of *units* (a netCDF time-unit name) as a ``datetime.timedelta``."""
    microseconds = _US_PER_TIME_DELTA[units]
    return timedelta(microseconds=microseconds)


def _unit_timedelta_numpy(units):
    """Return one tick of *units* (a netCDF time-unit name) as a nanosecond ``np.timedelta64``."""
    as_numpy_unit = _netcdf_to_numpy_timeunit(units)
    nanoseconds = _NS_PER_TIME_DELTA[as_numpy_unit]
    return np.timedelta64(nanoseconds, "ns")


def _infer_time_units_from_diff(unique_timedeltas):
    """Return the coarsest netCDF time unit that evenly divides all timedeltas.

    Falls back to "seconds" when no candidate unit divides every value
    exactly.  The diff rendering of this function interleaved the old and
    new bodies; this is the coherent post-change implementation.
    """
    if unique_timedeltas.dtype == np.dtype("O"):
        # Object dtype holds ``datetime.timedelta`` values (cftime dates);
        # compare against a pure-Python zero timedelta.
        time_units = _NETCDF_TIME_UNITS_CFTIME
        unit_timedelta = _unit_timedelta_cftime
        zero_timedelta = timedelta(microseconds=0)
        timedeltas = unique_timedeltas
    else:
        time_units = _NETCDF_TIME_UNITS_NUMPY
        unit_timedelta = _unit_timedelta_numpy
        zero_timedelta = np.timedelta64(0, "ns")
        # Note that the modulus operator was only implemented for np.timedelta64
        # arrays as of NumPy version 1.16.0. Once our minimum version of NumPy
        # supported is greater than or equal to this we will no longer need to cast
        # unique_timedeltas to a TimedeltaIndex. In the meantime, however, the
        # modulus operator works for TimedeltaIndex objects.
        timedeltas = pd.TimedeltaIndex(unique_timedeltas)
    for time_unit in time_units:
        if np.all(timedeltas % unit_timedelta(time_unit) == zero_timedelta):
            return time_unit
    return "seconds"

Expand Down Expand Up @@ -309,10 +335,6 @@ def infer_datetime_units(dates):
reference_date = dates[0] if len(dates) > 0 else "1970-01-01"
reference_date = format_cftime_datetime(reference_date)
unique_timedeltas = np.unique(np.diff(dates))
if unique_timedeltas.dtype == np.dtype("O"):
# Convert to np.timedelta64 objects using pandas to work around a
# NumPy casting bug: https://github.com/numpy/numpy/issues/11096
unique_timedeltas = to_timedelta_unboxed(unique_timedeltas)
units = _infer_time_units_from_diff(unique_timedeltas)
return f"{units} since {reference_date}"

Expand Down
1 change: 1 addition & 0 deletions xarray/tests/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def LooseVersion(vstring):
has_pseudonetcdf, requires_pseudonetcdf = _importorskip("PseudoNetCDF")
has_cftime, requires_cftime = _importorskip("cftime")
has_cftime_1_1_0, requires_cftime_1_1_0 = _importorskip("cftime", minversion="1.1.0.0")
has_cftime_1_4_1, requires_cftime_1_4_1 = _importorskip("cftime", minversion="1.4.1")
has_dask, requires_dask = _importorskip("dask")
has_bottleneck, requires_bottleneck = _importorskip("bottleneck")
has_nc_time_axis, requires_nc_time_axis = _importorskip("nc_time_axis")
Expand Down
30 changes: 30 additions & 0 deletions xarray/tests/test_cftime_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
BaseCFTimeOffset,
Day,
Hour,
Microsecond,
Millisecond,
Minute,
MonthBegin,
MonthEnd,
Expand Down Expand Up @@ -181,6 +183,14 @@ def test_to_offset_offset_input(offset):
("2min", Minute(n=2)),
("S", Second()),
("2S", Second(n=2)),
("L", Millisecond(n=1)),
("2L", Millisecond(n=2)),
("ms", Millisecond(n=1)),
("2ms", Millisecond(n=2)),
("U", Microsecond(n=1)),
("2U", Microsecond(n=2)),
("us", Microsecond(n=1)),
("2us", Microsecond(n=2)),
],
ids=_id_func,
)
Expand Down Expand Up @@ -299,6 +309,8 @@ def test_to_cftime_datetime_error_type_error():
Hour(),
Minute(),
Second(),
Millisecond(),
Microsecond(),
]
_EQ_TESTS_B = [
BaseCFTimeOffset(n=2),
Expand All @@ -316,6 +328,8 @@ def test_to_cftime_datetime_error_type_error():
Hour(n=2),
Minute(n=2),
Second(n=2),
Millisecond(n=2),
Microsecond(n=2),
]


Expand All @@ -340,6 +354,8 @@ def test_neq(a, b):
Hour(n=2),
Minute(n=2),
Second(n=2),
Millisecond(n=2),
Microsecond(n=2),
]


Expand All @@ -360,6 +376,8 @@ def test_eq(a, b):
(Hour(), Hour(n=3)),
(Minute(), Minute(n=3)),
(Second(), Second(n=3)),
(Millisecond(), Millisecond(n=3)),
(Microsecond(), Microsecond(n=3)),
]


Expand Down Expand Up @@ -387,6 +405,8 @@ def test_rmul(offset, expected):
(Hour(), Hour(n=-1)),
(Minute(), Minute(n=-1)),
(Second(), Second(n=-1)),
(Millisecond(), Millisecond(n=-1)),
(Microsecond(), Microsecond(n=-1)),
],
ids=_id_func,
)
Expand All @@ -399,6 +419,8 @@ def test_neg(offset, expected):
(Hour(n=2), (1, 1, 1, 2)),
(Minute(n=2), (1, 1, 1, 0, 2)),
(Second(n=2), (1, 1, 1, 0, 0, 2)),
(Millisecond(n=2), (1, 1, 1, 0, 0, 0, 2000)),
(Microsecond(n=2), (1, 1, 1, 0, 0, 0, 2)),
]


Expand Down Expand Up @@ -427,6 +449,8 @@ def test_radd_sub_monthly(offset, expected_date_args, calendar):
(Hour(n=2), (1, 1, 2, 22)),
(Minute(n=2), (1, 1, 2, 23, 58)),
(Second(n=2), (1, 1, 2, 23, 59, 58)),
(Millisecond(n=2), (1, 1, 2, 23, 59, 59, 998000)),
(Microsecond(n=2), (1, 1, 2, 23, 59, 59, 999998)),
],
ids=_id_func,
)
Expand Down Expand Up @@ -802,6 +826,8 @@ def test_add_quarter_end_onOffset(
((1, 1, 1), Hour(), True),
((1, 1, 1), Minute(), True),
((1, 1, 1), Second(), True),
((1, 1, 1), Millisecond(), True),
((1, 1, 1), Microsecond(), True),
],
ids=_id_func,
)
Expand Down Expand Up @@ -865,6 +891,8 @@ def test_onOffset_month_or_quarter_or_year_end(
(Hour(), (1, 3, 2, 1, 1), (1, 3, 2, 1, 1)),
(Minute(), (1, 3, 2, 1, 1, 1), (1, 3, 2, 1, 1, 1)),
(Second(), (1, 3, 2, 1, 1, 1, 1), (1, 3, 2, 1, 1, 1, 1)),
(Millisecond(), (1, 3, 2, 1, 1, 1, 1000), (1, 3, 2, 1, 1, 1, 1000)),
(Microsecond(), (1, 3, 2, 1, 1, 1, 1), (1, 3, 2, 1, 1, 1, 1)),
],
ids=_id_func,
)
Expand Down Expand Up @@ -914,6 +942,8 @@ def test_rollforward(calendar, offset, initial_date_args, partial_expected_date_
(Hour(), (1, 3, 2, 1, 1), (1, 3, 2, 1, 1)),
(Minute(), (1, 3, 2, 1, 1, 1), (1, 3, 2, 1, 1, 1)),
(Second(), (1, 3, 2, 1, 1, 1, 1), (1, 3, 2, 1, 1, 1, 1)),
(Millisecond(), (1, 3, 2, 1, 1, 1, 1000), (1, 3, 2, 1, 1, 1, 1000)),
(Microsecond(), (1, 3, 2, 1, 1, 1, 1), (1, 3, 2, 1, 1, 1, 1)),
],
ids=_id_func,
)
Expand Down
45 changes: 40 additions & 5 deletions xarray/tests/test_coding_times.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
import warnings
from datetime import timedelta
from itertools import product

import numpy as np
import pandas as pd
import pytest
from pandas.errors import OutOfBoundsDatetime

from xarray import DataArray, Dataset, Variable, coding, conventions, decode_cf
from xarray import (
DataArray,
Dataset,
Variable,
cftime_range,
coding,
conventions,
decode_cf,
)
from xarray.coding.times import (
_encode_datetime_with_cftime,
cftime_to_nptime,
Expand All @@ -19,7 +28,14 @@
from xarray.core.common import contains_cftime_datetimes
from xarray.testing import assert_equal

from . import arm_xfail, assert_array_equal, has_cftime, requires_cftime, requires_dask
from . import (
arm_xfail,
assert_array_equal,
has_cftime,
has_cftime_1_4_1,
requires_cftime,
requires_dask,
)

_NON_STANDARD_CALENDARS_SET = {
"noleap",
Expand Down Expand Up @@ -973,8 +989,13 @@ def test_decode_ambiguous_time_warns(calendar):

@pytest.mark.parametrize("encoding_units", FREQUENCIES_TO_ENCODING_UNITS.values())
@pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys())
def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, freq):
times = pd.date_range("2000", periods=3, freq=freq)
@pytest.mark.parametrize("date_range", [pd.date_range, cftime_range])
def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, freq, date_range):
if not has_cftime and date_range == cftime_range:
pytest.skip("Test requires cftime.")
if (freq == "N" or encoding_units == "nanoseconds") and date_range == cftime_range:
pytest.skip("Nanosecond frequency is not valid for cftime dates.")
times = date_range("2000", periods=3, freq=freq)
units = f"{encoding_units} since 2000-01-01"
encoded, _, _ = coding.times.encode_cf_datetime(times, units)

Expand All @@ -987,7 +1008,7 @@ def test_encode_cf_datetime_defaults_to_correct_dtype(encoding_units, freq):


@pytest.mark.parametrize("freq", FREQUENCIES_TO_ENCODING_UNITS.keys())
def test_encode_decode_roundtrip(freq):
def test_encode_decode_roundtrip_datetime64(freq):
# See GH 4045. Prior to GH 4684 this test would fail for frequencies of
# "S", "L", "U", and "N".
initial_time = pd.date_range("1678-01-01", periods=1)
Expand All @@ -998,6 +1019,20 @@ def test_encode_decode_roundtrip(freq):
assert_equal(variable, decoded)


@pytest.mark.parametrize("freq", ["U", "L", "S", "T", "H", "D"])
def test_encode_decode_roundtrip_cftime(freq):
    """Encoding then decoding cftime dates must reproduce them exactly."""
    if not has_cftime_1_4_1:
        pytest.skip("Exact roundtripping requires at least cftime 1.4.1.")
    base = cftime_range("0001", periods=1)
    far_offset = timedelta(days=291000 * 365)
    times = base.append(cftime_range("0001", periods=2, freq=freq) + far_offset)
    variable = Variable(["time"], times)
    roundtripped = conventions.decode_cf_variable(
        "time", conventions.encode_cf_variable(variable), use_cftime=True
    )
    assert_equal(variable, roundtripped)


@requires_cftime
def test__encode_datetime_with_cftime():
# See GH 4870. cftime versions > 1.4.0 required us to adapt the
Expand Down