Skip to content

Commit

Permalink
Enhance and move ISO-8601 parser to coding.times (#9899)
Browse files Browse the repository at this point in the history
Co-authored-by: Deepak Cherian <[email protected]>
Co-authored-by: Spencer Clark <[email protected]>
Co-authored-by: Michael Niklas  <[email protected]>
Co-authored-by: Deepak Cherian <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Michael Niklas <[email protected]>
  • Loading branch information
6 people authored Dec 31, 2024
1 parent 33bf5e8 commit b14080e
Show file tree
Hide file tree
Showing 7 changed files with 161 additions and 87 deletions.
3 changes: 3 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ Internal Changes
within ``as_compatible_data``. This is consistent with how lists of these objects
will be converted (:pull:`9900`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
- Move the ISO-8601 parser from ``coding.cftimeindex`` to ``coding.times`` to make it available there (prevents a circular import), and add the capability to parse negative and/or five-digit years (:pull:`9899`).
By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.


.. _whats-new.2024.11.0:

Expand Down
17 changes: 16 additions & 1 deletion properties/test_encode_decode.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,22 @@
"""

import warnings

import pytest

pytest.importorskip("hypothesis")
# isort: split

import hypothesis.extra.numpy as npst
import hypothesis.strategies as st
import numpy as np
from hypothesis import given

import xarray as xr
from xarray.testing.strategies import variables
from xarray.coding.times import _parse_iso8601
from xarray.testing.strategies import CFTimeStrategyISO8601, variables
from xarray.tests import requires_cftime


@pytest.mark.slow
Expand Down Expand Up @@ -43,3 +48,13 @@ def test_CFScaleOffset_coder_roundtrip(original) -> None:
coder = xr.coding.variables.CFScaleOffsetCoder()
roundtripped = coder.decode(coder.encode(original))
xr.testing.assert_identical(original, roundtripped)


@requires_cftime
@given(dt=st.datetimes() | CFTimeStrategyISO8601())
def test_iso8601_decode(dt):
    """Check that ``dt.isoformat()`` parses back to ``dt`` via ``_parse_iso8601``."""
    iso_string = dt.isoformat()
    with warnings.catch_warnings():
        # Ignore warnings whose message mentions "date/calendar/year zero".
        warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*")
        # Only the parsed date is compared; the resolution is discarded.
        roundtripped = _parse_iso8601(type(dt), iso_string)[0]
        assert dt == roundtripped
5 changes: 3 additions & 2 deletions xarray/coding/cftime_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@
import pandas as pd
from packaging.version import Version

from xarray.coding.cftimeindex import CFTimeIndex, _parse_iso8601_with_reso
from xarray.coding.cftimeindex import CFTimeIndex
from xarray.coding.times import (
_is_standard_calendar,
_parse_iso8601,
_should_cftime_be_used,
convert_time_or_go_back,
format_cftime_datetime,
Expand Down Expand Up @@ -843,7 +844,7 @@ def to_cftime_datetime(date_str_or_date, calendar=None):
"If converting a string to a cftime.datetime object, "
"a calendar type must be provided"
)
date, _ = _parse_iso8601_with_reso(get_date_type(calendar), date_str_or_date)
date, _ = _parse_iso8601(get_date_type(calendar), date_str_or_date)
return date
elif isinstance(date_str_or_date, cftime.datetime):
return date_str_or_date
Expand Down
82 changes: 6 additions & 76 deletions xarray/coding/cftimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@
from __future__ import annotations

import math
import re
import warnings
from datetime import timedelta
from typing import TYPE_CHECKING, Any
Expand All @@ -53,6 +52,7 @@

from xarray.coding.times import (
_STANDARD_CALENDARS,
_parse_iso8601,
cftime_to_nptime,
infer_calendar_name,
)
Expand All @@ -78,71 +78,6 @@
OUT_OF_BOUNDS_TIMEDELTA_ERRORS = (OverflowError,)


def named(name, pattern):
    """Return ``pattern`` wrapped in a regex named group called ``name``."""
    return "".join(("(?P<", name, ">", pattern, ")"))


def optional(x):
    """Return ``x`` wrapped in an optional non-capturing regex group."""
    return "".join(("(?:", x, ")?"))


def trailing_optional(xs):
    """Chain regex pieces so that everything after the first one is optional.

    Each piece is followed by an optional group holding the remaining pieces,
    so a later piece can only match when all earlier ones did. An empty
    sequence yields the empty string.
    """
    # Build from the innermost (last) piece outwards.
    nested = ""
    for piece in reversed(xs):
        nested = piece + optional(nested)
    return nested


def build_pattern(
    date_sep=r"\-", datetime_sep=r"T", time_sep=r"\:", micro_sep=r"\."
):
    """Build an anchored regex for ISO-8601-like datetime strings.

    The pattern requires a four-digit year; month, day, hour, minute, second
    and microsecond follow as progressively nested optional groups, each
    captured under its own group name.

    Parameters
    ----------
    date_sep, datetime_sep, time_sep, micro_sep : str
        Regex fragments placed before the month/day, hour, minute/second and
        microsecond fields respectively. They are inserted verbatim, so regex
        metacharacters must be escaped by the caller.

    Returns
    -------
    str
        The regex, anchored with ``^`` and ``$``.
    """
    # NOTE: ``micro_sep`` defaults to an escaped dot. A bare "." is the regex
    # wildcard and would accept any character before the fractional seconds.
    pieces = [
        (None, "year", r"\d{4}"),
        (date_sep, "month", r"\d{2}"),
        (date_sep, "day", r"\d{2}"),
        (datetime_sep, "hour", r"\d{2}"),
        (time_sep, "minute", r"\d{2}"),
        (time_sep, "second", r"\d{2}"),
        (micro_sep, "microsecond", r"\d{1,6}"),
    ]
    pattern_list = [
        (sep if sep else "") + named(name, sub_pattern)
        for sep, name, sub_pattern in pieces
    ]
    # TODO: allow timezone offsets?
    return "^" + trailing_optional(pattern_list) + "$"


_BASIC_PATTERN = build_pattern(date_sep="", time_sep="")
_EXTENDED_PATTERN = build_pattern()
_CFTIME_PATTERN = build_pattern(datetime_sep=" ")
_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN]


def parse_iso8601_like(datetime_string):
    """Match ``datetime_string`` against the known patterns, in order.

    Returns the ``groupdict()`` of the first pattern in ``_PATTERNS`` that
    matches; raises ``ValueError`` if none does.
    """
    attempts = (re.match(candidate, datetime_string) for candidate in _PATTERNS)
    first_hit = next((found for found in attempts if found), None)
    if first_hit is None:
        raise ValueError(
            f"no ISO-8601 or cftime-string-like match for string: {datetime_string}"
        )
    return first_hit.groupdict()


def _parse_iso8601_with_reso(date_type, timestr):
    """Parse ``timestr`` into a ``date_type`` instance plus its resolution.

    Returns a tuple ``(date, resolution)``. ``date`` is ``date_type(1, 1, 1)``
    with every field found in the string replaced, and ``resolution`` is the
    name of the last (finest) field that was present.
    """
    attempt_import("cftime")

    default = date_type(1, 1, 1)
    fields = parse_iso8601_like(timestr)
    overrides = {}

    for attr in ("year", "month", "day", "hour", "minute", "second", "microsecond"):
        text = fields.get(attr)
        if text is None:
            continue
        if attr == "microsecond":
            # Right-pad the fractional digits to six places to get microseconds.
            text = text.ljust(6, "0")
        overrides[attr] = int(text)
        resolution = attr
    return default.replace(**overrides), resolution


def _parsed_string_to_bounds(date_type, resolution, parsed):
"""Generalization of
pandas.tseries.index.DatetimeIndex._parsed_string_to_bounds
Expand Down Expand Up @@ -436,7 +371,7 @@ def _partial_date_slice(self, resolution, parsed):

def _get_string_slice(self, key):
"""Adapted from pandas.tseries.index.DatetimeIndex._get_string_slice"""
parsed, resolution = _parse_iso8601_with_reso(self.date_type, key)
parsed, resolution = _parse_iso8601(self.date_type, key)
try:
loc = self._partial_date_slice(resolution, parsed)
except KeyError as err:
Expand Down Expand Up @@ -483,7 +418,7 @@ def _maybe_cast_slice_bound(self, label, side):
if not isinstance(label, str):
return label

parsed, resolution = _parse_iso8601_with_reso(self.date_type, label)
parsed, resolution = _parse_iso8601(self.date_type, label)
start, end = _parsed_string_to_bounds(self.date_type, resolution, parsed)
if self.is_monotonic_decreasing and len(self) > 1:
return end if side == "left" else start
Expand Down Expand Up @@ -811,11 +746,6 @@ def is_leap_year(self):
return func(self.year, calendar=self.calendar)


def _parse_iso8601_without_reso(date_type, datetime_str):
    """Parse ``datetime_str`` and return only the date, dropping the resolution."""
    parsed = _parse_iso8601_with_reso(date_type, datetime_str)
    return parsed[0]


def _parse_array_of_cftime_strings(strings, date_type):
"""Create a numpy array from an array of strings.
Expand All @@ -833,9 +763,9 @@ def _parse_array_of_cftime_strings(strings, date_type):
-------
np.array
"""
return np.array(
[_parse_iso8601_without_reso(date_type, s) for s in strings.ravel()]
).reshape(strings.shape)
return np.array([_parse_iso8601(date_type, s)[0] for s in strings.ravel()]).reshape(
strings.shape
)


def _contains_datetime_timedeltas(array):
Expand Down
70 changes: 70 additions & 0 deletions xarray/coding/times.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,76 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
return delta_units, ref_date


def named(name: str, pattern: str) -> str:
    """Return ``pattern`` wrapped in a regex named group called ``name``."""
    return "".join(("(?P<", name, ">", pattern, ")"))


def optional(x: str) -> str:
    """Return ``x`` wrapped in an optional non-capturing regex group."""
    return "".join(("(?:", x, ")?"))


def trailing_optional(xs: list[str]) -> str:
    """Chain regex pieces so that everything after the first one is optional.

    Each piece is followed by an optional group holding the remaining pieces,
    so a later piece can only match when all earlier ones did. An empty
    list yields the empty string.
    """
    # Build from the innermost (last) piece outwards.
    nested = ""
    for piece in reversed(xs):
        nested = piece + optional(nested)
    return nested


def build_pattern(
    date_sep: str = r"\-",
    datetime_sep: str = r"T",
    time_sep: str = r"\:",
    micro_sep: str = r"\.",
) -> str:
    """Build an anchored regex for ISO-8601-like datetime strings.

    The pattern requires a year of four or five digits with an optional
    leading sign; month, day, hour, minute, second and microsecond follow as
    progressively nested optional groups, each captured under its own group
    name.

    Parameters
    ----------
    date_sep, datetime_sep, time_sep, micro_sep : str
        Regex fragments placed before the month/day, hour, minute/second and
        microsecond fields respectively. They are inserted verbatim, so regex
        metacharacters must be escaped by the caller.

    Returns
    -------
    str
        The regex, anchored with ``^`` and ``$``.
    """
    # NOTE: ``micro_sep`` defaults to an escaped dot. A bare "." is the regex
    # wildcard and would accept any character before the fractional seconds.
    pieces = [
        (None, "year", r"[+-]?\d{4,5}"),
        (date_sep, "month", r"\d{2}"),
        (date_sep, "day", r"\d{2}"),
        (datetime_sep, "hour", r"\d{2}"),
        (time_sep, "minute", r"\d{2}"),
        (time_sep, "second", r"\d{2}"),
        (micro_sep, "microsecond", r"\d{1,6}"),
    ]
    pattern_list = [
        (sep if sep else "") + named(name, sub_pattern)
        for sep, name, sub_pattern in pieces
    ]
    # TODO: allow timezone offsets?
    return "^" + trailing_optional(pattern_list) + "$"


_BASIC_PATTERN = build_pattern(date_sep="", time_sep="")
_EXTENDED_PATTERN = build_pattern()
_CFTIME_PATTERN = build_pattern(datetime_sep=" ")
_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN]


def parse_iso8601_like(datetime_string: str) -> dict[str, str | None]:
    """Match ``datetime_string`` against the known patterns, in order.

    Returns the ``groupdict()`` of the first pattern in ``_PATTERNS`` that
    matches; raises ``ValueError`` if none does.
    """
    attempts = (re.match(candidate, datetime_string) for candidate in _PATTERNS)
    first_hit = next((found for found in attempts if found), None)
    if first_hit is None:
        raise ValueError(
            f"no ISO-8601 or cftime-string-like match for string: {datetime_string}"
        )
    return first_hit.groupdict()


def _parse_iso8601(date_type, timestr):
    """Parse ``timestr`` into a ``date_type`` instance plus its resolution.

    Returns a tuple ``(date, resolution)``. ``date`` is ``date_type(1, 1, 1)``
    with every field found in the string replaced. ``resolution`` is the name
    of the last (finest) field present, except that a fractional-second part
    of at most three digits is reported as ``"millisecond"``.
    """
    default = date_type(1, 1, 1)
    fields = parse_iso8601_like(timestr)
    overrides = {}

    for attr in ("year", "month", "day", "hour", "minute", "second", "microsecond"):
        text = fields.get(attr)
        if text is None:
            continue
        resolution = attr
        if attr != "microsecond":
            overrides[attr] = int(text)
            continue
        if len(text) <= 3:
            resolution = "millisecond"
        # Right-pad the fractional digits to six places to get microseconds.
        overrides[attr] = int(text.ljust(6, "0"))
    return default.replace(**overrides), resolution


def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
# same as _unpack_netcdf_time_units but finalizes ref_date for
# processing in encode_cf_datetime
Expand Down
35 changes: 35 additions & 0 deletions xarray/testing/strategies.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import datetime
import warnings
from collections.abc import Hashable, Iterable, Mapping, Sequence
from typing import TYPE_CHECKING, Any, Protocol, overload

Expand Down Expand Up @@ -473,3 +475,36 @@ def unique_subset_of(
return (
{k: objs[k] for k in subset_keys} if isinstance(objs, Mapping) else subset_keys
)


class CFTimeStategy(st.SearchStrategy):
    """Strategy that draws a datetime between ``min_value`` and ``max_value``.

    A value is produced by drawing an integer number of microseconds from
    ``0`` up to the span between the two bounds and adding it to
    ``min_value``. The bounds must support subtraction yielding a
    ``datetime.timedelta`` and addition of a ``datetime.timedelta``
    (presumably cftime datetimes -- see the caller).

    The misspelled class name is kept so that existing references keep
    working.
    """

    def __init__(self, min_value, max_value):
        # Run the base-class initializer so that any state it sets up exists
        # on this instance; overriding __init__ without it skips that setup.
        super().__init__()
        self.min_value = min_value
        self.max_value = max_value

    def do_draw(self, data):
        """Draw a microsecond offset from ``data`` and apply it to ``min_value``."""
        unit_microsecond = datetime.timedelta(microseconds=1)
        timespan_microseconds = (self.max_value - self.min_value) // unit_microsecond
        result = data.draw_integer(0, timespan_microseconds)
        with warnings.catch_warnings():
            # Ignore warnings whose message mentions "date/calendar/year zero".
            warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*")
            return self.min_value + datetime.timedelta(microseconds=result)


class CFTimeStrategyISO8601(st.SearchStrategy):
    """Strategy that draws datetimes across all cftime date types.

    A calendar is sampled first; a datetime of the matching date type is then
    drawn between year -99999 and the last microsecond of year 99999, i.e.
    the range of signed five-digit years.
    """

    def __init__(self):
        # Imported here rather than at module level (presumably to avoid
        # pulling test modules in when this module is imported).
        from xarray.tests.test_coding_times import _all_cftime_date_types

        # Run the base-class initializer so that any state it sets up exists
        # on this instance; overriding __init__ without it skips that setup.
        super().__init__()
        self.date_types = _all_cftime_date_types()
        self.calendars = list(self.date_types)

    def do_draw(self, data):
        """Sample a calendar, then draw a datetime of its date type."""
        calendar = data.draw(st.sampled_from(self.calendars))
        date_type = self.date_types[calendar]
        with warnings.catch_warnings():
            # Ignore warnings whose message mentions "date/calendar/year zero".
            warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*")
            # The length of December depends on the calendar, so ask the type.
            daysinmonth = date_type(99999, 12, 1).daysinmonth
            min_value = date_type(-99999, 1, 1)
            max_value = date_type(99999, 12, daysinmonth, 23, 59, 59, 999999)
            strategy = CFTimeStategy(min_value, max_value)
            return strategy.do_draw(data)
36 changes: 28 additions & 8 deletions xarray/tests/test_cftimeindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,11 @@
from xarray.coding.cftimeindex import (
CFTimeIndex,
_parse_array_of_cftime_strings,
_parse_iso8601_with_reso,
_parsed_string_to_bounds,
assert_all_valid_date_type,
)
from xarray.coding.times import (
_parse_iso8601,
parse_iso8601_like,
)
from xarray.tests import (
Expand Down Expand Up @@ -132,16 +134,34 @@ def date_dict(
list(ISO8601_LIKE_STRING_TESTS.values()),
ids=list(ISO8601_LIKE_STRING_TESTS.keys()),
)
def test_parse_iso8601_like(string, expected):
result = parse_iso8601_like(string)
@pytest.mark.parametrize(
"five_digit_year", [False, True], ids=["four-digit-year", "five-digit-year"]
)
@pytest.mark.parametrize("sign", ["", "+", "-"], ids=["None", "plus", "minus"])
def test_parse_iso8601_like(
five_digit_year: bool, sign: str, string: str, expected: dict
) -> None:
pre = "1" if five_digit_year else ""
datestring = sign + pre + string
result = parse_iso8601_like(datestring)
expected = expected.copy()
expected.update(year=sign + pre + expected["year"])
assert result == expected

if result["microsecond"] is None:
# check malformed single digit addendum
# this check is only performed when we have at least "hour" given,
# like "1999010101", where a single added digit should raise;
# for "1999" (year), "199901" (month) and "19990101" (day),
# a string with a single added digit would just be interpreted
# as having a 5-digit year.
if result["microsecond"] is None and result["hour"] is not None:
with pytest.raises(ValueError):
parse_iso8601_like(string + "3")
if result["second"] is None:
parse_iso8601_like(datestring + "3")

# check malformed floating point addendum
if result["second"] is None or result["microsecond"] is not None:
with pytest.raises(ValueError):
parse_iso8601_like(string + ".3")
parse_iso8601_like(datestring + ".3")


_CFTIME_CALENDARS = [
Expand Down Expand Up @@ -348,7 +368,7 @@ def test_cftimeindex_days_in_month_accessor(index):
def test_parse_iso8601_with_reso(date_type, string, date_args, reso):
expected_date = date_type(*date_args)
expected_reso = reso
result_date, result_reso = _parse_iso8601_with_reso(date_type, string)
result_date, result_reso = _parse_iso8601(date_type, string)
assert result_date == expected_date
assert result_reso == expected_reso

Expand Down

0 comments on commit b14080e

Please sign in to comment.