Enhance and move ISO-8601 parser to coding.times (#9899)

Co-authored-by: Deepak Cherian <[email protected]>
Co-authored-by: Spencer Clark <[email protected]>
Co-authored-by: Michael Niklas <[email protected]>
Co-authored-by: Deepak Cherian <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Michael Niklas <[email protected]>
pydata · Dec 31, 2024 · b14080e · b14080e
1 parent 33bf5e8
commit b14080e
Show file tree

Hide file tree

Showing 7 changed files with 161 additions and 87 deletions.
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
@@ -75,6 +75,9 @@ Internal Changes
   within ``as_compatible_data``. This is consistent with how lists of these objects
   will be converted (:pull:`9900`).
   By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
+- Move the ISO-8601 parser from ``coding.cftimeindex`` to ``coding.times`` so that it can be used there without a circular import, and add the capability to parse negative and/or five-digit years (:pull:`9899`).
+  By `Kai Mühlbauer <https://github.com/kmuehlbauer>`_.
+
 
 .. _whats-new.2024.11.0:
 

diff --git a/properties/test_encode_decode.py b/properties/test_encode_decode.py
@@ -5,17 +5,22 @@
 
 """
 
+import warnings
+
 import pytest
 
 pytest.importorskip("hypothesis")
 # isort: split
 
 import hypothesis.extra.numpy as npst
+import hypothesis.strategies as st
 import numpy as np
 from hypothesis import given
 
 import xarray as xr
-from xarray.testing.strategies import variables
+from xarray.coding.times import _parse_iso8601
+from xarray.testing.strategies import CFTimeStrategyISO8601, variables
+from xarray.tests import requires_cftime
 
 
 @pytest.mark.slow
@@ -43,3 +48,13 @@ def test_CFScaleOffset_coder_roundtrip(original) -> None:
     coder = xr.coding.variables.CFScaleOffsetCoder()
     roundtripped = coder.decode(coder.encode(original))
     xr.testing.assert_identical(original, roundtripped)
+
+
+@requires_cftime
+@given(dt=st.datetimes() | CFTimeStrategyISO8601())
+def test_iso8601_decode(dt):
+    iso = dt.isoformat()
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*")
+        parsed, _ = _parse_iso8601(type(dt), iso)
+        assert dt == parsed
diff --git a/xarray/coding/cftime_offsets.py b/xarray/coding/cftime_offsets.py
@@ -53,9 +53,10 @@
 import pandas as pd
 from packaging.version import Version
 
-from xarray.coding.cftimeindex import CFTimeIndex, _parse_iso8601_with_reso
+from xarray.coding.cftimeindex import CFTimeIndex
 from xarray.coding.times import (
     _is_standard_calendar,
+    _parse_iso8601,
     _should_cftime_be_used,
     convert_time_or_go_back,
     format_cftime_datetime,
@@ -843,7 +844,7 @@ def to_cftime_datetime(date_str_or_date, calendar=None):
                 "If converting a string to a cftime.datetime object, "
                 "a calendar type must be provided"
             )
-        date, _ = _parse_iso8601_with_reso(get_date_type(calendar), date_str_or_date)
+        date, _ = _parse_iso8601(get_date_type(calendar), date_str_or_date)
         return date
     elif isinstance(date_str_or_date, cftime.datetime):
         return date_str_or_date

diff --git a/xarray/coding/cftimeindex.py b/xarray/coding/cftimeindex.py
@@ -42,7 +42,6 @@
 from __future__ import annotations
 
 import math
-import re
 import warnings
 from datetime import timedelta
 from typing import TYPE_CHECKING, Any
@@ -53,6 +52,7 @@
 
 from xarray.coding.times import (
     _STANDARD_CALENDARS,
+    _parse_iso8601,
     cftime_to_nptime,
     infer_calendar_name,
 )
@@ -78,71 +78,6 @@
     OUT_OF_BOUNDS_TIMEDELTA_ERRORS = (OverflowError,)
 
 
-def named(name, pattern):
-    return "(?P<" + name + ">" + pattern + ")"
-
-
-def optional(x):
-    return "(?:" + x + ")?"
-
-
-def trailing_optional(xs):
-    if not xs:
-        return ""
-    return xs[0] + optional(trailing_optional(xs[1:]))
-
-
-def build_pattern(date_sep=r"\-", datetime_sep=r"T", time_sep=r"\:", micro_sep=r"."):
-    pieces = [
-        (None, "year", r"\d{4}"),
-        (date_sep, "month", r"\d{2}"),
-        (date_sep, "day", r"\d{2}"),
-        (datetime_sep, "hour", r"\d{2}"),
-        (time_sep, "minute", r"\d{2}"),
-        (time_sep, "second", r"\d{2}"),
-        (micro_sep, "microsecond", r"\d{1,6}"),
-    ]
-    pattern_list = []
-    for sep, name, sub_pattern in pieces:
-        pattern_list.append((sep if sep else "") + named(name, sub_pattern))
-        # TODO: allow timezone offsets?
-    return "^" + trailing_optional(pattern_list) + "$"
-
-
-_BASIC_PATTERN = build_pattern(date_sep="", time_sep="")
-_EXTENDED_PATTERN = build_pattern()
-_CFTIME_PATTERN = build_pattern(datetime_sep=" ")
-_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN]
-
-
-def parse_iso8601_like(datetime_string):
-    for pattern in _PATTERNS:
-        match = re.match(pattern, datetime_string)
-        if match:
-            return match.groupdict()
-    raise ValueError(
-        f"no ISO-8601 or cftime-string-like match for string: {datetime_string}"
-    )
-
-
-def _parse_iso8601_with_reso(date_type, timestr):
-    _ = attempt_import("cftime")
-
-    default = date_type(1, 1, 1)
-    result = parse_iso8601_like(timestr)
-    replace = {}
-
-    for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]:
-        value = result.get(attr, None)
-        if value is not None:
-            if attr == "microsecond":
-                # convert match string into valid microsecond value
-                value = 10 ** (6 - len(value)) * int(value)
-            replace[attr] = int(value)
-            resolution = attr
-    return default.replace(**replace), resolution
-
-
 def _parsed_string_to_bounds(date_type, resolution, parsed):
     """Generalization of
     pandas.tseries.index.DatetimeIndex._parsed_string_to_bounds
@@ -436,7 +371,7 @@ def _partial_date_slice(self, resolution, parsed):
 
     def _get_string_slice(self, key):
         """Adapted from pandas.tseries.index.DatetimeIndex._get_string_slice"""
-        parsed, resolution = _parse_iso8601_with_reso(self.date_type, key)
+        parsed, resolution = _parse_iso8601(self.date_type, key)
         try:
             loc = self._partial_date_slice(resolution, parsed)
         except KeyError as err:
@@ -483,7 +418,7 @@ def _maybe_cast_slice_bound(self, label, side):
         if not isinstance(label, str):
             return label
 
-        parsed, resolution = _parse_iso8601_with_reso(self.date_type, label)
+        parsed, resolution = _parse_iso8601(self.date_type, label)
         start, end = _parsed_string_to_bounds(self.date_type, resolution, parsed)
         if self.is_monotonic_decreasing and len(self) > 1:
             return end if side == "left" else start
@@ -811,11 +746,6 @@ def is_leap_year(self):
         return func(self.year, calendar=self.calendar)
 
 
-def _parse_iso8601_without_reso(date_type, datetime_str):
-    date, _ = _parse_iso8601_with_reso(date_type, datetime_str)
-    return date
-
-
 def _parse_array_of_cftime_strings(strings, date_type):
     """Create a numpy array from an array of strings.
 
@@ -833,9 +763,9 @@ def _parse_array_of_cftime_strings(strings, date_type):
     -------
     np.array
     """
-    return np.array(
-        [_parse_iso8601_without_reso(date_type, s) for s in strings.ravel()]
-    ).reshape(strings.shape)
+    return np.array([_parse_iso8601(date_type, s)[0] for s in strings.ravel()]).reshape(
+        strings.shape
+    )
 
 
 def _contains_datetime_timedeltas(array):

diff --git a/xarray/coding/times.py b/xarray/coding/times.py
@@ -189,6 +189,76 @@ def _unpack_netcdf_time_units(units: str) -> tuple[str, str]:
     return delta_units, ref_date
 
 
+def named(name: str, pattern: str) -> str:
+    return "(?P<" + name + ">" + pattern + ")"
+
+
+def optional(x: str) -> str:
+    return "(?:" + x + ")?"
+
+
+def trailing_optional(xs: list[str]) -> str:
+    if not xs:
+        return ""
+    return xs[0] + optional(trailing_optional(xs[1:]))
+
+
+def build_pattern(
+    date_sep: str = r"\-",
+    datetime_sep: str = r"T",
+    time_sep: str = r"\:",
+    micro_sep: str = r".",
+) -> str:
+    pieces = [
+        (None, "year", r"[+-]?\d{4,5}"),
+        (date_sep, "month", r"\d{2}"),
+        (date_sep, "day", r"\d{2}"),
+        (datetime_sep, "hour", r"\d{2}"),
+        (time_sep, "minute", r"\d{2}"),
+        (time_sep, "second", r"\d{2}"),
+        (micro_sep, "microsecond", r"\d{1,6}"),
+    ]
+    pattern_list = []
+    for sep, name, sub_pattern in pieces:
+        pattern_list.append((sep if sep else "") + named(name, sub_pattern))
+        # TODO: allow timezone offsets?
+    return "^" + trailing_optional(pattern_list) + "$"
+
+
+_BASIC_PATTERN = build_pattern(date_sep="", time_sep="")
+_EXTENDED_PATTERN = build_pattern()
+_CFTIME_PATTERN = build_pattern(datetime_sep=" ")
+_PATTERNS = [_BASIC_PATTERN, _EXTENDED_PATTERN, _CFTIME_PATTERN]
+
+
+def parse_iso8601_like(datetime_string: str) -> dict[str, str | None]:
+    for pattern in _PATTERNS:
+        match = re.match(pattern, datetime_string)
+        if match:
+            return match.groupdict()
+    raise ValueError(
+        f"no ISO-8601 or cftime-string-like match for string: {datetime_string}"
+    )
+
+
+def _parse_iso8601(date_type, timestr):
+    default = date_type(1, 1, 1)
+    result = parse_iso8601_like(timestr)
+    replace = {}
+
+    for attr in ["year", "month", "day", "hour", "minute", "second", "microsecond"]:
+        value = result.get(attr, None)
+        if value is not None:
+            resolution = attr
+            if attr == "microsecond":
+                if len(value) <= 3:
+                    resolution = "millisecond"
+                # convert match string into valid microsecond value
+                value = 10 ** (6 - len(value)) * int(value)
+            replace[attr] = int(value)
+    return default.replace(**replace), resolution
+
+
 def _unpack_time_units_and_ref_date(units: str) -> tuple[str, pd.Timestamp]:
     # same as _unpack_netcdf_time_units but finalizes ref_date for
     # processing in encode_cf_datetime

diff --git a/xarray/testing/strategies.py b/xarray/testing/strategies.py
@@ -1,3 +1,5 @@
+import datetime
+import warnings
 from collections.abc import Hashable, Iterable, Mapping, Sequence
 from typing import TYPE_CHECKING, Any, Protocol, overload
 
@@ -473,3 +475,36 @@ def unique_subset_of(
     return (
         {k: objs[k] for k in subset_keys} if isinstance(objs, Mapping) else subset_keys
     )
+
+
+class CFTimeStategy(st.SearchStrategy):
+    def __init__(self, min_value, max_value):
+        self.min_value = min_value
+        self.max_value = max_value
+
+    def do_draw(self, data):
+        unit_microsecond = datetime.timedelta(microseconds=1)
+        timespan_microseconds = (self.max_value - self.min_value) // unit_microsecond
+        result = data.draw_integer(0, timespan_microseconds)
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*")
+            return self.min_value + datetime.timedelta(microseconds=result)
+
+
+class CFTimeStrategyISO8601(st.SearchStrategy):
+    def __init__(self):
+        from xarray.tests.test_coding_times import _all_cftime_date_types
+
+        self.date_types = _all_cftime_date_types()
+        self.calendars = list(self.date_types)
+
+    def do_draw(self, data):
+        calendar = data.draw(st.sampled_from(self.calendars))
+        date_type = self.date_types[calendar]
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message=".*date/calendar/year zero.*")
+            daysinmonth = date_type(99999, 12, 1).daysinmonth
+            min_value = date_type(-99999, 1, 1)
+            max_value = date_type(99999, 12, daysinmonth, 23, 59, 59, 999999)
+            strategy = CFTimeStategy(min_value, max_value)
+            return strategy.do_draw(data)
diff --git a/xarray/tests/test_cftimeindex.py b/xarray/tests/test_cftimeindex.py
@@ -12,9 +12,11 @@
 from xarray.coding.cftimeindex import (
     CFTimeIndex,
     _parse_array_of_cftime_strings,
-    _parse_iso8601_with_reso,
     _parsed_string_to_bounds,
     assert_all_valid_date_type,
+)
+from xarray.coding.times import (
+    _parse_iso8601,
     parse_iso8601_like,
 )
 from xarray.tests import (
@@ -132,16 +134,34 @@ def date_dict(
     list(ISO8601_LIKE_STRING_TESTS.values()),
     ids=list(ISO8601_LIKE_STRING_TESTS.keys()),
 )
-def test_parse_iso8601_like(string, expected):
-    result = parse_iso8601_like(string)
+@pytest.mark.parametrize(
+    "five_digit_year", [False, True], ids=["four-digit-year", "five-digit-year"]
+)
+@pytest.mark.parametrize("sign", ["", "+", "-"], ids=["None", "plus", "minus"])
+def test_parse_iso8601_like(
+    five_digit_year: bool, sign: str, string: str, expected: dict
+) -> None:
+    pre = "1" if five_digit_year else ""
+    datestring = sign + pre + string
+    result = parse_iso8601_like(datestring)
+    expected = expected.copy()
+    expected.update(year=sign + pre + expected["year"])
     assert result == expected
 
-    if result["microsecond"] is None:
+    # check malformed single digit addendum
+    # this check is only performed when at least "hour" is given,
+    # like "1999010101", where a single added digit should raise.
+    # For "1999" (year), "199901" (month) and "19990101" (day),
+    # a string with a single added digit would just be interpreted
+    # as having a 5-digit year.
+    if result["microsecond"] is None and result["hour"] is not None:
         with pytest.raises(ValueError):
-            parse_iso8601_like(string + "3")
-    if result["second"] is None:
+            parse_iso8601_like(datestring + "3")
+
+    # check malformed floating point addendum
+    if result["second"] is None or result["microsecond"] is not None:
         with pytest.raises(ValueError):
-            parse_iso8601_like(string + ".3")
+            parse_iso8601_like(datestring + ".3")
 
 
 _CFTIME_CALENDARS = [
@@ -348,7 +368,7 @@ def test_cftimeindex_days_in_month_accessor(index):
 def test_parse_iso8601_with_reso(date_type, string, date_args, reso):
     expected_date = date_type(*date_args)
     expected_reso = reso
-    result_date, result_reso = _parse_iso8601_with_reso(date_type, string)
+    result_date, result_reso = _parse_iso8601(date_type, string)
     assert result_date == expected_date
     assert result_reso == expected_reso