diff --git a/polars/polars-time/src/chunkedarray/utf8/strptime.rs b/polars/polars-time/src/chunkedarray/utf8/strptime.rs index c2bb7320d2f5..bd601ef97603 100644 --- a/polars/polars-time/src/chunkedarray/utf8/strptime.rs +++ b/polars/polars-time/src/chunkedarray/utf8/strptime.rs @@ -8,8 +8,11 @@ use regex::Regex; use crate::chunkedarray::{polars_bail, PolarsResult}; -static HOUR_PATTERN: Lazy = Lazy::new(|| Regex::new(r"%[HI]").unwrap()); -static MINUTE_PATTERN: Lazy = Lazy::new(|| Regex::new(r"%M").unwrap()); +static HOUR_PATTERN: Lazy = Lazy::new(|| Regex::new(r"%[_-]?[HkIl]").unwrap()); +static MINUTE_PATTERN: Lazy = Lazy::new(|| Regex::new(r"%[_-]?M").unwrap()); +static SECOND_PATTERN: Lazy = Lazy::new(|| Regex::new(r"%[_-]?S").unwrap()); +static TWELVE_HOUR_PATTERN: Lazy = Lazy::new(|| Regex::new(r"%[_-]?[Il]").unwrap()); +static MERIDIEM_PATTERN: Lazy = Lazy::new(|| Regex::new(r"%[_-]?[pP]").unwrap()); #[inline] fn update_and_parse( @@ -52,14 +55,23 @@ fn parse_month_abbrev(val: &[u8], offset: usize) -> Option<(u32, usize)> { /// E.g. chrono supports single letter date identifiers like %F, whereas polars only consumes /// year, day, month distinctively with %Y, %d, %m. pub(super) fn compile_fmt(fmt: &str) -> PolarsResult { - if HOUR_PATTERN.is_match(fmt) & !MINUTE_PATTERN.is_match(fmt) { - // (hopefully) temporary hack. Ideally, chrono would return a ParseKindError indicating - // if `fmt` is too long for NaiveDate. If that's implemented, then this check could - // be removed, and that error could be matched against in `transform_datetime_*s` - // See https://github.com/chronotope/chrono/issues/1075. - polars_bail!(ComputeError: "Invalid format string: found hour, but no minute directive. \ - Please either specify both or neither."); + // (hopefully) temporary hacks. Ideally, chrono would return a ParseKindError indicating + // if `fmt` is too long for NaiveDate. If that's implemented, then this check could + // be removed, and that error could be matched against in `transform_datetime_*s` + // See https://github.com/chronotope/chrono/issues/1075. + if HOUR_PATTERN.is_match(fmt) ^ MINUTE_PATTERN.is_match(fmt) { + polars_bail!(ComputeError: "Invalid format string: \ + Please either specify both hour and minute, or neither."); } + if SECOND_PATTERN.is_match(fmt) && !HOUR_PATTERN.is_match(fmt) { + polars_bail!(ComputeError: "Invalid format string: \ + Found seconds directive, but no hours directive."); + } + if TWELVE_HOUR_PATTERN.is_match(fmt) ^ MERIDIEM_PATTERN.is_match(fmt) { + polars_bail!(ComputeError: "Invalid format string: \ + Please either specify both 12-hour directive and meridiem directive, or neither."); + } + Ok(fmt .replace("%D", "%m/%d/%y") .replace("%R", "%H:%M") diff --git a/py-polars/polars/testing/parametric/strategies.py b/py-polars/polars/testing/parametric/strategies.py index 122d93c0a2d9..098ecfd84148 100644 --- a/py-polars/polars/testing/parametric/strategies.py +++ b/py-polars/polars/testing/parametric/strategies.py @@ -28,6 +28,7 @@ integers, lists, sampled_from, + sets, text, timedeltas, times, @@ -130,6 +131,40 @@ def strategy_decimal(draw: DrawFn) -> PyDecimal: ) +@composite +def strategy_datetime_format(draw: DrawFn) -> str: + """Draw a random datetime format string.""" + fmt = draw( + sets( + sampled_from( + [ + "%m", + "%b", + "%B", + "%d", + "%j", + "%a", + "%A", + "%w", + "%H", + "%I", + "%p", + "%M", + "%S", + "%U", + "%W", + "%%", + ] + ), + ) + ) + + # Make sure year is always present + fmt.add("%Y") + + return " ".join(fmt) + + class StrategyLookup(MutableMapping[PolarsDataType, SearchStrategy[Any]]): """ Mapping from polars DataTypes to hypothesis Strategies. diff --git a/py-polars/tests/parametric/time_series/test_to_datetime.py b/py-polars/tests/parametric/time_series/test_to_datetime.py new file mode 100644 index 000000000000..e9f42df0466c --- /dev/null +++ b/py-polars/tests/parametric/time_series/test_to_datetime.py @@ -0,0 +1,44 @@ +from datetime import datetime + +import hypothesis.strategies as st +from hypothesis import given + +import polars as pl +from polars.exceptions import ComputeError +from polars.testing.parametric.strategies import strategy_datetime_format + + +@given( + datetimes=st.datetimes( + min_value=datetime(2000, 1, 1), max_value=datetime(9999, 12, 31) + ), + fmt=strategy_datetime_format(), +) +def test_to_datetime(datetimes: datetime, fmt: str) -> None: + input = datetimes.strftime(fmt) + expected = datetime.strptime(input, fmt) + try: + result = pl.Series([input]).str.to_datetime(format=fmt).item() + except ComputeError as exc: + # If there's an exception, check that it's either: + # - something which polars can't parse at all: missing day or month + # - something on which polars intentionally raises + assert ( # noqa: PT017 + ( + (("%H" in fmt) ^ ("%M" in fmt)) + or (("%I" in fmt) ^ ("%M" in fmt)) + or ("%S" in fmt and "%H" not in fmt) + or ("%S" in fmt and "%I" not in fmt) + or (("%I" in fmt) ^ ("%p" in fmt)) + or (("%H" in fmt) ^ ("%p" in fmt)) + ) + and "Invalid format string" in str(exc) + ) or ( + ( + not any(day in fmt for day in ("%d", "%j")) + or not any(month in fmt for month in ("%b", "%B", "%m")) + ) + and "strict conversion to datetimes failed" in str(exc) + ) + else: + assert result == expected diff --git a/py-polars/tests/unit/namespaces/test_strptime.py b/py-polars/tests/unit/namespaces/test_strptime.py index 5c720886555e..beafc8e06394 100644 --- a/py-polars/tests/unit/namespaces/test_strptime.py +++ b/py-polars/tests/unit/namespaces/test_strptime.py @@ -528,12 +528,72 @@ def test_strptime_subseconds_datetime(data: str, format: str, expected: time) -> assert result == expected -def test_strptime_hour_without_minute_8849() -> None: +@pytest.mark.parametrize( + ("string", "fmt"), + [ + pytest.param("2023-05-04|7", "%Y-%m-%d|%H", id="hour but no minute"), + pytest.param("2023-05-04|7", "%Y-%m-%d|%k", id="padded hour but no minute"), + pytest.param("2023-05-04|10", "%Y-%m-%d|%M", id="minute but no hour"), + pytest.param("2023-05-04|10", "%Y-%m-%d|%S", id="second but no hour"), + pytest.param( + "2000-Jan-01 01 00 01", "%Y-%b-%d %I %M %S", id="12-hour clock but no AM/PM" + ), + pytest.param( + "2000-Jan-01 01 00 01", + "%Y-%b-%d %l %M %S", + id="padded 12-hour clock but no AM/PM", + ), + ], +) +def test_strptime_incomplete_formats(string: str, fmt: str) -> None: with pytest.raises( ComputeError, - match="Invalid format string: found hour, but no minute directive", + match="Invalid format string", ): - pl.Series(["2023-05-04|7", "2023-05-04|10"]).str.to_datetime("%Y-%m-%d|%H") + pl.Series([string]).str.to_datetime(fmt) + + +@pytest.mark.parametrize( + ("string", "fmt", "expected"), + [ + ("2023-05-04|7:3", "%Y-%m-%d|%H:%M", datetime(2023, 5, 4, 7, 3)), + ("2023-05-04|10:03", "%Y-%m-%d|%H:%M", datetime(2023, 5, 4, 10, 3)), + ( + "2000-Jan-01 01 00 01 am", + "%Y-%b-%d %I %M %S %P", + datetime(2000, 1, 1, 1, 0, 1), + ), + ( + "2000-Jan-01 01 00 01 am", + "%Y-%b-%d %_I %M %S %P", + datetime(2000, 1, 1, 1, 0, 1), + ), + ( + "2000-Jan-01 01 00 01 am", + "%Y-%b-%d %l %M %S %P", + datetime(2000, 1, 1, 1, 0, 1), + ), + ( + "2000-Jan-01 01 00 01 AM", + "%Y-%b-%d %I %M %S %p", + datetime(2000, 1, 1, 1, 0, 1), + ), + ( + "2000-Jan-01 01 00 01 AM", + "%Y-%b-%d %_I %M %S %p", + datetime(2000, 1, 1, 1, 0, 1), + ), + ( + "2000-Jan-01 01 00 01 AM", + "%Y-%b-%d %l %M %S %p", + datetime(2000, 1, 1, 1, 0, 1), + ), + ], +) +def test_strptime_complete_formats(string: str, fmt: str, expected: datetime) -> None: + # Similar to the above, but these formats are complete and should work + result = pl.Series([string]).str.to_datetime(fmt).item() + assert result == expected @pytest.mark.parametrize(