From e90bb56c33edf4f96c872cecb5f3c591576ce2d9 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Sat, 1 Jun 2024 05:10:37 +0300 Subject: [PATCH 1/4] ENH: Warn when to_datetime falls back to dateutil when dayfirst is passed --- pandas/core/tools/datetimes.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index b01cdb335ec46..eaca0479ab931 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -135,10 +135,11 @@ def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str return guessed_format # If there are multiple non-null elements, warn about # how parsing might not be consistent - if tslib.first_non_null(arr[first_non_null + 1 :]) != -1: + if dayfirst or tslib.first_non_null(arr[first_non_null + 1 :]) != -1: warnings.warn( "Could not infer format, so each element will be parsed " - "individually, falling back to `dateutil`. To ensure parsing is " + "individually, falling back to `dateutil` which does not take the " + "dayfirst parameter in consideration. 
To ensure parsing is " "consistent and as-expected, please specify a format.", UserWarning, stacklevel=find_stack_level(), From 82ac17b7db0be7af5f127f9f64dce558265460c7 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Sat, 1 Jun 2024 06:12:27 +0300 Subject: [PATCH 2/4] Assert warnings --- pandas/tests/tools/test_to_datetime.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index b05c30fa50fbe..4f9bd4a5a6a97 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3007,9 +3007,11 @@ def test_parsers_dayfirst_yearfirst( result2 = Timestamp(date_str) assert result2 == expected - result3 = to_datetime( - date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache - ) + warn = UserWarning if dayfirst else None + with tm.assert_produces_warning(warn, match="Could not infer format"): + result3 = to_datetime( + date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache + ) result4 = DatetimeIndex([date_str], dayfirst=dayfirst, yearfirst=yearfirst)[0] From ed523efa1ba540c80b0dd5b9665f5ec2737b18cf Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Thu, 4 Jul 2024 18:52:36 +0300 Subject: [PATCH 3/4] Remove warnings and fix functionality --- pandas/_libs/tslibs/conversion.pyx | 2 +- pandas/core/tools/datetimes.py | 5 ++--- pandas/tests/tools/test_to_datetime.py | 8 +++----- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 3a55f5fa0c003..299ae3d6abe80 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -610,7 +610,7 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, ts, &dts, &out_bestunit, &out_local, &out_tzoffset, False ) - if not string_to_dts_failed: + if not string_to_dts_failed 
and not dayfirst: reso = get_supported_reso(out_bestunit) check_dts_bounds(&dts, reso) obj = _TSObject() diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index eaca0479ab931..b01cdb335ec46 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -135,11 +135,10 @@ def _guess_datetime_format_for_array(arr, dayfirst: bool | None = False) -> str return guessed_format # If there are multiple non-null elements, warn about # how parsing might not be consistent - if dayfirst or tslib.first_non_null(arr[first_non_null + 1 :]) != -1: + if tslib.first_non_null(arr[first_non_null + 1 :]) != -1: warnings.warn( "Could not infer format, so each element will be parsed " - "individually, falling back to `dateutil` which does not take the " - "dayfirst parameter in consideration. To ensure parsing is " + "individually, falling back to `dateutil`. To ensure parsing is " "consistent and as-expected, please specify a format.", UserWarning, stacklevel=find_stack_level(), diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 4f9bd4a5a6a97..b05c30fa50fbe 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -3007,11 +3007,9 @@ def test_parsers_dayfirst_yearfirst( result2 = Timestamp(date_str) assert result2 == expected - warn = UserWarning if dayfirst else None - with tm.assert_produces_warning(warn, match="Could not infer format"): - result3 = to_datetime( - date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache - ) + result3 = to_datetime( + date_str, dayfirst=dayfirst, yearfirst=yearfirst, cache=cache + ) result4 = DatetimeIndex([date_str], dayfirst=dayfirst, yearfirst=yearfirst)[0] From 4c275301f620105cb2a9b944a815f3e85e1f2bd5 Mon Sep 17 00:00:00 2001 From: Abdulaziz Aloqeely <52792999+Aloqeely@users.noreply.github.com> Date: Sun, 7 Jul 2024 11:02:20 +0300 Subject: [PATCH 4/4] Add whatsnew, write test --- 
doc/source/whatsnew/v3.0.0.rst | 1 + pandas/_libs/tslibs/conversion.pyx | 65 ++++++++++++++------------ pandas/_libs/tslibs/parsing.pyx | 49 +++++++++---------- pandas/tests/tools/test_to_datetime.py | 2 + 4 files changed, 63 insertions(+), 54 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index be4b9c218f9f5..371c9a2237c43 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -500,6 +500,7 @@ Datetimelike - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` does not raise on Custom business days frequencies bigger then "1C" (:issue:`58664`) - Bug in :meth:`DatetimeIndex.is_year_start` and :meth:`DatetimeIndex.is_quarter_start` returning ``False`` on double-digit frequencies (:issue:`58523`) - Bug in :meth:`DatetimeIndex.union` when ``unit`` was non-nanosecond (:issue:`59036`) +- Bug in :func:`to_datetime` not respecting ``dayfirst`` if an uncommon date string was passed (:issue:`58859`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) Timedelta diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 299ae3d6abe80..0fadbbbed2c72 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -606,37 +606,42 @@ cdef _TSObject convert_str_to_tsobject(str ts, tzinfo tz, # equiv: datetime.today().replace(tzinfo=tz) return convert_datetime_to_tsobject(dt, tz, nanos=0, reso=NPY_FR_us) else: - string_to_dts_failed = string_to_dts( - ts, &dts, &out_bestunit, &out_local, - &out_tzoffset, False - ) - if not string_to_dts_failed and not dayfirst: - reso = get_supported_reso(out_bestunit) - check_dts_bounds(&dts, reso) - obj = _TSObject() - obj.dts = dts - obj.creso = reso - ival = npy_datetimestruct_to_datetime(reso, &dts) - - if out_local == 1: - obj.tzinfo 
= timezone(timedelta(minutes=out_tzoffset)) - obj.value = tz_localize_to_utc_single( - ival, obj.tzinfo, ambiguous="raise", nonexistent=None, creso=reso - ) - if tz is None: - check_overflows(obj, reso) - return obj - _adjust_tsobject_tz_using_offset(obj, tz) - return obj - else: - if tz is not None: - # shift for _localize_tso - ival = tz_localize_to_utc_single( - ival, tz, ambiguous="raise", nonexistent=None, creso=reso + if not dayfirst: # GH 58859 + string_to_dts_failed = string_to_dts( + ts, &dts, &out_bestunit, &out_local, + &out_tzoffset, False + ) + if not string_to_dts_failed: + reso = get_supported_reso(out_bestunit) + check_dts_bounds(&dts, reso) + obj = _TSObject() + obj.dts = dts + obj.creso = reso + ival = npy_datetimestruct_to_datetime(reso, &dts) + + if out_local == 1: + obj.tzinfo = timezone(timedelta(minutes=out_tzoffset)) + obj.value = tz_localize_to_utc_single( + ival, + obj.tzinfo, + ambiguous="raise", + nonexistent=None, + creso=reso, ) - obj.value = ival - maybe_localize_tso(obj, tz, obj.creso) - return obj + if tz is None: + check_overflows(obj, reso) + return obj + _adjust_tsobject_tz_using_offset(obj, tz) + return obj + else: + if tz is not None: + # shift for _localize_tso + ival = tz_localize_to_utc_single( + ival, tz, ambiguous="raise", nonexistent=None, creso=reso + ) + obj.value = ival + maybe_localize_tso(obj, tz, obj.creso) + return obj dt = parse_datetime_string( ts, diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 35d2433a707a0..308183402198d 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -377,32 +377,33 @@ def parse_datetime_string_with_reso( raise ValueError(f'Given date string "{date_string}" not likely a datetime') # Try iso8601 first, as it handles nanoseconds - string_to_dts_failed = string_to_dts( - date_string, &dts, &out_bestunit, &out_local, - &out_tzoffset, False - ) - if not string_to_dts_failed: - # Match Timestamp and drop picoseconds, 
femtoseconds, attoseconds - # The new resolution will just be nano - # GH#50417 - if out_bestunit in _timestamp_units: - out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns - - if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: - # TODO: avoid circular import - from pandas import Timestamp - parsed = Timestamp(date_string) - else: - if out_local: - tz = timezone(timedelta(minutes=out_tzoffset)) + if not dayfirst: # GH 58859 + string_to_dts_failed = string_to_dts( + date_string, &dts, &out_bestunit, &out_local, + &out_tzoffset, False + ) + if not string_to_dts_failed: + # Match Timestamp and drop picoseconds, femtoseconds, attoseconds + # The new resolution will just be nano + # GH#50417 + if out_bestunit in _timestamp_units: + out_bestunit = NPY_DATETIMEUNIT.NPY_FR_ns + + if out_bestunit == NPY_DATETIMEUNIT.NPY_FR_ns: + # TODO: avoid circular import + from pandas import Timestamp + parsed = Timestamp(date_string) else: - tz = None - parsed = datetime_new( - dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz - ) + if out_local: + tz = timezone(timedelta(minutes=out_tzoffset)) + else: + tz = None + parsed = datetime_new( + dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz + ) - reso = npy_unit_to_attrname[out_bestunit] - return parsed, reso + reso = npy_unit_to_attrname[out_bestunit] + return parsed, reso parsed = _parse_delimited_date(date_string, dayfirst, &out_bestunit) if parsed is not None: diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index c1d6baaf17c92..3a47d87286711 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -2988,6 +2988,8 @@ def test_parsers_nat(self): ("20/12/21", True, False, datetime(2021, 12, 20)), ("20/12/21", False, True, datetime(2020, 12, 21)), ("20/12/21", True, True, datetime(2020, 12, 21)), + # GH 58859 + ("20201012", True, False, datetime(2020, 12, 10)), ], ) def test_parsers_dayfirst_yearfirst(