From 5d0f9a8330272a717f1ce2622b2929ba2e6c1422 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 19 Aug 2024 19:07:24 +0200 Subject: [PATCH 01/14] CI: Fix ci for numpy 2 failures (#59545) --- pandas/plotting/_matplotlib/core.py | 2 +- pandas/tests/io/test_parquet.py | 4 ++++ pandas/tests/plotting/frame/test_frame.py | 11 +++++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py index fb7d785a94bc4..9a7e563332a42 100644 --- a/pandas/plotting/_matplotlib/core.py +++ b/pandas/plotting/_matplotlib/core.py @@ -546,7 +546,7 @@ def _maybe_right_yaxis(self, ax: Axes, axes_num: int) -> Axes: new_ax.set_yscale("log") elif self.logy == "sym" or self.loglog == "sym": new_ax.set_yscale("symlog") - return new_ax # type: ignore[return-value] + return new_ax @final @cache_readonly diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 500393b716d9f..ec087eab0cf14 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1178,6 +1178,10 @@ def test_duplicate_columns(self, fp): msg = "Cannot create parquet dataset with duplicate column names" self.check_error_on_write(df, fp, ValueError, msg) + @pytest.mark.xfail( + Version(np.__version__) >= Version("2.0.0"), + reason="fastparquet uses np.float_ in numpy2", + ) def test_bool_with_none(self, fp): df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index b381c4fce8430..b39f953da1ee6 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -45,6 +45,7 @@ _check_visible, get_y_axis, ) +from pandas.util.version import Version from pandas.io.formats.printing import pprint_thing @@ -2465,8 +2466,14 @@ def test_group_subplot_invalid_column_name(self): d = {"a": np.arange(10), "b": np.arange(10)} df = DataFrame(d) - with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"): - df.plot(subplots=[("a", "bad_name")]) + if Version(np.__version__) < Version("2.0.0"): + with pytest.raises(ValueError, match=r"Column label\(s\) \['bad_name'\]"): + df.plot(subplots=[("a", "bad_name")]) + else: + with pytest.raises( + ValueError, match=r"Column label\(s\) \[np\.str\_\('bad_name'\)\]" + ): + df.plot(subplots=[("a", "bad_name")]) def test_group_subplot_duplicated_column(self): d = {"a": np.arange(10), "b": np.arange(10), "c": np.arange(10)} From eac20cc6da4c7ef5c29cb30edeb6ce8b99749d32 Mon Sep 17 00:00:00 2001 From: William Ayd Date: Mon, 19 Aug 2024 15:30:50 -0400 Subject: [PATCH 02/14] Fix get_datetimestruct_days overflow (#56001) * remove pandas/tests/frame UB * updated macro with filename / line num * fix typo * stay with PD_CHECK_OVERFLOW * updates * test fixes --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- .../src/vendored/numpy/datetime/np_datetime.c | 48 +++++++++++-------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c index f854f7b9210d8..cc65f34d6b6fe 100644 --- a/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c +++ b/pandas/_libs/src/vendored/numpy/datetime/np_datetime.c @@ -20,14 +20,12 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION #endif // NPY_NO_DEPRECATED_API -#include - #include "pandas/vendored/numpy/datetime/np_datetime.h" - #define NO_IMPORT_ARRAY #define PY_ARRAY_UNIQUE_SYMBOL PANDAS_DATETIME_NUMPY #include #include +#include #if defined(_WIN32) #ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS @@ -58,12 +56,15 @@ _Static_assert(0, "__has_builtin not detected; please try a newer compiler"); #endif #endif +#define XSTR(a) STR(a) +#define STR(a) #a + #define PD_CHECK_OVERFLOW(FUNC) \ do { \ if ((FUNC) != 0) { \ PyGILState_STATE gstate = PyGILState_Ensure(); \ PyErr_SetString(PyExc_OverflowError, \ - "Overflow occurred in npy_datetimestruct_to_datetime"); \ + "Overflow occurred at " __FILE__ ":" XSTR(__LINE__)); \ PyGILState_Release(gstate); \ return -1; \ } \ @@ -139,8 +140,8 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { npy_int64 year, days = 0; const int *month_lengths; - year = dts->year - 1970; - days = year * 365; + PD_CHECK_OVERFLOW(checked_int64_sub(dts->year, 1970, &year)); + PD_CHECK_OVERFLOW(checked_int64_mul(year, 365, &days)); /* Adjust for leap years */ if (days >= 0) { @@ -148,32 +149,32 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { * 1968 is the closest leap year before 1970. * Exclude the current year, so add 1. */ - year += 1; + PD_CHECK_OVERFLOW(checked_int64_add(year, 1, &year)); /* Add one day for each 4 years */ - days += year / 4; + PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days)); /* 1900 is the closest previous year divisible by 100 */ - year += 68; + PD_CHECK_OVERFLOW(checked_int64_add(year, 68, &year)); /* Subtract one day for each 100 years */ - days -= year / 100; + PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days)); /* 1600 is the closest previous year divisible by 400 */ - year += 300; + PD_CHECK_OVERFLOW(checked_int64_add(year, 300, &year)); /* Add one day for each 400 years */ - days += year / 400; + PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days)); } else { /* * 1972 is the closest later year after 1970. * Include the current year, so subtract 2. */ - year -= 2; + PD_CHECK_OVERFLOW(checked_int64_sub(year, 2, &year)); /* Subtract one day for each 4 years */ - days += year / 4; + PD_CHECK_OVERFLOW(checked_int64_add(days, year / 4, &days)); /* 2000 is the closest later year divisible by 100 */ - year -= 28; + PD_CHECK_OVERFLOW(checked_int64_sub(year, 28, &year)); /* Add one day for each 100 years */ - days -= year / 100; + PD_CHECK_OVERFLOW(checked_int64_sub(days, year / 100, &days)); /* 2000 is also the closest later year divisible by 400 */ /* Subtract one day for each 400 years */ - days += year / 400; + PD_CHECK_OVERFLOW(checked_int64_add(days, year / 400, &days)); } month_lengths = days_per_month_table[is_leapyear(dts->year)]; @@ -181,11 +182,11 @@ npy_int64 get_datetimestruct_days(const npy_datetimestruct *dts) { /* Add the months */ for (i = 0; i < month; ++i) { - days += month_lengths[i]; + PD_CHECK_OVERFLOW(checked_int64_add(days, month_lengths[i], &days)); } /* Add the days */ - days += dts->day - 1; + PD_CHECK_OVERFLOW(checked_int64_add(days, dts->day - 1, &days)); return days; } @@ -430,6 +431,15 @@ npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base, } const int64_t days = get_datetimestruct_days(dts); + if (days == -1) { + PyGILState_STATE gstate = PyGILState_Ensure(); + bool did_error = PyErr_Occurred() == NULL ? false : true; + PyGILState_Release(gstate); + if (did_error) { + return -1; + } + } + if (base == NPY_FR_D) { return days; } From 3097190535ab4eb589b8b3b391fc9ee9437546cb Mon Sep 17 00:00:00 2001 From: Eduard Akhmetshin Date: Mon, 19 Aug 2024 20:32:45 +0100 Subject: [PATCH 03/14] TST: Test non-nanosecond datetimes in PyArrow Parquet dataframes (#59393) * Add test_non_nanosecond_timestamps * Fix lint errors * Remove detailed comment; use temp path helper * Use temp_file * Apply suggestions from code review Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Remove extra empty line * Skip test in minimal version env --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/tests/io/test_parquet.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index ec087eab0cf14..f4d64bf84b3f5 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1137,6 +1137,21 @@ def test_infer_string_large_string_type(self, tmp_path, pa): # assert result["strings"].dtype == "string" # FIXME: don't leave commented-out + def test_non_nanosecond_timestamps(self, temp_file): + # GH#49236 + pa = pytest.importorskip("pyarrow", "11.0.0") + pq = pytest.importorskip("pyarrow.parquet") + + arr = pa.array([datetime.datetime(1600, 1, 1)], type=pa.timestamp("us")) + table = pa.table([arr], names=["timestamp"]) + pq.write_table(table, temp_file) + result = read_parquet(temp_file) + expected = pd.DataFrame( + data={"timestamp": [datetime.datetime(1600, 1, 1)]}, + dtype="datetime64[us]", + ) + tm.assert_frame_equal(result, expected) + class TestParquetFastParquet(Base): @pytest.mark.xfail(reason="datetime_with_nat gets incorrect values") From ca2b8c30756a48a22ed3249dc8914680fa948180 Mon Sep 17 00:00:00 2001 From: Denis Matsiusheuski <60542384+matsidzi@users.noreply.github.com> Date: Mon, 19 Aug 2024 22:59:34 +0300 Subject: [PATCH 04/14] DOC: Fixed bug number in what's new v3.0.0 (#59539) Fixed bug number in what's new v3.0.0 --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index f25edd39cf7da..57b85b933f020 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -543,7 +543,7 @@ Datetimelike - Bug in :attr:`is_year_start` where a DateTimeIndex constructed via a date_range with frequency 'MS' wouldn't have the correct year or quarter start attributes (:issue:`57377`) - Bug in :class:`Timestamp` constructor failing to raise when ``tz=None`` is explicitly specified in conjunction with timezone-aware ``tzinfo`` or data (:issue:`48688`) - Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`) -- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56382`) +- Bug in :func:`date_range` where using a negative frequency value would not include all points between the start and end values (:issue:`56147`) - Bug in :func:`tseries.api.guess_datetime_format` would fail to infer time format when "%Y" == "%H%M" (:issue:`57452`) - Bug in :func:`tseries.frequencies.to_offset` would fail to parse frequency strings starting with "LWOM" (:issue:`59218`) - Bug in :meth:`Dataframe.agg` with df with missing values resulting in IndexError (:issue:`58810`) From 64aa9be186b90af5809f7d3d34a89a8e24162d51 Mon Sep 17 00:00:00 2001 From: Abhinav Reddy Date: Mon, 19 Aug 2024 16:43:33 -0400 Subject: [PATCH 05/14] DOC: Fix Numpy Docstring validation errors in pandas.api.extensions.ExtensionArray (#59540) * Fix SA01 for isna * Fix See Also for nbytes * fix See also for ndim * Fix ravel * fis return value of take * remove type ignore --------- Co-authored-by: Abhinav Thimma Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- ci/code_checks.sh | 5 ----- pandas/core/arrays/base.py | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index dbfe0230d835c..e188a9d1733b9 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -224,11 +224,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.api.extensions.ExtensionArray.insert PR07,RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray.interpolate PR01,SA01" \ -i "pandas.api.extensions.ExtensionArray.isin PR07,RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.isna SA01" \ - -i "pandas.api.extensions.ExtensionArray.nbytes SA01" \ - -i "pandas.api.extensions.ExtensionArray.ndim SA01" \ - -i "pandas.api.extensions.ExtensionArray.ravel RT03,SA01" \ - -i "pandas.api.extensions.ExtensionArray.take RT03" \ -i "pandas.api.extensions.ExtensionArray.tolist RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray.unique RT03,SA01" \ -i "pandas.api.extensions.ExtensionArray.view SA01" \ diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b429b7c1b1fc4..a0c318409d6bb 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -649,6 +649,11 @@ def ndim(self) -> int: """ Extension Arrays are only allowed to be 1-dimensional. + See Also + -------- + ExtensionArray.shape: Return a tuple of the array dimensions. + ExtensionArray.size: The number of elements in the array. + Examples -------- >>> arr = pd.array([1, 2, 3]) @@ -662,6 +667,11 @@ def nbytes(self) -> int: """ The number of bytes needed to store this object in memory. + See Also + -------- + ExtensionArray.shape: Return a tuple of the array dimensions. + ExtensionArray.size: The number of elements in the array. + Examples -------- >>> pd.array([1, 2, 3]).nbytes @@ -767,6 +777,11 @@ def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: an ndarray would be expensive, an ExtensionArray may be returned. + See Also + -------- + ExtensionArray.dropna: Return ExtensionArray without NA values. + ExtensionArray.fillna: Fill NA/NaN values using the specified method. + Notes ----- If returning an ExtensionArray, then @@ -1580,6 +1595,7 @@ def take( Returns ------- ExtensionArray + An array formed with selected `indices`. Raises ------ @@ -1832,6 +1848,11 @@ def ravel(self, order: Literal["C", "F", "A", "K"] | None = "C") -> Self: Returns ------- ExtensionArray + A flattened view on the array. + + See Also + -------- + ExtensionArray.tolist: Return a list of the values. Notes ----- From c4467a9d69cc7e2cf2e7aa241ddca39c518a9c6d Mon Sep 17 00:00:00 2001 From: UDIT BALIYAN <130930448+uditbaliyan@users.noreply.github.com> Date: Tue, 20 Aug 2024 02:15:24 +0530 Subject: [PATCH 06/14] Fix docstring timestamps#59458 (#59533) * Fix docstrings for pandas.Period.year SA01 * Fix docstrings for pandas.Period.year SA01 Validate Docstrings err * Fix docstrings for pandas.Period.year SA01 Validate Docstrings err * Fix docstrings for pandas.timestamps SA01 PR01 * This reverts commit f52c61080a331597197711465742ace1bdfcd1dd. * Revert "Removed dateparseerror example from period.year" This reverts commit cdf36b2faa6c95848d6210c801ee287f0b7084fd. * fixing merge conflict * fix pre-commit.ci err * fix pre-commit.ci err * fix Docstring validation err * fix Docstring validation err * fix Docstring validation err * fix timestamp Docstring validation err * pandas/plotting/_matplotlib/core.py:549: error: Unused "type: ignore" comment [unused-ignore] --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- ci/code_checks.sh | 16 -- pandas/_libs/tslibs/nattype.pyx | 223 ++++++++++++++++++++++-- pandas/_libs/tslibs/timestamps.pyx | 270 ++++++++++++++++++++++++++++- 3 files changed, 470 insertions(+), 39 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index e188a9d1733b9..ac163a3408c25 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -180,36 +180,20 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.TimedeltaIndex.nanoseconds SA01" \ -i "pandas.TimedeltaIndex.seconds SA01" \ -i "pandas.TimedeltaIndex.to_pytimedelta RT03,SA01" \ - -i "pandas.Timestamp.combine PR01,SA01" \ - -i "pandas.Timestamp.ctime SA01" \ - -i "pandas.Timestamp.date SA01" \ -i "pandas.Timestamp.day GL08" \ -i "pandas.Timestamp.fold GL08" \ - -i "pandas.Timestamp.fromordinal SA01" \ - -i "pandas.Timestamp.fromtimestamp PR01,SA01" \ -i "pandas.Timestamp.hour GL08" \ -i "pandas.Timestamp.max PR02" \ -i "pandas.Timestamp.microsecond GL08" \ -i "pandas.Timestamp.min PR02" \ -i "pandas.Timestamp.minute GL08" \ -i "pandas.Timestamp.month GL08" \ - -i "pandas.Timestamp.month_name SA01" \ -i "pandas.Timestamp.nanosecond GL08" \ - -i "pandas.Timestamp.normalize SA01" \ - -i "pandas.Timestamp.quarter SA01" \ - -i "pandas.Timestamp.replace PR07,SA01" \ -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.second GL08" \ -i "pandas.Timestamp.strptime PR01,SA01" \ - -i "pandas.Timestamp.timestamp SA01" \ - -i "pandas.Timestamp.timetuple SA01" \ -i "pandas.Timestamp.timetz SA01" \ -i "pandas.Timestamp.to_datetime64 SA01" \ - -i "pandas.Timestamp.to_julian_date SA01" \ - -i "pandas.Timestamp.to_numpy PR01" \ - -i "pandas.Timestamp.to_period PR01,SA01" \ - -i "pandas.Timestamp.today SA01" \ - -i "pandas.Timestamp.toordinal SA01" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.value GL08" \ -i "pandas.Timestamp.year GL08" \ diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 547f95badab53..f5d671c5c42e9 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -244,16 +244,24 @@ cdef class _NaT(datetime): def to_numpy(self, dtype=None, copy=False) -> np.datetime64 | np.timedelta64: """ - Convert the Timestamp to a NumPy datetime64 or timedelta64. + Convert the Timestamp to a NumPy datetime64. - With the default 'dtype', this is an alias method for `NaT.to_datetime64()`. - - The copy parameter is available here only for compatibility. Its value + This is an alias method for `Timestamp.to_datetime64()`. The dtype and + copy parameters are available here only for compatibility. Their values will not affect the return value. + Parameters + ---------- + dtype : dtype, optional + Data type of the output, ignored in this method as the return type + is always `numpy.datetime64`. + copy : bool, default False + Whether to ensure that the returned value is a new object. This + parameter is also ignored as the method does not support copying. + Returns ------- - numpy.datetime64 or numpy.timedelta64 + numpy.datetime64 See Also -------- @@ -269,9 +277,6 @@ cdef class _NaT(datetime): >>> pd.NaT.to_numpy() numpy.datetime64('NaT') - - >>> pd.NaT.to_numpy("m8[ns]") - numpy.timedelta64('NaT','ns') """ if dtype is not None: # GH#44460 @@ -476,6 +481,11 @@ class NaTType(_NaT): """ Return the month name of the Timestamp with specified locale. + This method returns the full name of the month corresponding to the + `Timestamp`, such as 'January', 'February', etc. The month name can + be returned in a specified locale if provided; otherwise, it defaults + to the English locale. + Parameters ---------- locale : str, default None (English locale) @@ -484,9 +494,18 @@ class NaTType(_NaT): Returns ------- str + The full month name as a string. + + See Also + -------- + Timestamp.day_name : Returns the name of the day of the week. + Timestamp.strftime : Returns a formatted string of the Timestamp. + datetime.datetime.strftime : Returns a string representing the date and time. Examples -------- + Get the month name in English (default): + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') >>> ts.month_name() 'March' @@ -581,10 +600,25 @@ class NaTType(_NaT): date = _make_nat_func( "date", """ - Return date object with same year, month and day. + Returns `datetime.date` with the same year, month, and day. + + This method extracts the date component from the `Timestamp` and returns + it as a `datetime.date` object, discarding the time information. + + Returns + ------- + datetime.date + The date part of the `Timestamp`. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + datetime.datetime.date : Extract the date component from a `datetime` object. Examples -------- + Extract the date from a Timestamp: + >>> ts = pd.Timestamp('2023-01-01 10:00:00.00') >>> ts Timestamp('2023-01-01 10:00:00') @@ -704,6 +738,17 @@ class NaTType(_NaT): """ Return time tuple, compatible with time.localtime(). + This method converts the `Timestamp` into a time tuple, which is compatible + with functions like `time.localtime()`. The time tuple is a named tuple with + attributes such as year, month, day, hour, minute, second, weekday, + day of the year, and daylight savings indicator. + + See Also + -------- + time.localtime : Converts a POSIX timestamp into a time tuple. + Timestamp : The `Timestamp` that represents a specific point in time. + datetime.datetime.timetuple : Equivalent method in the `datetime` module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -733,6 +778,17 @@ class NaTType(_NaT): """ Return proleptic Gregorian ordinal. January 1 of year 1 is day 1. + The proleptic Gregorian ordinal is a continuous count of days since + January 1 of year 1, which is considered day 1. This method converts + the `Timestamp` to its equivalent ordinal number, useful for date arithmetic + and comparison operations. + + See Also + -------- + datetime.datetime.toordinal : Equivalent method in the `datetime` module. + Timestamp : The `Timestamp` that represents a specific point in time. + Timestamp.fromordinal : Create a `Timestamp` from an ordinal. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:50') @@ -745,7 +801,25 @@ class NaTType(_NaT): ctime = _make_error_func( "ctime", """ - Return ctime() style string. + Return a ctime() style string representing the Timestamp. + + This method returns a string representing the date and time + in the format returned by the standard library's `time.ctime()` + function, which is typically in the form 'Day Mon DD HH:MM:SS YYYY'. + + If the `Timestamp` is outside the range supported by Python's + standard library, a `NotImplementedError` is raised. + + Returns + ------- + str + A string representing the Timestamp in ctime format. + + See Also + -------- + time.ctime : Return a string representing time in ctime format. + Timestamp : Represents a single timestamp, similar to `datetime`. + datetime.datetime.ctime : Return a ctime style string from a datetime object. Examples -------- @@ -834,16 +908,43 @@ class NaTType(_NaT): fromtimestamp = _make_error_func( "fromtimestamp", """ - Timestamp.fromtimestamp(ts) + Create a `Timestamp` object from a POSIX timestamp. + + This method converts a POSIX timestamp (the number of seconds since + January 1, 1970, 00:00:00 UTC) into a `Timestamp` object. The resulting + `Timestamp` can be localized to a specific time zone if provided. - Transform timestamp[, tz] to tz's local time from POSIX timestamp. + Parameters + ---------- + ts : float + The POSIX timestamp to convert, representing seconds since + the epoch (1970-01-01 00:00:00 UTC). + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, optional + Time zone for the `Timestamp`. If not provided, the `Timestamp` will + be timezone-naive (i.e., without time zone information). + + Returns + ------- + Timestamp + A `Timestamp` object representing the given POSIX timestamp. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + datetime.datetime.fromtimestamp : Returns a datetime from a POSIX timestamp. Examples -------- + Convert a POSIX timestamp to a `Timestamp`: + >>> pd.Timestamp.fromtimestamp(1584199972) # doctest: +SKIP Timestamp('2020-03-14 15:32:52') - Note that the output may change depending on your local time. + Note that the output may change depending on your local time and time zone: + + >>> pd.Timestamp.fromtimestamp(1584199972, tz='UTC') # doctest: +SKIP + Timestamp('2020-03-14 15:32:52+0000', tz='UTC') """, ) combine = _make_error_func( @@ -851,7 +952,28 @@ class NaTType(_NaT): """ Timestamp.combine(date, time) - Combine date, time into datetime with same date and time fields. + Combine a date and time into a single Timestamp object. + + This method takes a `date` object and a `time` object + and combines them into a single `Timestamp` + that has the same date and time fields. + + Parameters + ---------- + date : datetime.date + The date part of the Timestamp. + time : datetime.time + The time part of the Timestamp. + + Returns + ------- + Timestamp + A new `Timestamp` object representing the combined date and time. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. Examples -------- @@ -891,6 +1013,23 @@ class NaTType(_NaT): """ Return POSIX timestamp as float. + This method converts the `Timestamp` object to a POSIX timestamp, which is + the number of seconds since the Unix epoch (January 1, 1970). The returned + value is a floating-point number, where the integer part represents the + seconds, and the fractional part represents the microseconds. + + Returns + ------- + float + The POSIX timestamp representation of the `Timestamp` object. + + See Also + -------- + Timestamp.fromtimestamp : Construct a `Timestamp` from a POSIX timestamp. + datetime.datetime.timestamp : Equivalent method from the `datetime` module. + Timestamp.to_pydatetime : Convert the `Timestamp` to a `datetime` object. + Timestamp.to_datetime64 : Converts `Timestamp` to `numpy.datetime64`. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548') @@ -962,6 +1101,11 @@ class NaTType(_NaT): """ Construct a timestamp from a a proleptic Gregorian ordinal. + This method creates a `Timestamp` object corresponding to the given + proleptic Gregorian ordinal, which is a count of days from January 1, + 0001 (using the proleptic Gregorian calendar). The time part of the + `Timestamp` is set to midnight (00:00:00) by default. + Parameters ---------- ordinal : int @@ -969,14 +1113,31 @@ class NaTType(_NaT): tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. + Returns + ------- + Timestamp + A `Timestamp` object representing the specified ordinal date. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + Notes ----- By definition there cannot be any tz info on the ordinal itself. Examples -------- + Convert an ordinal to a `Timestamp`: + >>> pd.Timestamp.fromordinal(737425) Timestamp('2020-01-01 00:00:00') + + Create a `Timestamp` from an ordinal with timezone information: + + >>> pd.Timestamp.fromordinal(737425, tz='UTC') + Timestamp('2020-01-01 00:00:00+0000', tz='UTC') """, ) @@ -1068,6 +1229,12 @@ class NaTType(_NaT): tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + datetime.datetime.today : Returns the current local date. + Timestamp.now : Returns current time with optional timezone. + Timestamp : A class representing a specific timestamp. + Examples -------- >>> pd.Timestamp.today() # doctest: +SKIP @@ -1518,22 +1685,48 @@ default 'raise' """ Implements datetime.replace, handles nanoseconds. + This method creates a new `Timestamp` object by replacing the specified + fields with new values. The new `Timestamp` retains the original fields + that are not explicitly replaced. This method handles nanoseconds, and + the `tzinfo` parameter allows for timezone replacement without conversion. + Parameters ---------- year : int, optional + The year to replace. If `None`, the year is not changed. month : int, optional + The month to replace. If `None`, the month is not changed. day : int, optional + The day to replace. If `None`, the day is not changed. hour : int, optional + The hour to replace. If `None`, the hour is not changed. minute : int, optional + The minute to replace. If `None`, the minute is not changed. second : int, optional + The second to replace. If `None`, the second is not changed. microsecond : int, optional + The microsecond to replace. If `None`, the microsecond is not changed. nanosecond : int, optional + The nanosecond to replace. If `None`, the nanosecond is not changed. tzinfo : tz-convertible, optional + The timezone information to replace. If `None`, the timezone is not changed. fold : int, optional + The fold information to replace. If `None`, the fold is not changed. Returns ------- - Timestamp with fields replaced + Timestamp + A new `Timestamp` object with the specified fields replaced. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + + Notes + ----- + The `replace` method does not perform timezone conversions. If you need + to convert the timezone, use the `tz_convert` method instead. Examples -------- diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 47e5af14460a8..494dbecfe86b5 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -815,6 +815,11 @@ cdef class _Timestamp(ABCTimestamp): """ Return the month name of the Timestamp with specified locale. + This method returns the full name of the month corresponding to the + `Timestamp`, such as 'January', 'February', etc. The month name can + be returned in a specified locale if provided; otherwise, it defaults + to the English locale. + Parameters ---------- locale : str, default None (English locale) @@ -823,9 +828,18 @@ cdef class _Timestamp(ABCTimestamp): Returns ------- str + The full month name as a string. + + See Also + -------- + Timestamp.day_name : Returns the name of the day of the week. + Timestamp.strftime : Returns a formatted string of the Timestamp. + datetime.datetime.strftime : Returns a string representing the date and time. Examples -------- + Get the month name in English (default): + >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') >>> ts.month_name() 'March' @@ -912,17 +926,38 @@ cdef class _Timestamp(ABCTimestamp): @property def quarter(self) -> int: """ - Return the quarter of the year. + Return the quarter of the year for the `Timestamp`. + + This property returns an integer representing the quarter of the year in + which the `Timestamp` falls. The quarters are defined as follows: + - Q1: January 1 to March 31 + - Q2: April 1 to June 30 + - Q3: July 1 to September 30 + - Q4: October 1 to December 31 Returns ------- int + The quarter of the year (1 through 4). + + See Also + -------- + Timestamp.month : Returns the month of the `Timestamp`. + Timestamp.year : Returns the year of the `Timestamp`. Examples -------- + Get the quarter for a `Timestamp`: + >>> ts = pd.Timestamp(2020, 3, 14) >>> ts.quarter 1 + + For a `Timestamp` in the fourth quarter: + + >>> ts = pd.Timestamp(2020, 10, 14) + >>> ts.quarter + 4 """ return ((self.month - 1) // 3) + 1 @@ -977,6 +1012,21 @@ cdef class _Timestamp(ABCTimestamp): """ Normalize Timestamp to midnight, preserving tz information. + This method sets the time component of the `Timestamp` to midnight (00:00:00), + while preserving the date and time zone information. It is useful when you + need to standardize the time across different `Timestamp` objects without + altering the time zone or the date. + + Returns + ------- + Timestamp + + See Also + -------- + Timestamp.floor : Rounds `Timestamp` down to the nearest frequency. + Timestamp.ceil : Rounds `Timestamp` up to the nearest frequency. + Timestamp.round : Rounds `Timestamp` to the nearest frequency. + Examples -------- >>> ts = pd.Timestamp(2020, 3, 14, 15, 30) @@ -1212,6 +1262,23 @@ cdef class _Timestamp(ABCTimestamp): """ Return POSIX timestamp as float. + This method converts the `Timestamp` object to a POSIX timestamp, which is + the number of seconds since the Unix epoch (January 1, 1970). The returned + value is a floating-point number, where the integer part represents the + seconds, and the fractional part represents the microseconds. + + Returns + ------- + float + The POSIX timestamp representation of the `Timestamp` object. + + See Also + -------- + Timestamp.fromtimestamp : Construct a `Timestamp` from a POSIX timestamp. + datetime.datetime.timestamp : Equivalent method from the `datetime` module. + Timestamp.to_pydatetime : Convert the `Timestamp` to a `datetime` object. + Timestamp.to_datetime64 : Converts `Timestamp` to `numpy.datetime64`. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548') @@ -1298,6 +1365,15 @@ cdef class _Timestamp(ABCTimestamp): copy parameters are available here only for compatibility. Their values will not affect the return value. + Parameters + ---------- + dtype : dtype, optional + Data type of the output, ignored in this method as the return type + is always `numpy.datetime64`. + copy : bool, default False + Whether to ensure that the returned value is a new object. This + parameter is also ignored as the method does not support copying. + Returns ------- numpy.datetime64 @@ -1327,6 +1403,21 @@ cdef class _Timestamp(ABCTimestamp): """ Return an period of which this timestamp is an observation. + This method converts the given Timestamp to a Period object, + which represents a span of time,such as a year, month, etc., + based on the specified frequency. + + Parameters + ---------- + freq : str, optional + Frequency string for the period (e.g., 'Y', 'M', 'W'). Defaults to `None`. + + See Also + -------- + Timestamp : Represents a specific timestamp. + Period : Represents a span of time. + to_period : Converts an object to a Period. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52.192548651') @@ -1464,6 +1555,11 @@ class Timestamp(_Timestamp): """ Construct a timestamp from a a proleptic Gregorian ordinal. + This method creates a `Timestamp` object corresponding to the given + proleptic Gregorian ordinal, which is a count of days from January 1, + 0001 (using the proleptic Gregorian calendar). The time part of the + `Timestamp` is set to midnight (00:00:00) by default. + Parameters ---------- ordinal : int @@ -1471,14 +1567,31 @@ class Timestamp(_Timestamp): tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile or None Time zone for the Timestamp. + Returns + ------- + Timestamp + A `Timestamp` object representing the specified ordinal date. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + Notes ----- By definition there cannot be any tz info on the ordinal itself. Examples -------- + Convert an ordinal to a `Timestamp`: + >>> pd.Timestamp.fromordinal(737425) Timestamp('2020-01-01 00:00:00') + + Create a `Timestamp` from an ordinal with timezone information: + + >>> pd.Timestamp.fromordinal(737425, tz='UTC') + Timestamp('2020-01-01 00:00:00+0000', tz='UTC') """ return cls(datetime.fromordinal(ordinal), tz=tz) @@ -1529,6 +1642,12 @@ class Timestamp(_Timestamp): tz : str or timezone object, default None Timezone to localize to. + See Also + -------- + datetime.datetime.today : Returns the current local date. + Timestamp.now : Returns current time with optional timezone. + Timestamp : A class representing a specific timestamp. + Examples -------- >>> pd.Timestamp.today() # doctest: +SKIP @@ -1621,16 +1740,43 @@ class Timestamp(_Timestamp): @classmethod def fromtimestamp(cls, ts, tz=None): """ - Timestamp.fromtimestamp(ts) + Create a `Timestamp` object from a POSIX timestamp. - Transform timestamp[, tz] to tz's local time from POSIX timestamp. + This method converts a POSIX timestamp (the number of seconds since + January 1, 1970, 00:00:00 UTC) into a `Timestamp` object. The resulting + `Timestamp` can be localized to a specific time zone if provided. + + Parameters + ---------- + ts : float + The POSIX timestamp to convert, representing seconds since + the epoch (1970-01-01 00:00:00 UTC). + tz : str, zoneinfo.ZoneInfo, pytz.timezone, dateutil.tz.tzfile, optional + Time zone for the `Timestamp`. If not provided, the `Timestamp` will + be timezone-naive (i.e., without time zone information). + + Returns + ------- + Timestamp + A `Timestamp` object representing the given POSIX timestamp. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + datetime.datetime.fromtimestamp : Returns a datetime from a POSIX timestamp. Examples -------- + Convert a POSIX timestamp to a `Timestamp`: + >>> pd.Timestamp.fromtimestamp(1584199972) # doctest: +SKIP Timestamp('2020-03-14 15:32:52') - Note that the output may change depending on your local time. + Note that the output may change depending on your local time and time zone: + + >>> pd.Timestamp.fromtimestamp(1584199972, tz='UTC') # doctest: +SKIP + Timestamp('2020-03-14 15:32:52+0000', tz='UTC') """ tz = maybe_get_tz(tz) return cls(datetime.fromtimestamp(ts, tz)) @@ -1673,7 +1819,25 @@ class Timestamp(_Timestamp): def ctime(self): """ - Return ctime() style string. + Return a ctime() style string representing the Timestamp. + + This method returns a string representing the date and time + in the format returned by the standard library's `time.ctime()` + function, which is typically in the form 'Day Mon DD HH:MM:SS YYYY'. + + If the `Timestamp` is outside the range supported by Python's + standard library, a `NotImplementedError` is raised. + + Returns + ------- + str + A string representing the Timestamp in ctime format. + + See Also + -------- + time.ctime : Return a string representing time in ctime format. + Timestamp : Represents a single timestamp, similar to `datetime`. + datetime.datetime.ctime : Return a ctime style string from a datetime object. Examples -------- @@ -1698,10 +1862,25 @@ class Timestamp(_Timestamp): def date(self): """ - Return date object with same year, month and day. + Returns `datetime.date` with the same year, month, and day. + + This method extracts the date component from the `Timestamp` and returns + it as a `datetime.date` object, discarding the time information. + + Returns + ------- + datetime.date + The date part of the `Timestamp`. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + datetime.datetime.date : Extract the date component from a `datetime` object. Examples -------- + Extract the date from a Timestamp: + >>> ts = pd.Timestamp('2023-01-01 10:00:00.00') >>> ts Timestamp('2023-01-01 10:00:00') @@ -1879,6 +2058,17 @@ class Timestamp(_Timestamp): """ Return time tuple, compatible with time.localtime(). + This method converts the `Timestamp` into a time tuple, which is compatible + with functions like `time.localtime()`. The time tuple is a named tuple with + attributes such as year, month, day, hour, minute, second, weekday, + day of the year, and daylight savings indicator. + + See Also + -------- + time.localtime : Converts a POSIX timestamp into a time tuple. + Timestamp : The `Timestamp` that represents a specific point in time. + datetime.datetime.timetuple : Equivalent method in the `datetime` module. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00') @@ -1917,6 +2107,17 @@ class Timestamp(_Timestamp): """ Return proleptic Gregorian ordinal. January 1 of year 1 is day 1. + The proleptic Gregorian ordinal is a continuous count of days since + January 1 of year 1, which is considered day 1. This method converts + the `Timestamp` to its equivalent ordinal number, useful for date arithmetic + and comparison operations. + + See Also + -------- + datetime.datetime.toordinal : Equivalent method in the `datetime` module. + Timestamp : The `Timestamp` that represents a specific point in time. + Timestamp.fromordinal : Create a `Timestamp` from an ordinal. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:50') @@ -1960,7 +2161,28 @@ class Timestamp(_Timestamp): """ Timestamp.combine(date, time) - Combine date, time into datetime with same date and time fields. + Combine a date and time into a single Timestamp object. + + This method takes a `date` object and a `time` object + and combines them into a single `Timestamp` + that has the same date and time fields. + + Parameters + ---------- + date : datetime.date + The date part of the Timestamp. + time : datetime.time + The time part of the Timestamp. + + Returns + ------- + Timestamp + A new `Timestamp` object representing the combined date and time. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. Examples -------- @@ -2687,22 +2909,48 @@ default 'raise' """ Implements datetime.replace, handles nanoseconds. + This method creates a new `Timestamp` object by replacing the specified + fields with new values. The new `Timestamp` retains the original fields + that are not explicitly replaced. This method handles nanoseconds, and + the `tzinfo` parameter allows for timezone replacement without conversion. + Parameters ---------- year : int, optional + The year to replace. If `None`, the year is not changed. month : int, optional + The month to replace. If `None`, the month is not changed. day : int, optional + The day to replace. If `None`, the day is not changed. hour : int, optional + The hour to replace. If `None`, the hour is not changed. minute : int, optional + The minute to replace. If `None`, the minute is not changed. second : int, optional + The second to replace. If `None`, the second is not changed. microsecond : int, optional + The microsecond to replace. If `None`, the microsecond is not changed. nanosecond : int, optional + The nanosecond to replace. If `None`, the nanosecond is not changed. tzinfo : tz-convertible, optional + The timezone information to replace. If `None`, the timezone is not changed. fold : int, optional + The fold information to replace. If `None`, the fold is not changed. Returns ------- - Timestamp with fields replaced + Timestamp + A new `Timestamp` object with the specified fields replaced. + + See Also + -------- + Timestamp : Represents a single timestamp, similar to `datetime`. + to_datetime : Converts various types of data to datetime. + + Notes + ----- + The `replace` method does not perform timezone conversions. If you need + to convert the timezone, use the `tz_convert` method instead. Examples -------- @@ -2827,6 +3075,12 @@ default 'raise' 0 Julian date is noon January 1, 4713 BC. + See Also + -------- + Timestamp.toordinal : Return proleptic Gregorian ordinal. + Timestamp.timestamp : Return POSIX timestamp as float. + Timestamp : Represents a single timestamp. + Examples -------- >>> ts = pd.Timestamp('2020-03-14T15:32:52') From 7c726e9997bcb3f2802a43047ab5564707deaa74 Mon Sep 17 00:00:00 2001 From: aram-cinnamon <97805700+aram-cinnamon@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:08:18 -0400 Subject: [PATCH 07/14] BUG: `query` on columns with characters like # in its name (#59296) --- .pre-commit-config.yaml | 4 +- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/computation/parsing.py | 133 +++++++++++++++++--- pandas/core/frame.py | 14 +-- pandas/tests/frame/test_query_eval.py | 167 ++++++++++++++++++++++++-- 5 files changed, 277 insertions(+), 42 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b81b9ba070a44..f6717dd503c9b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -23,6 +23,7 @@ repos: hooks: - id: ruff args: [--exit-non-zero-on-fix] + exclude: ^pandas/tests/frame/test_query_eval.py - id: ruff # TODO: remove autofixe-only rules when they are checked by ruff name: ruff-selected-autofixes @@ -31,7 +32,7 @@ repos: exclude: ^pandas/tests args: [--select, "ANN001,ANN2", --fix-only, --exit-non-zero-on-fix] - id: ruff-format - exclude: ^scripts + exclude: ^scripts|^pandas/tests/frame/test_query_eval.py - repo: https://github.com/jendrikseipp/vulture rev: 'v2.11' hooks: @@ -85,6 +86,7 @@ repos: types: [text] # overwrite types: [rst] types_or: [python, rst] - id: rst-inline-touching-normal + exclude: ^pandas/tests/frame/test_query_eval.py types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/sphinx-contrib/sphinx-lint diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 57b85b933f020..bdeb9c48990a2 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -685,6 +685,7 @@ Other - Bug in :meth:`DataFrame.apply` where passing ``engine="numba"`` ignored ``args`` passed to the applied function (:issue:`58712`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which caused an exception when using NumPy attributes via ``@`` notation, e.g., ``df.eval("@np.floor(a)")``. (:issue:`58041`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` which did not allow to use ``tan`` function. (:issue:`55091`) +- Bug in :meth:`DataFrame.query` which raised an exception or produced incorrect results when expressions contained backtick-quoted column names containing the hash character ``#``, backticks, or characters that fall outside the ASCII range (U+0001..U+007F). (:issue:`59285`) (:issue:`49633`) - Bug in :meth:`DataFrame.sort_index` when passing ``axis="columns"`` and ``ignore_index=True`` and ``ascending=False`` not returning a :class:`RangeIndex` columns (:issue:`57293`) - Bug in :meth:`DataFrame.transform` that was returning the wrong order unless the index was monotonically increasing. (:issue:`57069`) - Bug in :meth:`DataFrame.where` where using a non-bool type array in the function would return a ``ValueError`` instead of a ``TypeError`` (:issue:`56330`) diff --git a/pandas/core/computation/parsing.py b/pandas/core/computation/parsing.py index 8fbf8936d31ef..35a6d1c6ad269 100644 --- a/pandas/core/computation/parsing.py +++ b/pandas/core/computation/parsing.py @@ -4,6 +4,7 @@ from __future__ import annotations +from enum import Enum from io import StringIO from keyword import iskeyword import token @@ -32,13 +33,21 @@ def create_valid_python_identifier(name: str) -> str: ------ SyntaxError If the returned name is not a Python valid identifier, raise an exception. - This can happen if there is a hashtag in the name, as the tokenizer will - than terminate and not find the backtick. - But also for characters that fall out of the range of (U+0001..U+007F). """ if name.isidentifier() and not iskeyword(name): return name + # Escape characters that fall outside the ASCII range (U+0001..U+007F). + # GH 49633 + gen = ( + (c, "".join(chr(b) for b in c.encode("ascii", "backslashreplace"))) + for c in name + ) + name = "".join( + c_escaped.replace("\\", "_UNICODE_" if c != c_escaped else "_BACKSLASH_") + for c, c_escaped in gen + ) + # Create a dict with the special characters and their replacement string. # EXACT_TOKEN_TYPES contains these special characters # token.tok_name contains a readable description of the replacement string. @@ -54,11 +63,10 @@ def create_valid_python_identifier(name: str) -> str: "$": "_DOLLARSIGN_", "€": "_EUROSIGN_", "°": "_DEGREESIGN_", - # Including quotes works, but there are exceptions. "'": "_SINGLEQUOTE_", '"': "_DOUBLEQUOTE_", - # Currently not possible. Terminates parser and won't find backtick. - # "#": "_HASH_", + "#": "_HASH_", + "`": "_BACKTICK_", } ) @@ -127,6 +135,9 @@ def clean_column_name(name: Hashable) -> Hashable: which is not caught and propagates to the user level. """ try: + # Escape backticks + name = name.replace("`", "``") if isinstance(name, str) else name + tokenized = tokenize_string(f"`{name}`") tokval = next(tokenized)[1] return create_valid_python_identifier(tokval) @@ -168,6 +179,91 @@ def tokenize_backtick_quoted_string( return BACKTICK_QUOTED_STRING, source[string_start:string_end] +class ParseState(Enum): + DEFAULT = 0 + IN_BACKTICK = 1 + IN_SINGLE_QUOTE = 2 + IN_DOUBLE_QUOTE = 3 + + +def _split_by_backtick(s: str) -> list[tuple[bool, str]]: + """ + Splits a str into substrings along backtick characters (`). + + Disregards backticks inside quotes. + + Parameters + ---------- + s : str + The Python source code string. + + Returns + ------- + substrings: list[tuple[bool, str]] + List of tuples, where each tuple has two elements: + The first is a boolean indicating if the substring is backtick-quoted. + The second is the actual substring. + """ + substrings = [] + substr: list[str] = [] # Will join into a string before adding to `substrings` + i = 0 + parse_state = ParseState.DEFAULT + while i < len(s): + char = s[i] + + match char: + case "`": + # start of a backtick-quoted string + if parse_state == ParseState.DEFAULT: + if substr: + substrings.append((False, "".join(substr))) + + substr = [char] + i += 1 + parse_state = ParseState.IN_BACKTICK + continue + + elif parse_state == ParseState.IN_BACKTICK: + # escaped backtick inside a backtick-quoted string + next_char = s[i + 1] if (i != len(s) - 1) else None + if next_char == "`": + substr.append(char) + substr.append(next_char) + i += 2 + continue + + # end of the backtick-quoted string + else: + substr.append(char) + substrings.append((True, "".join(substr))) + + substr = [] + i += 1 + parse_state = ParseState.DEFAULT + continue + case "'": + # start of a single-quoted string + if parse_state == ParseState.DEFAULT: + parse_state = ParseState.IN_SINGLE_QUOTE + # end of a single-quoted string + elif (parse_state == ParseState.IN_SINGLE_QUOTE) and (s[i - 1] != "\\"): + parse_state = ParseState.DEFAULT + case '"': + # start of a double-quoted string + if parse_state == ParseState.DEFAULT: + parse_state = ParseState.IN_DOUBLE_QUOTE + # end of a double-quoted string + elif (parse_state == ParseState.IN_DOUBLE_QUOTE) and (s[i - 1] != "\\"): + parse_state = ParseState.DEFAULT + substr.append(char) + i += 1 + + if substr: + substrings.append((False, "".join(substr))) + + return substrings + + def tokenize_string(source: str) -> Iterator[tuple[int, str]]: """ Tokenize a Python source code string. @@ -182,18 +278,19 @@ def tokenize_string(source: str) -> Iterator[tuple[int, str]]: tok_generator : Iterator[Tuple[int, str]] An iterator yielding all tokens with only toknum and tokval (Tuple[ing, str]). """ + # GH 59285 + # Escape characters, including backticks + source = "".join( + ( + create_valid_python_identifier(substring[1:-1]) + if is_backtick_quoted + else substring + ) + for is_backtick_quoted, substring in _split_by_backtick(source) + ) + line_reader = StringIO(source).readline token_generator = tokenize.generate_tokens(line_reader) - # Loop over all tokens till a backtick (`) is found. - # Then, take all tokens till the next backtick to form a backtick quoted string - for toknum, tokval, start, _, _ in token_generator: - if tokval == "`": - try: - yield tokenize_backtick_quoted_string( - token_generator, source, string_start=start[1] + 1 - ) - except Exception as err: - raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err - else: - yield toknum, tokval + for toknum, tokval, _, _, _ in token_generator: + yield toknum, tokval diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 1e6608b0d87f3..b84fb33af26e5 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4556,17 +4556,8 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No quoted string are replaced by strings that are allowed as a Python identifier. These characters include all operators in Python, the space character, the question mark, the exclamation mark, the dollar sign, and the euro sign. - For other characters that fall outside the ASCII range (U+0001..U+007F) - and those that are not further specified in PEP 3131, - the query parser will raise an error. - This excludes whitespace different than the space character, - but also the hashtag (as it is used for comments) and the backtick - itself (backtick can also not be escaped). - - In a special case, quotes that make a pair around a backtick can - confuse the parser. - For example, ```it's` > `that's``` will raise an error, - as it forms a quoted string (``'s > `that'``) with a backtick inside. + + A backtick can be escaped by double backticks. See also the `Python documentation about lexical analysis `__ @@ -4620,6 +4611,7 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No raise ValueError(msg) kwargs["level"] = kwargs.pop("level", 0) + 1 kwargs["target"] = None + res = self.eval(expr, **kwargs) try: diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py index aa2fb19fe8528..fa71153d01157 100644 --- a/pandas/tests/frame/test_query_eval.py +++ b/pandas/tests/frame/test_query_eval.py @@ -1,4 +1,5 @@ import operator +from tokenize import TokenError import numpy as np import pytest @@ -1246,6 +1247,8 @@ def df(self): "it's": [6, 3, 1], "that's": [9, 1, 8], "☺": [8, 7, 6], + "xy (z)": [1, 2, 3], # noqa: RUF001 + "xy (z\\uff09": [4, 5, 6], # noqa: RUF001 "foo#bar": [2, 4, 5], 1: [5, 7, 9], } @@ -1341,20 +1344,160 @@ def test_missing_attribute(self, df): with pytest.raises(AttributeError, match=message): df.eval("@pd.thing") - def test_failing_quote(self, df): - msg = r"(Could not convert ).*( to a valid Python identifier.)" - with pytest.raises(SyntaxError, match=msg): - df.query("`it's` > `that's`") + def test_quote(self, df): + res = df.query("`it's` > `that's`") + expect = df[df["it's"] > df["that's"]] + tm.assert_frame_equal(res, expect) - def test_failing_character_outside_range(self, df): - msg = r"(Could not convert ).*( to a valid Python identifier.)" - with pytest.raises(SyntaxError, match=msg): - df.query("`☺` > 4") + def test_character_outside_range_smiley(self, df): + res = df.query("`☺` > 4") + expect = df[df["☺"] > 4] + tm.assert_frame_equal(res, expect) - def test_failing_hashtag(self, df): - msg = "Failed to parse backticks" - with pytest.raises(SyntaxError, match=msg): - df.query("`foo#bar` > 4") + def test_character_outside_range_2_byte_parens(self, df): + # GH 49633 + res = df.query("`xy (z)` == 2") # noqa: RUF001 + expect = df[df["xy (z)"] == 2] # noqa: RUF001 + tm.assert_frame_equal(res, expect) + + def test_character_outside_range_and_actual_backslash(self, df): + # GH 49633 + res = df.query("`xy (z\\uff09` == 2") # noqa: RUF001 + expect = df[df["xy \uff08z\\uff09"] == 2] + tm.assert_frame_equal(res, expect) + + def test_hashtag(self, df): + res = df.query("`foo#bar` > 4") + expect = df[df["foo#bar"] > 4] + tm.assert_frame_equal(res, expect) + + def test_expr_with_column_name_with_hashtag_character(self): + # GH 59285 + df = DataFrame((1, 2, 3), columns=["a#"]) + result = df.query("`a#` < 2") + expected = df[df["a#"] < 2] + tm.assert_frame_equal(result, expected) + + def test_expr_with_comment(self): + # GH 59285 + df = DataFrame((1, 2, 3), columns=["a#"]) + result = df.query("`a#` < 2 # This is a comment") + expected = df[df["a#"] < 2] + tm.assert_frame_equal(result, expected) + + def test_expr_with_column_name_with_backtick_and_hash(self): + # GH 59285 + df = DataFrame((1, 2, 3), columns=["a`#b"]) + result = df.query("`a``#b` < 2") + expected = df[df["a`#b"] < 2] + tm.assert_frame_equal(result, expected) + + def test_expr_with_column_name_with_backtick(self): + # GH 59285 + df = DataFrame({"a`b": (1, 2, 3), "ab": (4, 5, 6)}) + result = df.query("`a``b` < 2") # noqa + # Note: Formatting checks may wrongly consider the above ``inline code``. + expected = df[df["a`b"] < 2] + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + def test_expr_with_string_with_backticks(self): + # GH 59285 + df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) + result = df.query("'```' < `#backticks`") + expected = df["```" < df["#backticks"]] + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + def test_expr_with_string_with_backticked_substring_same_as_column_name(self): + # GH 59285 + df = DataFrame(("`", "`````", "``````````"), columns=["#backticks"]) + result = df.query("'`#backticks`' < `#backticks`") + expected = df["`#backticks`" < df["#backticks"]] + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "col1,col2,expr", + [ + ("it's", "that's", "`it's` < `that's`"), + ('it"s', 'that"s', '`it"s` < `that"s`'), + ("it's", 'that\'s "nice"', "`it's` < `that's \"nice\"`"), + ("it's", "that's #cool", "`it's` < `that's #cool` # This is a comment"), + ], + ) + def test_expr_with_column_names_with_special_characters(self, col1, col2, expr): + # GH 59285 + df = DataFrame( + [ + {col1: 1, col2: 2}, + {col1: 3, col2: 4}, + {col1: -1, col2: -2}, + {col1: -3, col2: -4}, + ] + ) + result = df.query(expr) + expected = df[df[col1] < df[col2]] + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + def test_expr_with_no_backticks(self): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column_name"]) + result = df.query("'value' < column_name") + expected = df["value" < df["column_name"]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_no_quotes_and_backtick_is_unmatched(self): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["column-name"]) + with pytest.raises((SyntaxError, TokenError), match="invalid syntax"): + df.query("5 < `column-name") + + def test_expr_with_no_quotes_and_backtick_is_matched(self): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["column-name"]) + result = df.query("5 < `column-name`") + expected = df[5 < df["column-name"]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_backtick_opened_before_quote_and_backtick_is_unmatched(self): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["It's"]) + with pytest.raises( + (SyntaxError, TokenError), match="unterminated string literal" + ): + df.query("5 < `It's") + + def test_expr_with_backtick_opened_before_quote_and_backtick_is_matched(self): + # GH 59285 + df = DataFrame((1, 5, 10), columns=["It's"]) + result = df.query("5 < `It's`") + expected = df[5 < df["It's"]] + tm.assert_frame_equal(result, expected) + + def test_expr_with_quote_opened_before_backtick_and_quote_is_unmatched(self): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) + with pytest.raises( + (SyntaxError, TokenError), match="unterminated string literal" + ): + df.query("`column-name` < 'It`s that\\'s \"quote\" #hash") + + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_at_end(self): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) + result = df.query("`column-name` < 'It`s that\\'s \"quote\" #hash'") + expected = df[df["column-name"] < 'It`s that\'s "quote" #hash'] + tm.assert_frame_equal(result, expected) + + @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") + def test_expr_with_quote_opened_before_backtick_and_quote_is_matched_in_mid(self): + # GH 59285 + df = DataFrame(("aaa", "vvv", "zzz"), columns=["column-name"]) + result = df.query("'It`s that\\'s \"quote\" #hash' < `column-name`") + expected = df['It`s that\'s "quote" #hash' < df["column-name"]] + tm.assert_frame_equal(result, expected) def test_call_non_named_expression(self, df): """ From 1044cf442109953987c1a47f476dc90d286b9f0f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 20 Aug 2024 10:32:23 -1000 Subject: [PATCH 08/14] CI: Uninstall nomkl & 32 bit Interval tests (#59553) * undo numpy 2 changes? * some interval 32 bit tests working * Revert "undo numpy 2 changes?" This reverts commit 39ce2229a96406edac107fc897e807251d364e2b. * nomkl? * nomkl? * Update .github/actions/build_pandas/action.yml * grep for nomkl * xfail WASM * Reverse condition --- .github/actions/build_pandas/action.yml | 7 +++++++ pandas/tests/indexes/interval/test_interval_tree.py | 7 +++++-- pandas/tests/indexing/interval/test_interval.py | 4 ++-- pandas/tests/indexing/interval/test_interval_new.py | 4 ++-- 4 files changed, 16 insertions(+), 6 deletions(-) diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index 460ae2f8594c0..6eac6fcf84f51 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -22,6 +22,13 @@ runs: fi shell: bash -el {0} + - name: Uninstall nomkl + run: | + if conda list nomkl | grep nomkl 1>/dev/null; then + conda remove nomkl -y + fi + shell: bash -el {0} + - name: Build Pandas run: | if [[ ${{ inputs.editable }} == "true" ]]; then diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py index 49b17f8b3d40e..df9c3b390f660 100644 --- a/pandas/tests/indexes/interval/test_interval_tree.py +++ b/pandas/tests/indexes/interval/test_interval_tree.py @@ -4,7 +4,10 @@ import pytest from pandas._libs.interval import IntervalTree -from pandas.compat import IS64 +from pandas.compat import ( + IS64, + WASM, +) import pandas._testing as tm @@ -186,7 +189,7 @@ def test_construction_overflow(self): expected = (50 + np.iinfo(np.int64).max) / 2 assert result == expected - @pytest.mark.xfail(not IS64, reason="GH 23440") + @pytest.mark.xfail(WASM, reason="GH 23440") @pytest.mark.parametrize( "left, right, expected", [ diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py index b72ef57475305..6bcebefa6c696 100644 --- a/pandas/tests/indexing/interval/test_interval.py +++ b/pandas/tests/indexing/interval/test_interval.py @@ -2,7 +2,7 @@ import pytest from pandas._libs import index as libindex -from pandas.compat import IS64 +from pandas.compat import WASM import pandas as pd from pandas import ( @@ -210,7 +210,7 @@ def test_mi_intervalindex_slicing_with_scalar(self): expected = Series([1, 6, 2, 8, 7], index=expected_index, name="value") tm.assert_series_equal(result, expected) - @pytest.mark.xfail(not IS64, reason="GH 23440") + @pytest.mark.xfail(WASM, reason="GH 23440") @pytest.mark.parametrize("base", [101, 1010]) def test_reindex_behavior_with_interval_index(self, base): # GH 51826 diff --git a/pandas/tests/indexing/interval/test_interval_new.py b/pandas/tests/indexing/interval/test_interval_new.py index 4c1efe9e4f81d..051dc7b98f2aa 100644 --- a/pandas/tests/indexing/interval/test_interval_new.py +++ b/pandas/tests/indexing/interval/test_interval_new.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas.compat import IS64 +from pandas.compat import WASM from pandas import ( Index, @@ -214,7 +214,7 @@ def test_loc_getitem_missing_key_error_message( obj.loc[[4, 5, 6]] -@pytest.mark.xfail(not IS64, reason="GH 23440") +@pytest.mark.xfail(WASM, reason="GH 23440") @pytest.mark.parametrize( "intervals", [ From 7945e563d36bcf4694ccc44698829a6221905839 Mon Sep 17 00:00:00 2001 From: "Mien (Josephine) Nguyen" Date: Tue, 20 Aug 2024 19:46:22 -0400 Subject: [PATCH 09/14] DOC: Fix docstrings for Timestamp: strptime, timetz, to_datetime64, to_julian_date (#59569) * strptime * strptime + timestamp * timetuple * timetz * to_datetime64 * to_julian_date * timetz * code_checks * duplicate see also --- ci/code_checks.sh | 3 -- pandas/_libs/tslibs/nattype.pyx | 47 ++++++++++++++++++++++++++-- pandas/_libs/tslibs/timestamps.pyx | 50 +++++++++++++++++++++++++++--- 3 files changed, 90 insertions(+), 10 deletions(-) diff --git a/ci/code_checks.sh b/ci/code_checks.sh index ac163a3408c25..f1aeda9cbdc9d 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -191,9 +191,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.nanosecond GL08" \ -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.second GL08" \ - -i "pandas.Timestamp.strptime PR01,SA01" \ - -i "pandas.Timestamp.timetz SA01" \ - -i "pandas.Timestamp.to_datetime64 SA01" \ -i "pandas.Timestamp.tzinfo GL08" \ -i "pandas.Timestamp.value GL08" \ -i "pandas.Timestamp.year GL08" \ diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index f5d671c5c42e9..41011ff13737a 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -229,7 +229,17 @@ cdef class _NaT(datetime): def to_datetime64(self) -> np.datetime64: """ - Return a numpy.datetime64 object with same precision. + Return a NumPy datetime64 object with same precision. + + This method returns a numpy.datetime64 object with the same + date and time information and precision as the pd.Timestamp object. + + See Also + -------- + numpy.datetime64 : Class to represent dates and times with high precision. + Timestamp.to_numpy : Alias for this method. + Timestamp.asm8 : Alias for this method. + pd.to_datetime : Convert argument to datetime. Examples -------- @@ -764,6 +774,19 @@ class NaTType(_NaT): """ Return time object with same time and tzinfo. + This method returns a datetime.time object with + the time and tzinfo corresponding to the pd.Timestamp + object, ignoring any information about the day/date. + + See Also + -------- + datetime.datetime.timetz : Return datetime.time object with the + same time attributes as the datetime object. + datetime.time : Class to represent the time of day, independent + of any particular day. + datetime.datetime.tzinfo : Attribute of datetime.datetime objects + representing the timezone of the datetime object. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -860,9 +883,27 @@ class NaTType(_NaT): strptime = _make_error_func( "strptime", """ - Timestamp.strptime(string, format) + Convert string argument to datetime. - Function is not implemented. Use pd.to_datetime(). + This method is not implemented; calling it will raise NotImplementedError. + Use pd.to_datetime() instead. + + Parameters + ---------- + date_string : str + String to convert to a datetime. + format : str, default None + The format string to parse time, e.g. "%d/%m/%Y". + + See Also + -------- + pd.to_datetime : Convert argument to datetime. + datetime.datetime.strptime : Return a datetime corresponding to a string + representing a date and time, parsed according to a separate + format string. + datetime.datetime.strftime : Return a string representing the date and + time, controlled by an explicit format string. + Timestamp.isoformat : Return the time formatted according to ISO 8601. Examples -------- diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 494dbecfe86b5..3268207b667f2 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -1342,7 +1342,17 @@ cdef class _Timestamp(ABCTimestamp): cpdef to_datetime64(self): """ - Return a numpy.datetime64 object with same precision. + Return a NumPy datetime64 object with same precision. + + This method returns a numpy.datetime64 object with the same + date and time information and precision as the pd.Timestamp object. + + See Also + -------- + numpy.datetime64 : Class to represent dates and times with high precision. + Timestamp.to_numpy : Alias for this method. + Timestamp.asm8 : Alias for this method. + pd.to_datetime : Convert argument to datetime. Examples -------- @@ -2093,6 +2103,19 @@ class Timestamp(_Timestamp): """ Return time object with same time and tzinfo. + This method returns a datetime.time object with + the time and tzinfo corresponding to the pd.Timestamp + object, ignoring any information about the day/date. + + See Also + -------- + datetime.datetime.timetz : Return datetime.time object with the + same time attributes as the datetime object. + datetime.time : Class to represent the time of day, independent + of any particular day. + datetime.datetime.tzinfo : Attribute of datetime.datetime objects + representing the timezone of the datetime object. + Examples -------- >>> ts = pd.Timestamp('2023-01-01 10:00:00', tz='Europe/Brussels') @@ -2141,9 +2164,27 @@ class Timestamp(_Timestamp): @classmethod def strptime(cls, date_string, format): """ - Timestamp.strptime(string, format) + Convert string argument to datetime. - Function is not implemented. Use pd.to_datetime(). + This method is not implemented; calling it will raise NotImplementedError. + Use pd.to_datetime() instead. + + Parameters + ---------- + date_string : str + String to convert to a datetime. + format : str, default None + The format string to parse time, e.g. "%d/%m/%Y". + + See Also + -------- + pd.to_datetime : Convert argument to datetime. + datetime.datetime.strptime : Return a datetime corresponding to a string + representing a date and time, parsed according to a separate + format string. + datetime.datetime.strftime : Return a string representing the date and + time, controlled by an explicit format string. + Timestamp.isoformat : Return the time formatted according to ISO 8601. Examples -------- @@ -3073,7 +3114,8 @@ default 'raise' """ Convert TimeStamp to a Julian Date. - 0 Julian date is noon January 1, 4713 BC. + This method returns the number of days as a float since + 0 Julian date, which is noon January 1, 4713 BC. See Also -------- From e1a79b299d4db379d5a36f2376cf6ac00a712aa7 Mon Sep 17 00:00:00 2001 From: messense Date: Thu, 22 Aug 2024 01:34:54 +0800 Subject: [PATCH 10/14] PERF: avoid calling `DataFrame.dtypes` in loop (#59573) --- pandas/core/generic.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0f0078fc3398b..8cccb7001eb7b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -600,9 +600,10 @@ def _get_cleaned_column_resolvers(self) -> dict[Hashable, Series]: if isinstance(self, ABCSeries): return {clean_column_name(self.name): self} + dtypes = self.dtypes return { clean_column_name(k): Series( - v, copy=False, index=self.index, name=k, dtype=self.dtypes[k] + v, copy=False, index=self.index, name=k, dtype=dtypes[k] ).__finalize__(self) for k, v in zip(self.columns, self._iter_column_arrays()) if not isinstance(k, int) From 320d6133f75a5bc042fa7d68437d79fc72d4ec48 Mon Sep 17 00:00:00 2001 From: Florian Bourgey Date: Wed, 21 Aug 2024 15:30:09 -0400 Subject: [PATCH 11/14] Series replace error (#59552) * changed AttributeError to ValueError * added test for replace method from dict_like to dict_like for a pd.Series * added entry in whatsnew.rst * Update doc/source/whatsnew/v3.0.0.rst removed rst entry Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/core/generic.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * Update pandas/core/generic.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> * updated msg test function * breaking line for 88 instead of 89 characters --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/core/generic.py | 6 +++++- pandas/tests/series/methods/test_replace.py | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8cccb7001eb7b..8e2524c21d31e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -7487,9 +7487,13 @@ def replace( if inplace: return None return self.copy(deep=False) - if is_dict_like(to_replace): if is_dict_like(value): # {'A' : NA} -> {'A' : 0} + if isinstance(self, ABCSeries): + raise ValueError( + "to_replace and value cannot be dict-like for " + "Series.replace" + ) # Note: Checking below for `in foo.keys()` instead of # `in foo` is needed for when we have a Series and not dict mapping = { diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py index de0855bf7192e..611fcc114db6c 100644 --- a/pandas/tests/series/methods/test_replace.py +++ b/pandas/tests/series/methods/test_replace.py @@ -496,6 +496,15 @@ def test_replace_only_one_dictlike_arg(self, fixed_now_ts): with pytest.raises(ValueError, match=msg): ser.replace(to_replace, value) + def test_replace_dict_like_with_dict_like(self): + # GH 59452 + s = pd.Series([1, 2, 3, 4, 5]) + to_replace = pd.Series([1]) + value = pd.Series([75]) + msg = "to_replace and value cannot be dict-like for Series.replace" + with pytest.raises(ValueError, match=msg): + s.replace(to_replace, value) + def test_replace_extension_other(self, frame_or_series): # https://github.com/pandas-dev/pandas/issues/34530 obj = frame_or_series(pd.array([1, 2, 3], dtype="Int64")) From 851639d279eda81451a3f8c86c8c2882cf7461e5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 21 Aug 2024 21:31:40 +0200 Subject: [PATCH 12/14] String dtype: still return nullable NA-variant in object inference (`maybe_converts_object`) if requested (#59487) * String dtype: maybe_converts_object give precedence to nullable dtype * update datetimelike input validation * update tests and remove xfails * explicitly test pd.array() behaviour (remove xfail) * fixup allow_2d * undo changes related to datetimelike input validation * fix test for str on current main --------- Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/_libs/lib.pyx | 8 ++--- .../tests/arrays/string_/test_string_arrow.py | 5 ++- pandas/tests/arrays/test_array.py | 34 +++++++++++++++++-- pandas/tests/arrays/test_datetimelike.py | 7 ++-- pandas/tests/base/test_value_counts.py | 4 +-- .../dtypes/cast/test_construct_ndarray.py | 2 +- .../io/parser/usecols/test_usecols_basic.py | 3 -- 7 files changed, 43 insertions(+), 20 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 489d4fa111d40..e1a2a0142c52e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -2699,16 +2699,16 @@ def maybe_convert_objects(ndarray[object] objects, seen.object_ = True elif seen.str_: - if using_string_dtype() and is_string_array(objects, skipna=True): + if convert_to_nullable_dtype and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype(na_value=np.nan) + dtype = StringDtype() return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) - elif convert_to_nullable_dtype and is_string_array(objects, skipna=True): + elif using_string_dtype() and is_string_array(objects, skipna=True): from pandas.core.arrays.string_ import StringDtype - dtype = StringDtype() + dtype = StringDtype(na_value=np.nan) return dtype.construct_array_type()._from_sequence(objects, dtype=dtype) seen.object_ = True diff --git a/pandas/tests/arrays/string_/test_string_arrow.py b/pandas/tests/arrays/string_/test_string_arrow.py index cfba32c62f206..b042cf632288b 100644 --- a/pandas/tests/arrays/string_/test_string_arrow.py +++ b/pandas/tests/arrays/string_/test_string_arrow.py @@ -36,9 +36,8 @@ def test_config(string_storage, using_infer_string): result = pd.array(["a", "b"]) assert result.dtype.storage == string_storage - dtype = StringDtype( - string_storage, na_value=np.nan if using_infer_string else pd.NA - ) + # pd.array(..) by default always returns the NA-variant + dtype = StringDtype(string_storage, na_value=pd.NA) expected = dtype.construct_array_type()._from_sequence(["a", "b"], dtype=dtype) tm.assert_equal(result, expected) diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 76b8928f28b65..4070a2844846f 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -215,6 +215,15 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + "str", + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)) + if using_string_dtype() + else NumpyExtensionArray(np.array(["a", "None"])), + ), ( ["a", None], pd.StringDtype(), @@ -222,14 +231,29 @@ def test_dt64_array(dtype_unit): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + ["a", None], + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", None], dtype=pd.StringDtype(na_value=np.nan)), + ), ( # numpy array with string dtype np.array(["a", "b"], dtype=str), - None, + pd.StringDtype(), pd.StringDtype() .construct_array_type() ._from_sequence(["a", "b"], dtype=pd.StringDtype()), ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype(na_value=np.nan), + pd.StringDtype(na_value=np.nan) + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype(na_value=np.nan)), + ), # Boolean ( [True, None], @@ -287,7 +311,6 @@ def test_array_copy(): assert tm.shares_memory(a, b) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize( "data, expected", [ @@ -387,6 +410,13 @@ def test_array_copy(): .construct_array_type() ._from_sequence(["a", None], dtype=pd.StringDtype()), ), + ( + # numpy array with string dtype + np.array(["a", "b"], dtype=str), + pd.StringDtype() + .construct_array_type() + ._from_sequence(["a", "b"], dtype=pd.StringDtype()), + ), # Boolean ([True, False], BooleanArray._from_sequence([True, False], dtype="boolean")), ([True, None], BooleanArray._from_sequence([True, None], dtype="boolean")), diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py index 59ff4f3122e8f..6dd1ef9d59ab4 100644 --- a/pandas/tests/arrays/test_datetimelike.py +++ b/pandas/tests/arrays/test_datetimelike.py @@ -297,9 +297,7 @@ def test_searchsorted(self): assert result == 10 @pytest.mark.parametrize("box", [None, "index", "series"]) - def test_searchsorted_castable_strings( - self, arr1d, box, string_storage, using_infer_string - ): + def test_searchsorted_castable_strings(self, arr1d, box, string_storage): arr = arr1d if box is None: pass @@ -335,8 +333,7 @@ def test_searchsorted_castable_strings( TypeError, match=re.escape( f"value should be a '{arr1d._scalar_type.__name__}', 'NaT', " - "or array of those. Got " - f"{'str' if using_infer_string else 'string'} array instead." + "or array of those. Got string array instead." ), ): arr.searchsorted([str(arr[1]), "baz"]) diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py index c72abfeb9f3e7..bcb31829a201f 100644 --- a/pandas/tests/base/test_value_counts.py +++ b/pandas/tests/base/test_value_counts.py @@ -114,7 +114,7 @@ def test_value_counts_inferred(index_or_series, using_infer_string): else: exp = np.unique(np.array(s_values, dtype=np.object_)) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 4 @@ -192,7 +192,7 @@ def test_value_counts_bins(index_or_series, using_infer_string): else: exp = np.array(["a", "b", np.nan, "d"], dtype=object) if using_infer_string: - exp = array(exp) + exp = array(exp, dtype="str") tm.assert_equal(s.unique(), exp) assert s.nunique() == 3 diff --git a/pandas/tests/dtypes/cast/test_construct_ndarray.py b/pandas/tests/dtypes/cast/test_construct_ndarray.py index ab468c81124bc..6b9b2dfda6e8b 100644 --- a/pandas/tests/dtypes/cast/test_construct_ndarray.py +++ b/pandas/tests/dtypes/cast/test_construct_ndarray.py @@ -21,7 +21,7 @@ def test_construct_1d_ndarray_preserving_na( ): result = sanitize_array(values, index=None, dtype=dtype) if using_infer_string and expected.dtype == object and dtype is None: - tm.assert_extension_array_equal(result, pd.array(expected)) + tm.assert_extension_array_equal(result, pd.array(expected, dtype="str")) else: tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index d02364a77df90..82b42beb38ae0 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -8,8 +8,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas.errors import ParserError from pandas import ( @@ -531,7 +529,6 @@ def test_usecols_additional_columns_integer_columns(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") def test_usecols_dtype(all_parsers): parser = all_parsers data = """ From 13c4069b05dd1fc3fce878a420ea7ed11126ce5d Mon Sep 17 00:00:00 2001 From: Memoona Shah Date: Wed, 21 Aug 2024 21:33:02 +0200 Subject: [PATCH 13/14] =?UTF-8?q?DOC:=20Clarify=20date=5Fformat=20usage=20?= =?UTF-8?q?in=20read=5Fcsv=20documentation=20specifically=E2=80=A6=20(#595?= =?UTF-8?q?66)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DOC: Clarify date_format usage in read_csv documentation specifically pandas/io/parsers/readers.py issue #59557 Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- pandas/io/parsers/readers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 6e933f94cf0ba..2916e4d98cce4 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -321,7 +321,7 @@ class _read_shared(TypedDict, Generic[HashableT], total=False): Note: A fast-path exists for iso8601-formatted dates. date_format : str or dict of column -> format, optional - Format to use for parsing dates when used in conjunction with ``parse_dates``. + Format to use for parsing dates and/or times when used in conjunction with ``parse_dates``. The strftime to parse time, e.g. :const:`"%d/%m/%Y"`. See `strftime documentation Date: Wed, 21 Aug 2024 14:06:37 -0700 Subject: [PATCH 14/14] BUG: Handle nonexistent end dates when resampling (#59471) --- doc/source/whatsnew/v3.0.0.rst | 1 + pandas/core/resample.py | 2 +- pandas/tests/resample/test_datetime_index.py | 13 +++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index bdeb9c48990a2..e56f3ffc01e85 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -650,6 +650,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.cumsum` where it did not return the correct dtype when the label contained ``None``. (:issue:`58811`) - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories. (:issue:`55326`) - Bug in :meth:`Rolling.apply` where the applied function could be called on fewer than ``min_period`` periods if ``method="table"``. (:issue:`58868`) +- Bug in :meth:`Series.resample` could raise when the the date range ended shortly before a non-existent time. (:issue:`58380`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 8ee71ea2293e6..b621fcf9a6415 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -2466,7 +2466,7 @@ def _get_timestamp_range_edges( ) if isinstance(freq, Day): first = first.tz_localize(index_tz) - last = last.tz_localize(index_tz) + last = last.tz_localize(index_tz, nonexistent="shift_forward") else: first = first.normalize() last = last.normalize() diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py index dc2ddcc70828f..179f2c0e6cfa9 100644 --- a/pandas/tests/resample/test_datetime_index.py +++ b/pandas/tests/resample/test_datetime_index.py @@ -958,6 +958,19 @@ def _create_series(values, timestamps, freq="D"): tm.assert_series_equal(result, expected) +def test_resample_dst_midnight_last_nonexistent(): + # GH 58380 + ts = Series( + 1, + date_range("2024-04-19", "2024-04-20", tz="Africa/Cairo", freq="15min"), + ) + + expected = Series([len(ts)], index=DatetimeIndex([ts.index[0]], freq="7D")) + + result = ts.resample("7D").sum() + tm.assert_series_equal(result, expected) + + def test_resample_daily_anchored(unit): rng = date_range("1/1/2000 0:00:00", periods=10000, freq="min").as_unit(unit) ts = Series(np.random.default_rng(2).standard_normal(len(rng)), index=rng)